feat(enrichment): tag-first metadata pipeline (§1D)
Implements the §6.2 enrichment pipeline: embedded tags → Chromaprint
fingerprint → AcoustID lookup. Well-tagged files get correct
artist/album/title offline; the rest are identified via AcoustID
(which also yields a MusicBrainz recording id in one call).
- domain: AudioTags/Fingerprint/RecordingMatch value objects; ports
AudioTagReader, AudioFingerprinter, AcoustIdClient; TrackRepository
.apply_enrichment (gap-fill, never erases) + AlbumRepository.get_or_create
- infrastructure/metadata: MutagenTagReader, FpcalcFingerprinter,
AcoustIdHttpClient (rich meta=recordings+releasegroups, throttled)
- application: MetadataEnrichmentService — tags preferred, AcoustID fills
gaps; resolves artist/album; status enriched/failed; skips manual;
every external step wrapped (graceful degradation)
- workers: enrich_task registered; enqueue_enrich is best-effort and
deferred so the caller's txn commits before the worker reads the row
- wiring: upload enqueues after add; import returns imported_ids and
enqueues post-commit (mid-scan would race the worker); manual
POST /tracks/{id}/metadata/enrich endpoint
- deps: add mutagen (fpcalc/ffmpeg already in the image)
Tests: metadata service orchestration, AcoustID parser, tag helpers.
125 passed; mypy strict + ruff clean.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
"""arq worker settings — the queue runtime. Task functions register here.
|
||||
|
||||
Run with: ``arq app.workers.arq_worker.WorkerSettings``.
|
||||
Tasks (download, enrich, transcode) are appended to ``functions`` in later steps.
|
||||
Tasks (download, transcode) are appended to ``functions`` in later steps.
|
||||
"""
|
||||
|
||||
from typing import Any, ClassVar
|
||||
@@ -10,6 +10,7 @@ from arq.connections import RedisSettings
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import configure_logging, get_logger
|
||||
from app.workers.tasks.enrich_task import enrich_track
|
||||
from app.workers.tasks.import_task import scan_local_folder
|
||||
|
||||
log = get_logger("worker")
|
||||
@@ -26,7 +27,7 @@ async def shutdown(_ctx: dict[str, Any]) -> None:
|
||||
|
||||
|
||||
class WorkerSettings:
|
||||
functions: ClassVar[list[Any]] = [scan_local_folder]
|
||||
functions: ClassVar[list[Any]] = [scan_local_folder, enrich_track]
|
||||
on_startup = startup
|
||||
on_shutdown = shutdown
|
||||
max_jobs = get_settings().max_parallel_downloads
|
||||
|
||||
@@ -4,14 +4,18 @@ A short-lived pool per call keeps things simple (enqueues are rare, admin-driven
|
||||
actions). Redis being down degrades to a clean 503 rather than a crash
|
||||
(graceful degradation)."""
|
||||
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from arq import create_pool
|
||||
from arq.connections import RedisSettings
|
||||
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.domain.errors import DependencyUnavailableError
|
||||
|
||||
log = get_logger("worker.queue")
|
||||
|
||||
|
||||
async def enqueue(function: str, **kwargs: Any) -> str:
|
||||
"""Enqueue ``function`` by name, returning the job id. Raises
|
||||
@@ -28,3 +32,18 @@ async def enqueue(function: str, **kwargs: Any) -> str:
|
||||
if job is None:
|
||||
raise DependencyUnavailableError("Could not enqueue job.")
|
||||
return str(job.job_id)
|
||||
|
||||
|
||||
async def enqueue_enrich(track_id: uuid.UUID) -> None:
    """Schedule metadata enrichment for a stored track without ever failing the caller.

    Enrichment is a follow-up step, not a precondition: the track row already
    exists by the time this is called. If ``enqueue`` raises
    ``DependencyUnavailableError`` the failure is logged as a warning and
    swallowed, so the track simply keeps ``metadata_status=pending`` and can be
    enriched on a later attempt (graceful degradation).

    The job is submitted with ``_defer_by=5`` so that the caller's DB
    transaction has time to commit before the worker looks the track up (the
    upload request only commits after this coroutine has returned).
    NOTE(review): the delay is presumably in seconds and forwarded by
    ``enqueue`` to arq's ``enqueue_job`` — confirm against ``enqueue``.

    Args:
        track_id: Identifier of the track to enrich; sent to the worker as a
            string.
    """
    # Stringify once: the same value is used for the job kwarg and the log field.
    track_ref = str(track_id)
    try:
        await enqueue("enrich_track", track_id=track_ref, _defer_by=5)
    except DependencyUnavailableError:
        # Best-effort by design: an unreachable queue must not break the upload/import.
        log.warning("enrich_enqueue_failed", track_id=track_ref)
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
"""arq task: enrich one track's metadata (plan §6.2, §1D).
|
||||
|
||||
Wires the §6.2 pipeline adapters to :class:`MetadataEnrichmentService` and runs
|
||||
it in the worker's own transactional session. Enqueued (deferred) after upload
|
||||
and after a local-folder import. Idempotent and best-effort — a missing track or
|
||||
a ``manual`` one is a clean no-op.
|
||||
"""
|
||||
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from app.application.metadata_service import MetadataEnrichmentService
|
||||
from app.core.config import get_settings
|
||||
from app.core.logging import get_logger
|
||||
from app.infrastructure.db import session_scope
|
||||
from app.infrastructure.db.repositories import (
|
||||
SqlAlchemyAlbumRepository,
|
||||
SqlAlchemyArtistRepository,
|
||||
SqlAlchemyTrackRepository,
|
||||
)
|
||||
from app.infrastructure.metadata.acoustid import AcoustIdHttpClient
|
||||
from app.infrastructure.metadata.fingerprint import FpcalcFingerprinter
|
||||
from app.infrastructure.metadata.tags import MutagenTagReader
|
||||
from app.infrastructure.storage.provider import get_file_storage
|
||||
|
||||
log = get_logger("worker.enrich")
|
||||
|
||||
|
||||
async def enrich_track(_ctx: dict[str, Any], *, track_id: str) -> dict[str, Any]:
    """arq task entry point: enrich the metadata of a single track.

    Builds the AcoustID client from application settings, opens a session via
    ``session_scope()``, wires the repositories and metadata adapters into a
    :class:`MetadataEnrichmentService`, and awaits ``service.enrich`` for the
    given track.

    Args:
        _ctx: Worker context supplied by arq; unused here.
        track_id: Track identifier as a string; parsed with ``uuid.UUID``.

    Returns:
        A dict with ``track_id`` (stringified), ``status`` and ``mbid`` taken
        from the enrichment result.

    Raises:
        ValueError: If ``track_id`` is not a valid UUID string.
    """
    cfg = get_settings()

    # The AcoustID key may be unset (falsy); only unwrap the secret when present.
    secret = cfg.acoustid_api_key
    if secret:
        key = secret.get_secret_value()
    else:
        key = None

    lookup_client = AcoustIdHttpClient(
        api_key=key,
        user_agent=cfg.musicbrainz_user_agent,
        api_url=cfg.acoustid_api_url,
    )

    async with session_scope() as db:
        # Collaborators are built in the same order the service receives them.
        track_repo = SqlAlchemyTrackRepository(db)
        artist_repo = SqlAlchemyArtistRepository(db)
        album_repo = SqlAlchemyAlbumRepository(db)
        file_storage = get_file_storage()
        tags = MutagenTagReader()
        fpcalc = FpcalcFingerprinter(cfg.fpcalc_path)

        enricher = MetadataEnrichmentService(
            tracks=track_repo,
            artists=artist_repo,
            albums=album_repo,
            storage=file_storage,
            tag_reader=tags,
            fingerprinter=fpcalc,
            acoustid=lookup_client,
        )
        outcome = await enricher.enrich(uuid.UUID(track_id))

    # Job result payload built from the service outcome.
    summary: dict[str, Any] = {
        "track_id": str(outcome.track_id),
        "status": outcome.status,
        "mbid": outcome.matched_mbid,
    }
    return summary
|
||||
@@ -18,6 +18,7 @@ from app.infrastructure.db.repositories import (
|
||||
)
|
||||
from app.infrastructure.sources.registry import build_source_registry
|
||||
from app.infrastructure.storage.provider import get_file_storage
|
||||
from app.workers.queue import enqueue_enrich
|
||||
|
||||
log = get_logger("worker.import")
|
||||
|
||||
@@ -37,6 +38,11 @@ async def scan_local_folder(
|
||||
)
|
||||
summary = await service.scan_and_import(backend, added_by=actor)
|
||||
|
||||
# Enqueue enrichment only after the import transaction has committed above,
|
||||
# so the enrich worker is guaranteed to see the new rows.
|
||||
for track_id in summary.imported_ids:
|
||||
await enqueue_enrich(track_id)
|
||||
|
||||
return {
|
||||
"source": summary.source,
|
||||
"seen": summary.seen,
|
||||
|
||||
Reference in New Issue
Block a user