feat(enrichment): tag-first metadata pipeline (§1D)

Implements the §6.2 enrichment pipeline: embedded tags → Chromaprint fingerprint → AcoustID lookup. Well-tagged files get correct artist/album/title offline; the rest are identified via AcoustID (which also yields a MusicBrainz recording id in one call).

- domain: AudioTags/Fingerprint/RecordingMatch value objects; ports AudioTagReader, AudioFingerprinter, AcoustIdClient; TrackRepository.apply_enrichment (gap-fill, never erases) + AlbumRepository.get_or_create
- infrastructure/metadata: MutagenTagReader, FpcalcFingerprinter, AcoustIdHttpClient (rich meta=recordings+releasegroups, throttled)
- application: MetadataEnrichmentService — tags preferred, AcoustID fills gaps; resolves artist/album; status enriched/failed; skips manual; every external step wrapped (graceful degradation)
- workers: enrich_task registered; enqueue_enrich is best-effort and deferred so the caller's txn commits before the worker reads the row
- wiring: upload enqueues after add; import returns imported_ids and enqueues post-commit (mid-scan would race the worker); manual POST /tracks/{id}/metadata/enrich endpoint
- deps: add mutagen (fpcalc/ffmpeg already in the image)

Tests: metadata service orchestration, AcoustID parser, tag helpers. 125 passed; mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 13:04:02 +03:00
parent 48e3418c7f
commit c72d19599a
24 changed files with 1934 additions and 763 deletions
@@ -1,7 +1,7 @@
 """arq worker settings — the queue runtime. Task functions register here.

 Run with: ``arq app.workers.arq_worker.WorkerSettings``.
-Tasks (download, enrich, transcode) are appended to ``functions`` in later steps.
+Tasks (download, transcode) are appended to ``functions`` in later steps.
 """

 from typing import Any, ClassVar
@@ -10,6 +10,7 @@ from arq.connections import RedisSettings

 from app.core.config import get_settings
 from app.core.logging import configure_logging, get_logger
+from app.workers.tasks.enrich_task import enrich_track
 from app.workers.tasks.import_task import scan_local_folder

 log = get_logger("worker")
@@ -26,7 +27,7 @@ async def shutdown(_ctx: dict[str, Any]) -> None:


 class WorkerSettings:
-    functions: ClassVar[list[Any]] = [scan_local_folder]
+    functions: ClassVar[list[Any]] = [scan_local_folder, enrich_track]
    on_startup = startup
    on_shutdown = shutdown
    max_jobs = get_settings().max_parallel_downloads
@@ -4,14 +4,18 @@ A short-lived pool per call keeps things simple (enqueues are rare, admin-driven
 actions). Redis being down degrades to a clean 503 rather than a crash
 (graceful degradation)."""

+import uuid
 from typing import Any

 from arq import create_pool
 from arq.connections import RedisSettings

 from app.core.config import get_settings
+from app.core.logging import get_logger
 from app.domain.errors import DependencyUnavailableError

+log = get_logger("worker.queue")
+

 async def enqueue(function: str, **kwargs: Any) -> str:
    """Enqueue ``function`` by name, returning the job id. Raises
@@ -28,3 +32,18 @@ async def enqueue(function: str, **kwargs: Any) -> str:
    if job is None:
        raise DependencyUnavailableError("Could not enqueue job.")
    return str(job.job_id)
+
+
+async def enqueue_enrich(track_id: uuid.UUID) -> None:
+    """Best-effort enqueue of metadata enrichment for a freshly stored track.
+
+    The track is already persisted, so enrichment is a follow-up, not a barrier:
+    if the queue is unreachable we log and move on (graceful degradation). The
+    track stays ``metadata_status=pending`` and can be re-enriched later.
+
+    Deferred a few seconds so the caller's DB transaction is committed before the
+    worker looks the track up (the upload request commits only after it returns).
+
+    ``DependencyUnavailableError`` from ``enqueue`` is logged as
+    ``enrich_enqueue_failed`` and swallowed; any other exception propagates."""
+    try:
+        # The id is sent in string form. ``_defer_by=5`` requests a 5-second
+        # delay (presumably forwarded by ``enqueue`` to arq's enqueue call —
+        # confirm against the ``enqueue`` body).
+        await enqueue("enrich_track", track_id=str(track_id), _defer_by=5)
+    except DependencyUnavailableError:
+        # Deliberate best-effort: a failed enqueue must not fail the caller.
+        log.warning("enrich_enqueue_failed", track_id=str(track_id))
@@ -0,0 +1,56 @@
+"""arq task: enrich one track's metadata (plan §6.2, §1D).
+
+Wires the §6.2 pipeline adapters to :class:`MetadataEnrichmentService` and runs
+it in the worker's own transactional session. Enqueued (deferred) after upload
+and after a local-folder import. Idempotent and best-effort — a missing track or
+a ``manual`` one is a clean no-op.
+"""
+
+import uuid
+from typing import Any
+
+from app.application.metadata_service import MetadataEnrichmentService
+from app.core.config import get_settings
+from app.core.logging import get_logger
+from app.infrastructure.db import session_scope
+from app.infrastructure.db.repositories import (
+    SqlAlchemyAlbumRepository,
+    SqlAlchemyArtistRepository,
+    SqlAlchemyTrackRepository,
+)
+from app.infrastructure.metadata.acoustid import AcoustIdHttpClient
+from app.infrastructure.metadata.fingerprint import FpcalcFingerprinter
+from app.infrastructure.metadata.tags import MutagenTagReader
+from app.infrastructure.storage.provider import get_file_storage
+
+log = get_logger("worker.enrich")
+
+
+async def enrich_track(_ctx: dict[str, Any], *, track_id: str) -> dict[str, Any]:
+    """Run metadata enrichment for one track and return a small summary.
+
+    Args:
+        _ctx: arq job context; not used by this task.
+        track_id: Track UUID in string form. It is parsed with ``uuid.UUID``,
+            so a malformed value raises ``ValueError``.
+
+    Returns:
+        A dict with ``track_id`` (as a string), ``status`` and ``mbid``, all
+        taken from the result of ``MetadataEnrichmentService.enrich``.
+    """
+    settings = get_settings()
+    # The AcoustID key is optional: unwrap the secret when it is configured,
+    # otherwise hand ``None`` to the client.
+    api_key = (
+        settings.acoustid_api_key.get_secret_value() if settings.acoustid_api_key else None
+    )
+    acoustid = AcoustIdHttpClient(
+        api_key=api_key,
+        user_agent=settings.musicbrainz_user_agent,
+        api_url=settings.acoustid_api_url,
+    )
+
+    # All three repositories share this one session.
+    # NOTE(review): presumably ``session_scope`` commits on clean exit — confirm.
+    async with session_scope() as session:
+        service = MetadataEnrichmentService(
+            tracks=SqlAlchemyTrackRepository(session),
+            artists=SqlAlchemyArtistRepository(session),
+            albums=SqlAlchemyAlbumRepository(session),
+            storage=get_file_storage(),
+            tag_reader=MutagenTagReader(),
+            fingerprinter=FpcalcFingerprinter(settings.fpcalc_path),
+            acoustid=acoustid,
+        )
+        result = await service.enrich(uuid.UUID(track_id))
+
+    # The summary is built after the session block has exited.
+    return {
+        "track_id": str(result.track_id),
+        "status": result.status,
+        "mbid": result.matched_mbid,
+    }
@@ -18,6 +18,7 @@ from app.infrastructure.db.repositories import (
 )
 from app.infrastructure.sources.registry import build_source_registry
 from app.infrastructure.storage.provider import get_file_storage
+from app.workers.queue import enqueue_enrich

 log = get_logger("worker.import")

@@ -37,6 +38,11 @@ async def scan_local_folder(
        )
        summary = await service.scan_and_import(backend, added_by=actor)

+    # Enqueue enrichment only after the import transaction has committed above,
+    # so the enrich worker is guaranteed to see the new rows.
+    for track_id in summary.imported_ids:
+        await enqueue_enrich(track_id)
+
    return {
        "source": summary.source,
        "seen": summary.seen,