feat(enrichment): tag-first metadata pipeline (§1D)

Implements the §6.2 enrichment pipeline: embedded tags → Chromaprint fingerprint → AcoustID lookup. Well-tagged files get correct artist/album/title offline; the rest are identified via AcoustID (which also yields a MusicBrainz recording id in one call).

- domain: AudioTags/Fingerprint/RecordingMatch value objects; ports AudioTagReader, AudioFingerprinter, AcoustIdClient; TrackRepository.apply_enrichment (gap-fill, never erases) + AlbumRepository.get_or_create
- infrastructure/metadata: MutagenTagReader, FpcalcFingerprinter, AcoustIdHttpClient (rich meta=recordings+releasegroups, throttled)
- application: MetadataEnrichmentService — tags preferred, AcoustID fills gaps; resolves artist/album; status enriched/failed; skips manual; every external step wrapped (graceful degradation)
- workers: enrich_task registered; enqueue_enrich is best-effort and deferred so the caller's txn commits before the worker reads the row
- wiring: upload enqueues after add; import returns imported_ids and enqueues post-commit (mid-scan would race the worker); manual POST /tracks/{id}/metadata/enrich endpoint
- deps: add mutagen (fpcalc/ffmpeg already in the image)

Tests: metadata service orchestration, AcoustID parser, tag helpers. 125 passed; mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 13:04:02 +03:00
parent 48e3418c7f
commit c72d19599a
24 changed files with 1934 additions and 763 deletions
@@ -0,0 +1,129 @@
+"""AcoustIdHttpClient — identifies a recording from its Chromaprint fingerprint.
+
+One ``/v2/lookup`` call with ``meta=recordings+releasegroups`` returns the
+AcoustID id, the MusicBrainz recording id, and canonical title/artist/album —
+metadata that itself originates from MusicBrainz, so a separate MB call is not
+needed for Phase 1 (plan §6.2 steps 2-3 collapsed into one request).
+
+Graceful degradation: no API key → ``is_available()`` is False and the whole
+fingerprint path is skipped; any network/parse error → ``lookup`` returns
+``None``. A small inter-call delay keeps us within AcoustID's rate limit.
+"""
+
+import asyncio
+import math
+import time
+
+import httpx
+
+from app.core.logging import get_logger
+from app.domain.entities.metadata import Fingerprint, RecordingMatch
+
+log = get_logger(__name__)  # module-level logger, named after this module
+
+_DEFAULT_URL = "https://api.acoustid.org/v2/lookup"  # default for ``api_url``
+_TIMEOUT_SECONDS = 10.0  # timeout handed to ``httpx.AsyncClient`` per lookup
+_MIN_INTERVAL_SECONDS = 0.34  # AcoustID allows ~3 req/s; stay polite (enforced by ``_throttle``)
+
+
+class AcoustIdHttpClient:
+    """Implements :class:`app.domain.ports.AcoustIdClient`.
+
+    Looks a Chromaprint fingerprint up against the AcoustID web service and
+    returns the best match, or ``None`` whenever the lookup cannot be
+    completed: no API key, an HTTP/transport failure, a body that is not
+    valid JSON, or a JSON body whose fields cannot be interpreted.
+
+    Request pacing state (the lock and the last-call timestamp) lives on the
+    class, so every instance shares one throttle.
+    """
+
+    _throttle_lock = asyncio.Lock()
+    _last_call_monotonic = 0.0
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None,
+        user_agent: str,
+        api_url: str = _DEFAULT_URL,
+    ) -> None:
+        """Store the connection settings; no network I/O happens here.
+
+        Args:
+            api_key: AcoustID application key, sent as the ``client`` query
+                parameter. ``None`` or empty disables lookups.
+            user_agent: Value of the ``User-Agent`` header sent with requests.
+            api_url: Lookup endpoint URL.
+        """
+        self._api_key = api_key
+        self._user_agent = user_agent
+        self._api_url = api_url
+
+    def is_available(self) -> bool:
+        """Return ``True`` when a non-empty API key is configured."""
+        return bool(self._api_key)
+
+    async def lookup(self, fingerprint: Fingerprint) -> RecordingMatch | None:
+        """Identify a recording from its fingerprint.
+
+        Args:
+            fingerprint: Supplies the ``duration_seconds`` and ``fingerprint``
+                values sent to the service.
+
+        Returns:
+            The best match reported by the service, or ``None`` if no API key
+            is configured, the request fails, or the response cannot be
+            decoded or parsed. Failures are logged as warnings, not raised.
+        """
+        if not self._api_key:
+            return None
+        try:
+            await self._throttle()
+            async with httpx.AsyncClient(
+                timeout=_TIMEOUT_SECONDS,
+                headers={"User-Agent": self._user_agent},
+            ) as client:
+                resp = await client.get(
+                    self._api_url,
+                    params={
+                        "client": self._api_key,
+                        "duration": str(fingerprint.duration_seconds),
+                        "fingerprint": fingerprint.fingerprint,
+                        "meta": "recordings releasegroups",
+                        "format": "json",
+                    },
+                )
+            resp.raise_for_status()
+            payload = resp.json()
+        except (httpx.HTTPError, ValueError):
+            # ValueError covers a body that is not valid JSON.
+            log.warning("acoustid_lookup_failed")
+            return None
+
+        try:
+            return _parse_best_match(payload)
+        except (TypeError, ValueError, OverflowError):
+            # A JSON-valid body can still carry fields of an unexpected type
+            # (e.g. a non-numeric ``score``). Degrade to "no match" instead of
+            # letting the error escape into the enrichment pipeline.
+            log.warning("acoustid_parse_failed")
+            return None
+
+    @classmethod
+    async def _throttle(cls) -> None:
+        """Wait until at least ``_MIN_INTERVAL_SECONDS`` separates two calls.
+
+        The lock is held while sleeping so concurrent callers are released one
+        at a time; the timestamp is recorded after the wait, just before the
+        caller proceeds to issue its request.
+        """
+        async with cls._throttle_lock:
+            elapsed = time.monotonic() - cls._last_call_monotonic
+            wait = _MIN_INTERVAL_SECONDS - elapsed
+            if wait > 0:
+                await asyncio.sleep(wait)
+            cls._last_call_monotonic = time.monotonic()
+
+
+def _parse_best_match(payload: object) -> RecordingMatch | None:
+    """Extract the highest-scoring result from a decoded lookup response.
+
+    Args:
+        payload: Decoded JSON body of the lookup response.
+
+    Returns:
+        A ``RecordingMatch`` built from the top-scoring result, or ``None``
+        when the status is not ``"ok"``, there is no usable result, or the
+        best result has no string ``id``. The MusicBrainz id, title, artist
+        and album are taken from the first recording / artist / release group
+        and stay ``None`` when absent or not strings. Malformed input never
+        raises; an unusable ``score`` counts as ``0.0``.
+    """
+    if not isinstance(payload, dict) or payload.get("status") != "ok":
+        return None
+    results = payload.get("results")
+    if not isinstance(results, list):
+        return None
+
+    # Drop non-object entries up front so a junk element can never be picked
+    # over a real result. ``max`` keeps the earliest entry on ties.
+    candidates = [r for r in results if isinstance(r, dict)]
+    if not candidates:
+        return None
+    best = max(candidates, key=lambda r: _coerce_score(r.get("score")))
+
+    acoustid = best.get("id")
+    if not isinstance(acoustid, str):
+        return None
+
+    recording_mbid: str | None = None
+    title: str | None = None
+    artist: str | None = None
+    album: str | None = None
+
+    rec = _head_dict(best.get("recordings"))
+    if rec is not None:
+        recording_mbid = _str_or_none(rec.get("id"))
+        title = _str_or_none(rec.get("title"))
+        first_artist = _head_dict(rec.get("artists"))
+        if first_artist is not None:
+            artist = _str_or_none(first_artist.get("name"))
+        first_group = _head_dict(rec.get("releasegroups"))
+        if first_group is not None:
+            album = _str_or_none(first_group.get("title"))
+
+    return RecordingMatch(
+        acoustid=acoustid,
+        score=_coerce_score(best.get("score")),
+        recording_mbid=recording_mbid,
+        title=title,
+        artist=artist,
+        album=album,
+        year=None,
+    )
+
+
+def _coerce_score(value: object) -> float:
+    """Return *value* as a float, or ``0.0`` when it is not a usable number."""
+    if not isinstance(value, (int, float, str)):
+        return 0.0
+    try:
+        parsed = float(value)
+    except (ValueError, OverflowError):
+        return 0.0
+    # NaN is unordered and would make the ``max`` above order-dependent.
+    return 0.0 if math.isnan(parsed) else parsed
+
+
+def _head_dict(value: object) -> dict[str, object] | None:
+    """Return the first element of *value* if it is a non-empty list headed by a dict."""
+    if isinstance(value, list) and value and isinstance(value[0], dict):
+        return value[0]
+    return None
+
+
+def _str_or_none(value: object) -> str | None:
+    """Return *value* unchanged when it is a string, otherwise ``None``."""
+    return value if isinstance(value, str) else None