# mcma-backend/app/infrastructure/metadata/acoustid.py

"""AcoustIdHttpClient — identifies a recording from its Chromaprint fingerprint.

One ``/v2/lookup`` call with ``meta=recordings+releasegroups`` returns the
AcoustID id, the MusicBrainz recording id, and canonical title/artist/album —
metadata that itself originates from MusicBrainz, so a separate MB call is not
needed for Phase 1 (plan §6.2 steps 2-3 collapsed into one request).

Graceful degradation: no API key → ``is_available()`` is False and the whole
fingerprint path is skipped; any network/parse error → ``lookup`` returns
``None``. A small inter-call delay keeps us within AcoustID's rate limit.
"""

import asyncio
import time

import httpx

from app.core.logging import get_logger
from app.domain.entities.metadata import Fingerprint, RecordingMatch

# Module-level logger obtained from the project's logging helper.
log = get_logger(__name__)

# Lookup endpoint used when the caller does not pass ``api_url``.
_DEFAULT_URL = "https://api.acoustid.org/v2/lookup"
# Timeout, in seconds, given to the httpx client created for each lookup.
_TIMEOUT_SECONDS = 10.0
# Minimum spacing, in seconds, enforced between consecutive lookup requests.
_MIN_INTERVAL_SECONDS = 0.34  # AcoustID allows ~3 req/s; stay polite


class AcoustIdHttpClient:
    """Implements :class:`app.domain.ports.AcoustIdClient`."""

    _throttle_lock = asyncio.Lock()
    _last_call_monotonic = 0.0

    def __init__(
        self,
        *,
        api_key: str | None,
        user_agent: str,
        api_url: str = _DEFAULT_URL,
    ) -> None:
        self._api_key = api_key
        self._user_agent = user_agent
        self._api_url = api_url

    def is_available(self) -> bool:
        return bool(self._api_key)

    async def lookup(self, fingerprint: Fingerprint) -> RecordingMatch | None:
        payload = await self._lookup_raw(fingerprint)
        if payload is None:
            return None
        return _parse_best_match(payload)

    async def lookup_all(self, fingerprint: Fingerprint) -> list[RecordingMatch]:
        payload = await self._lookup_raw(fingerprint)
        if payload is None:
            return []
        return _parse_matches(payload)

    async def _lookup_raw(self, fingerprint: Fingerprint) -> object | None:
        if not self._api_key:
            return None
        try:
            await self._throttle()
            async with httpx.AsyncClient(
                timeout=_TIMEOUT_SECONDS,
                headers={"User-Agent": self._user_agent},
            ) as client:
                resp = await client.get(
                    self._api_url,
                    params={
                        "client": self._api_key,
                        "duration": str(fingerprint.duration_seconds),
                        "fingerprint": fingerprint.fingerprint,
                        "meta": "recordings releasegroups",
                        "format": "json",
                    },
                )
            resp.raise_for_status()
            return resp.json()  # type: ignore[no-any-return]
        except httpx.HTTPError, ValueError:
            log.warning("acoustid_lookup_failed")
            return None

    @classmethod
    async def _throttle(cls) -> None:
        async with cls._throttle_lock:
            elapsed = time.monotonic() - cls._last_call_monotonic
            wait = _MIN_INTERVAL_SECONDS - elapsed
            if wait > 0:
                await asyncio.sleep(wait)
            cls._last_call_monotonic = time.monotonic()


# Upper bound on how many of the best-scoring results ``_parse_matches``
# converts into matches; entries that fail to parse are not back-filled.
_MAX_MATCHES = 5


def _parse_best_match(payload: object) -> RecordingMatch | None:
    """Return the top-ranked match parsed from *payload*, or ``None`` if none."""
    ranked = _parse_matches(payload)
    if not ranked:
        return None
    return ranked[0]


def _parse_matches(payload: object) -> list[RecordingMatch]:
    if not isinstance(payload, dict) or payload.get("status") != "ok":
        return []
    results = payload.get("results")
    if not isinstance(results, list) or not results:
        return []

    # Results are returned best-score-first, but sort defensively and cap the
    # number of candidates surfaced to the editor.
    candidates = [r for r in results if isinstance(r, dict)]
    candidates.sort(key=lambda r: r.get("score", 0.0), reverse=True)

    matches: list[RecordingMatch] = []
    for result in candidates[:_MAX_MATCHES]:
        match = _parse_one(result)
        if match is not None:
            matches.append(match)
    return matches


def _parse_one(result: dict[str, object]) -> RecordingMatch | None:
    acoustid = result.get("id")
    if not isinstance(acoustid, str):
        return None
    score = float(result.get("score", 0.0))  # type: ignore[arg-type]

    recording_mbid: str | None = None
    release_group_mbid: str | None = None
    title: str | None = None
    artist: str | None = None
    album: str | None = None

    recordings = result.get("recordings")
    if isinstance(recordings, list) and recordings and isinstance(recordings[0], dict):
        rec = recordings[0]
        recording_mbid = rec.get("id") if isinstance(rec.get("id"), str) else None
        title = rec.get("title") if isinstance(rec.get("title"), str) else None
        artists = rec.get("artists")
        if isinstance(artists, list) and artists and isinstance(artists[0], dict):
            name = artists[0].get("name")
            artist = name if isinstance(name, str) else None
        groups = rec.get("releasegroups")
        if isinstance(groups, list) and groups and isinstance(groups[0], dict):
            group = groups[0]
            gtitle = group.get("title")
            album = gtitle if isinstance(gtitle, str) else None
            gid = group.get("id")
            release_group_mbid = gid if isinstance(gid, str) else None

    return RecordingMatch(
        acoustid=acoustid,
        score=score,
        recording_mbid=recording_mbid,
        release_group_mbid=release_group_mbid,
        title=title,
        artist=artist,
        album=album,
        year=None,
    )