Files
mcma-backend/tests/test_real_file_enrichment.py
T
Senko-san 73d7da440f
Docker Build & Publish / build (push) Has been cancelled
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled
feat(enrichment): record status/errors and trust high-confidence AcoustID
Two related gaps surfaced from "uploaded a track, nothing changed / no status":

- A track could stay stuck on `pending` forever (an unexpected worker error
  rolled back the run without recording anything), and `failed` carried no
  reason. Add `tracks.metadata_error` + `tracks.enriched_at` (migration), stamp
  the outcome in apply_enrichment, add TrackRepository.mark_enrichment_failed,
  wrap enrich_task to persist crashes as `failed` in a fresh session, and emit a
  human-readable no-match reason. Expose metadata_error/enriched_at in TrackOut.

- The tag-first merge let junk embedded tags (e.g. "Music Track"/"Sound_13958")
  override even a 0.99-confidence AcoustID match. Add acoustid_trust_score
  (default 0.85): above it the acoustic identity wins for title/artist/album/
  year, tags are fallback; below it, tag-first as before.

Add a license-free real-file fixture (Scarlet Fire / Otis McDonald) whose junk
tags AcoustID overrides, with an always-on tag-reader test plus fpcalc/AcoustID/
network-gated identity + full-pipeline tests (skip on host, run in the container).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-13 13:29:08 +03:00

207 lines
7.0 KiB
Python

"""Enrichment tests against a real audio file (``tests/fixtures/``).

The fixture "Scarlet Fire" by Otis McDonald carries *junk* embedded tags
(``Sound_13958`` / ``Music Track`` / ``Музыка``) yet is identified by AcoustID
with ~0.99 confidence. That makes it the real-world reproduction of the
"uploaded a track, got the wrong name/artist" bug: tag reading must be exact,
and a high-confidence AcoustID match must override the junk tags.

Two layers:

- The tag-reader test is offline and deterministic — it always runs.
- The AcoustID/identity tests need the ``fpcalc`` binary, an AcoustID API key,
  and network. They *skip* (never fail) when those aren't present, honouring
  the project rule that the suite never hard-requires network. They do run
  inside the api/worker container (``make test-api``), which ships fpcalc +
  the key.
"""
import uuid
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
import pytest
from app.application.metadata_service import MetadataEnrichmentService
from app.core.config import get_settings
from app.domain.entities.album import Album
from app.domain.entities.track import Artist, Track
from app.infrastructure.metadata.acoustid import AcoustIdHttpClient
from app.infrastructure.metadata.fingerprint import FpcalcFingerprinter
from app.infrastructure.metadata.tags import MutagenTagReader
# Every test in this module is a coroutine, so the asyncio marker is applied
# module-wide instead of per test.
pytestmark = pytest.mark.asyncio

# The real audio fixture, resolved relative to this test file.
FIXTURE = Path(__file__).parent / "fixtures" / "scarlet_fire_otis_mcdonald.mp3"

# Settings and the fingerprinter are built once at import time and shared by
# the skip gate and the tests below.
_settings = get_settings()
_fpcalc = FpcalcFingerprinter(_settings.fpcalc_path)

# Gate for the network/identity tests — present in the container, absent in CI.
# The key is checked by truthiness rather than ``is not None`` so that the gate
# agrees with ``_acoustid_client``, which hands the client ``api_key=None`` for
# any falsy key. Otherwise a set-but-empty key would pass the gate and the
# networked tests would run without a usable key.
# NOTE(review): assumes an empty secret value is falsy — confirm against the
# settings field type.
requires_acoustid = pytest.mark.skipif(
    not (_fpcalc.is_available() and _settings.acoustid_api_key),
    reason="needs the fpcalc binary + ACOUSTID_API_KEY (+ network)",
)
def _acoustid_client() -> AcoustIdHttpClient:
    """Build an AcoustID HTTP client from the module-level settings.

    A falsy API key is forwarded to the client as ``None``; otherwise the
    secret is unwrapped with ``get_secret_value()``.
    """
    secret = _settings.acoustid_api_key
    api_key = secret.get_secret_value() if secret else None
    return AcoustIdHttpClient(
        api_key=api_key,
        user_agent=_settings.musicbrainz_user_agent,
        api_url=_settings.acoustid_api_url,
    )
# --- offline: tag reading on a real file -----------------------------------
async def test_real_file_embedded_tags_are_read() -> None:
    """Read the fixture's embedded tags and check them verbatim.

    The fixture's tags are junk on purpose; the point is that real-file tag
    parsing returns exactly what is stored, with no network involved.
    """
    assert FIXTURE.exists(), "fixture mp3 missing"

    reader = MutagenTagReader()
    tags = await reader.read(FIXTURE)
    assert tags is not None

    # Textual tags, exactly as embedded in the file.
    assert tags.title == "Sound_13958"
    assert tags.artist == "Music Track"
    assert tags.album == "Музыка"
    assert tags.genre == "Hip Hop & Rap"

    # Numeric tags / stream properties.
    assert tags.year == 2018
    assert tags.duration_seconds == 143
    assert tags.bitrate == 128
# --- networked: AcoustID identifies the real recording ---------------------
@requires_acoustid
async def test_real_file_identified_by_acoustid() -> None:
    """Fingerprint the fixture and look it up on AcoustID.

    The lookup must identify the audio as Scarlet Fire / Otis McDonald with a
    score at or above the configured trust threshold, despite the junk tags.
    A missing fingerprint or a missing match skips the test instead of
    failing it.
    """
    fingerprint = await _fpcalc.calculate(FIXTURE)
    if fingerprint is None:
        pytest.skip("fpcalc produced no fingerprint")

    client = _acoustid_client()
    hit = await client.lookup(fingerprint)
    if hit is None:
        pytest.skip("AcoustID returned no match (network/rate limit?)")

    assert hit.score >= _settings.acoustid_trust_score
    assert hit.title == "Scarlet Fire"
    assert hit.artist == "Otis McDonald"
    assert hit.recording_mbid is not None
@requires_acoustid
async def test_real_file_enrichment_overrides_junk_tags() -> None:
    """Run the full enrichment pipeline on the fixture.

    Uses the real tag reader, fingerprinter and AcoustID client with in-memory
    repositories. The high-confidence match must win over the junk embedded
    tags, so the track ends up as Scarlet Fire / Otis McDonald. A ``failed``
    result is treated as AcoustID being unreachable and skips the test.
    """
    track = _pending_track()
    track_repo = _FakeTrackRepo(track)
    artist_repo = _FakeArtistRepo()
    album_repo = _FakeAlbumRepo()

    service = MetadataEnrichmentService(
        tracks=track_repo,  # type: ignore[arg-type]
        artists=artist_repo,  # type: ignore[arg-type]
        albums=album_repo,  # type: ignore[arg-type]
        storage=_FixtureStorage(),  # type: ignore[arg-type]
        tag_reader=MutagenTagReader(),
        fingerprinter=_fpcalc,
        acoustid=_acoustid_client(),
        acoustid_trust_score=_settings.acoustid_trust_score,
    )

    outcome = await service.enrich(track.id)
    if outcome.status == "failed":
        pytest.skip("AcoustID unavailable at run time (network/rate limit?)")
    assert outcome.status == "enriched"

    # The title written to the track repository comes from AcoustID.
    written = track_repo.applied
    assert written is not None
    assert written["title"] == "Scarlet Fire"

    # The artist was created from the acoustic identity, not the junk tag.
    assert "Otis McDonald" in artist_repo.created
    assert "Music Track" not in artist_repo.created

    # At least one album was requested, and the first is titled "Scarlet Fire".
    assert album_repo.created
    first_album_title = album_repo.created[0][0]
    assert first_album_title == "Scarlet Fire"
# --- minimal in-memory adapters --------------------------------------------
def _pending_track() -> Track:
    """Return a freshly uploaded, not-yet-enriched ``Track`` for the fixture.

    The track has random ids, ``metadata_status="pending"``, no enrichment
    outcome fields set, and the fixture file's real size on disk.
    """
    stamp = datetime.now(UTC)
    return Track(
        id=uuid.uuid4(),
        title="scarlet_fire_otis_mcdonald",  # the upload-time filename stem
        artist_id=uuid.uuid4(),
        album_id=None,
        storage_uri="tracks/sf/scarlet.mp3",
        file_format="mp3",
        file_size=FIXTURE.stat().st_size,
        source="upload",
        source_id="sha-real",
        duration_seconds=None,
        genre=None,
        year=None,
        metadata_status="pending",
        metadata_error=None,
        enriched_at=None,
        created_at=stamp,
        updated_at=stamp,
    )
class _FixtureStorage:
    """In-memory storage stand-in that always resolves to the fixture file."""

    @asynccontextmanager
    async def as_local_path(self, _key: str) -> AsyncIterator[Path]:
        """Yield the fixture path, ignoring the requested storage key."""
        local_path = FIXTURE
        yield local_path
class _FakeTrackRepo:
    """In-memory track repository holding one track.

    ``applied`` stays ``None`` until ``apply_enrichment`` is called, after
    which it holds the keyword arguments of the most recent call.
    """

    def __init__(self, track: Track) -> None:
        self.applied: dict[str, object] | None = None
        self._track = track

    async def get_by_id(self, _track_id: uuid.UUID) -> Track:
        """Return the held track regardless of the id asked for."""
        return self._track

    async def apply_enrichment(self, _track_id: uuid.UUID, **changes: object) -> Track:
        """Record the enrichment keyword arguments and return the held track."""
        self.applied = changes
        return self._track
@dataclass
class _FakeArtistRepo:
    """In-memory artist repository that logs every requested name."""

    # Names passed to ``get_or_create``, in call order.
    created: list[str] = field(default_factory=list)

    async def get_or_create(self, name: str) -> Artist:
        """Log ``name`` and return a new ``Artist`` with a random id."""
        self.created.append(name)
        stamp = datetime.now(UTC)
        artist = Artist(id=uuid.uuid4(), name=name, created_at=stamp, updated_at=stamp)
        return artist
@dataclass
class _FakeAlbumRepo:
    """In-memory album repository that logs every requested album."""

    # ``(title, artist_id)`` pairs passed to ``get_or_create``, in call order.
    created: list[tuple[str, uuid.UUID]] = field(default_factory=list)

    async def get_or_create(
        self, *, title: str, artist_id: uuid.UUID, year: int | None, musicbrainz_id: str | None
    ) -> Album:
        """Log the request and return a new ``Album`` with no cover path."""
        request = (title, artist_id)
        self.created.append(request)
        stamp = datetime.now(UTC)
        album = Album(
            id=uuid.uuid4(),
            title=title,
            artist_id=artist_id,
            year=year,
            cover_path=None,
            musicbrainz_id=musicbrainz_id,
            created_at=stamp,
            updated_at=stamp,
        )
        return album

    async def set_cover_path(self, _album_id: uuid.UUID, _cover_path: str) -> None:
        """Accept a cover-path update and do nothing with it."""
        return None