feat(enrichment): record status/errors and trust high-confidence AcoustID
Docker Build & Publish / build (push) Has been cancelled
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled

Two related gaps surfaced from "uploaded a track, nothing changed / no status":

- A track could stay stuck on `pending` forever (an unexpected worker error
  rolled back the run without recording anything), and `failed` carried no
  reason. Add `tracks.metadata_error` + `tracks.enriched_at` (migration), stamp
  the outcome in apply_enrichment, add TrackRepository.mark_enrichment_failed,
  wrap enrich_task to persist crashes as `failed` in a fresh session, and emit a
  human-readable no-match reason. Expose metadata_error/enriched_at in TrackOut.

- The tag-first merge let junk embedded tags (e.g. "Music Track"/"Sound_13958")
  override even a 0.99-confidence AcoustID match. Add acoustid_trust_score
  (default 0.85): above it the acoustic identity wins for title/artist/album/
  year, tags are fallback; below it, tag-first as before.

Add a license-free real-file fixture (Scarlet Fire / Otis McDonald) whose junk
tags AcoustID overrides, with an always-on tag-reader test plus fpcalc/AcoustID/
network-gated identity + full-pipeline tests (skip on host, run in the container).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-13 13:29:08 +03:00
parent 30cb8901f2
commit 73d7da440f
17 changed files with 468 additions and 33 deletions
+20
View File
@@ -0,0 +1,20 @@
# Test fixtures
## `scarlet_fire_otis_mcdonald.mp3`
"Scarlet Fire" by **Otis McDonald** — a royalty-free / license-free track
(YouTube Audio Library; distributed via Pro-Sound.org). Used as a real-world
audio fixture for the enrichment pipeline.
What makes it a good fixture: its **embedded ID3 tags are junk**
(`title=Sound_13958`, `artist=Music Track`, `album=Музыка`, `genre=Hip Hop & Rap`)
while AcoustID identifies it with very high confidence as *Scarlet Fire /
Otis McDonald*. So it exercises both:
- the offline tag reader (deterministic, always runs), and
- the "trust a high-confidence AcoustID match over junk tags" path
(`acoustid_trust_score`), which only runs when `fpcalc` + an AcoustID API key
+ network are available — see `tests/test_real_file_enrichment.py`.
Because it's license-free, it may also seed a built-in demo track for fresh
instances.
Binary file not shown.
+4
View File
@@ -52,6 +52,8 @@ class FakeTrackRepo:
genre=None,
year=None,
metadata_status=str(kw["metadata_status"]),
metadata_error=None,
enriched_at=None,
created_at=now,
updated_at=now,
)
@@ -133,6 +135,8 @@ async def test_dedup_skips_already_imported() -> None:
genre=None,
year=None,
metadata_status="pending",
metadata_error=None,
enriched_at=None,
created_at=now,
updated_at=now,
)
+80 -10
View File
@@ -39,6 +39,8 @@ def _track(*, metadata_status: str = "pending", title: str = "raw-stem") -> Trac
genre=None,
year=None,
metadata_status=metadata_status,
metadata_error=None,
enriched_at=None,
created_at=now,
updated_at=now,
)
@@ -251,6 +253,33 @@ async def test_nothing_found_marks_failed() -> None:
assert applied is not None
assert applied["artist_id"] == track.artist_id # fallback kept
assert applied["metadata_status"] == "failed"
# A failed run records a human-readable reason; here both id steps were
# available, so it's the generic "no match" message.
assert applied["metadata_error"] == "No metadata match found in tags or AcoustID."
async def test_failed_reason_names_unavailable_fingerprinter() -> None:
    """A failed run with ``fp_available=False`` records a reason mentioning fingerprinting."""
    pending = _track()
    service, track_repo, _, _, _ = _service(
        track=pending,
        tags=None,
        fp=None,
        fp_available=False,
    )

    outcome = await service.enrich(pending.id)

    assert outcome.status == "failed"
    recorded = track_repo.applied
    assert recorded is not None
    # The stored reason is free text; only the keyword is pinned here.
    assert "fingerprinting" in str(recorded["metadata_error"])
async def test_successful_enrich_clears_error() -> None:
    """An enriched run writes ``metadata_error=None`` alongside the new status."""
    pending = _track()
    tag_data = AudioTags(artist="Pink Floyd")
    service, track_repo, _, _, _ = _service(track=pending, tags=tag_data)

    outcome = await service.enrich(pending.id)

    assert outcome.status == "enriched"
    recorded = track_repo.applied
    assert recorded is not None
    assert recorded["metadata_error"] is None
async def test_acoustid_path_fills_when_tags_absent() -> None:
@@ -281,13 +310,14 @@ async def test_acoustid_path_fills_when_tags_absent() -> None:
assert "Daft Punk" in artists.created
async def test_tags_win_over_acoustid_for_overlapping_fields() -> None:
async def test_tags_win_over_low_confidence_acoustid() -> None:
track = _track()
fp = Fingerprint(fingerprint="AQAA", duration_seconds=200)
tags = AudioTags(title="Tagged Title", artist="Tagged Artist")
# Below the 0.85 trust threshold → keep tag-first.
match = RecordingMatch(
acoustid="aid",
score=0.9,
score=0.5,
recording_mbid="mbid",
title="AcoustID Title",
artist="AcoustID Artist",
@@ -306,6 +336,36 @@ async def test_tags_win_over_acoustid_for_overlapping_fields() -> None:
assert applied["musicbrainz_id"] == "mbid"
async def test_high_confidence_acoustid_wins_over_junk_tags() -> None:
    """A 0.98-score match must supply title/artist/album instead of junk tags."""
    pending = _track()
    fingerprint = Fingerprint(fingerprint="AQAA", duration_seconds=200)
    # Reproduces the reported bug: a downloaded file with meaningless embedded
    # tags, identified almost certainly by its acoustic fingerprint.
    junk_tags = AudioTags(title="Sound_13958", artist="Music Track", album="Музыка")
    confident_match = RecordingMatch(
        acoustid="aid",
        score=0.98,
        recording_mbid="mbid",
        release_group_mbid="rg",
        title="Scarlet Fire",
        artist="Otis McDonald",
        album="Scarlet Fire",
    )
    service, track_repo, artist_repo, album_repo, _acoustid = _service(
        track=pending,
        tags=junk_tags,
        fp=fingerprint,
        match=confident_match,
    )

    await service.enrich(pending.id)

    recorded = track_repo.applied
    assert recorded is not None
    assert recorded["title"] == "Scarlet Fire"
    assert "Otis McDonald" in artist_repo.created
    assert "Music Track" not in artist_repo.created
    assert album_repo.created and album_repo.created[0][0] == "Scarlet Fire"
    assert recorded["metadata_status"] == "enriched"
async def test_fingerprint_skipped_when_acoustid_unavailable() -> None:
track = _track()
fp = Fingerprint(fingerprint="AQAA", duration_seconds=200)
@@ -356,8 +416,10 @@ async def test_cover_extracted_from_embedded_art() -> None:
extractor = FakeCoverExtractor(_PNG)
provider = FakeCoverProvider(_JPG)
service, albums, storage = _cover_service(
track=track, tags=AudioTags(album="The Wall", artist="PF"),
extractor=extractor, provider=provider,
track=track,
tags=AudioTags(album="The Wall", artist="PF"),
extractor=extractor,
provider=provider,
)
await service.enrich(track.id)
@@ -377,8 +439,12 @@ async def test_cover_falls_back_to_archive() -> None:
match = RecordingMatch(acoustid="ac", score=1.0, release_group_mbid="rg-123", album="The Wall")
fp = Fingerprint(fingerprint="AQAA", duration_seconds=200)
service, albums, storage = _cover_service(
track=track, tags=AudioTags(album="The Wall", artist="PF"),
match=match, fp=fp, extractor=extractor, provider=provider,
track=track,
tags=AudioTags(album="The Wall", artist="PF"),
match=match,
fp=fp,
extractor=extractor,
provider=provider,
)
await service.enrich(track.id)
@@ -394,8 +460,10 @@ async def test_cover_not_fetched_without_release_group() -> None:
track = _track()
provider = FakeCoverProvider(_JPG)
service, albums, _ = _cover_service(
track=track, tags=AudioTags(album="The Wall", artist="PF"),
extractor=FakeCoverExtractor(None), provider=provider,
track=track,
tags=AudioTags(album="The Wall", artist="PF"),
extractor=FakeCoverExtractor(None),
provider=provider,
)
await service.enrich(track.id)
@@ -408,8 +476,10 @@ async def test_existing_cover_is_not_overwritten() -> None:
track = _track()
extractor = FakeCoverExtractor(_PNG)
service, albums, storage = _cover_service(
track=track, tags=AudioTags(album="The Wall", artist="PF"),
extractor=extractor, existing_cover="covers/old.jpg",
track=track,
tags=AudioTags(album="The Wall", artist="PF"),
extractor=extractor,
existing_cover="covers/old.jpg",
)
await service.enrich(track.id)
+206
View File
@@ -0,0 +1,206 @@
"""Enrichment tests against a real audio file (``tests/fixtures/``).
The fixture "Scarlet Fire" by Otis McDonald carries *junk* embedded tags
(``Sound_13958`` / ``Music Track`` / ``Музыка``) yet is identified by AcoustID
with ~0.99 confidence. That makes it the real-world reproduction of the
"uploaded a track, got the wrong name/artist" bug: tag reading must be exact,
and a high-confidence AcoustID match must override the junk tags.
Two layers:
- The tag-reader test is offline and deterministic — it always runs.
- The AcoustID/identity tests need the ``fpcalc`` binary, an AcoustID API key,
and network. They *skip* (never fail) when those aren't present, honouring the
project rule that the suite never hard-requires network. They do run inside the
api/worker container (``make test-api``), which ships fpcalc + the key.
"""
import uuid
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
import pytest
from app.application.metadata_service import MetadataEnrichmentService
from app.core.config import get_settings
from app.domain.entities.album import Album
from app.domain.entities.track import Artist, Track
from app.infrastructure.metadata.acoustid import AcoustIdHttpClient
from app.infrastructure.metadata.fingerprint import FpcalcFingerprinter
from app.infrastructure.metadata.tags import MutagenTagReader
# Every test in this module is a coroutine.
pytestmark = pytest.mark.asyncio

# The real audio file all tests in this module operate on.
FIXTURE = Path(__file__).parent.joinpath("fixtures", "scarlet_fire_otis_mcdonald.mp3")

_settings = get_settings()
_fpcalc = FpcalcFingerprinter(_settings.fpcalc_path)

# Gate for the network/identity tests: they are skipped unless both the fpcalc
# binary and an AcoustID API key are present (true in the container, not in CI).
requires_acoustid = pytest.mark.skipif(
    not _fpcalc.is_available() or _settings.acoustid_api_key is None,
    reason="needs the fpcalc binary + ACOUSTID_API_KEY (+ network)",
)
def _acoustid_client() -> AcoustIdHttpClient:
    """Build a real AcoustID HTTP client from the loaded settings.

    The API key is unwrapped from its secret container when one is
    configured; otherwise ``None`` is passed through.
    """
    secret = _settings.acoustid_api_key
    if secret:
        plain_key = secret.get_secret_value()
    else:
        plain_key = None
    return AcoustIdHttpClient(
        api_key=plain_key,
        user_agent=_settings.musicbrainz_user_agent,
        api_url=_settings.acoustid_api_url,
    )
# --- offline: tag reading on a real file -----------------------------------
async def test_real_file_embedded_tags_are_read() -> None:
    """Read the fixture's embedded tags and check them verbatim.

    The values are junk, but they are what the file really carries, so this
    pins real-file tag parsing without touching the network.
    """
    assert FIXTURE.exists(), "fixture mp3 missing"

    parsed = await MutagenTagReader().read(FIXTURE)
    assert parsed is not None

    observed = (
        parsed.title,
        parsed.artist,
        parsed.album,
        parsed.genre,
        parsed.year,
        parsed.duration_seconds,
        parsed.bitrate,
    )
    assert observed == (
        "Sound_13958",
        "Music Track",
        "Музыка",
        "Hip Hop & Rap",
        2018,
        143,
        128,
    )
# --- networked: AcoustID identifies the real recording ---------------------
@requires_acoustid
async def test_real_file_identified_by_acoustid() -> None:
    """Fingerprint the fixture and look it up on AcoustID.

    The recording must come back as Scarlet Fire / Otis McDonald at or above
    the configured trust score, regardless of the junk embedded tags. A
    missing fingerprint or a missing match skips the test rather than
    failing it.
    """
    print_ = await _fpcalc.calculate(FIXTURE)
    if print_ is None:
        pytest.skip("fpcalc produced no fingerprint")

    found = await _acoustid_client().lookup(print_)
    if found is None:
        pytest.skip("AcoustID returned no match (network/rate limit?)")

    assert found.score >= _settings.acoustid_trust_score
    assert found.title == "Scarlet Fire"
    assert found.artist == "Otis McDonald"
    assert found.recording_mbid is not None
@requires_acoustid
async def test_real_file_enrichment_overrides_junk_tags() -> None:
    """Run the full pipeline on the real file with real adapters.

    Uses the real tag reader, fingerprinter and AcoustID client with in-memory
    repositories. The high-confidence match must win over the junk embedded
    tags, so the track is stored as Scarlet Fire / Otis McDonald.

    The test skips (never fails) when AcoustID did not contribute at run time,
    e.g. because of a network error or rate limit.
    """
    track = _pending_track()
    tracks = _FakeTrackRepo(track)
    artists = _FakeArtistRepo()
    albums = _FakeAlbumRepo()
    service = MetadataEnrichmentService(
        tracks=tracks,  # type: ignore[arg-type]
        artists=artists,  # type: ignore[arg-type]
        albums=albums,  # type: ignore[arg-type]
        storage=_FixtureStorage(),  # type: ignore[arg-type]
        tag_reader=MutagenTagReader(),
        fingerprinter=_fpcalc,
        acoustid=_acoustid_client(),
        acoustid_trust_score=_settings.acoustid_trust_score,
    )

    result = await service.enrich(track.id)
    if result.status == "failed":
        pytest.skip("AcoustID unavailable at run time (network/rate limit?)")
    assert result.status == "enriched"

    applied = tracks.applied
    assert applied is not None
    # The fixture's own (junk) tags are readable, and a tags-only run is still
    # reported as "enriched", so the status alone cannot show whether AcoustID
    # answered. A missing MusicBrainz recording id means no match was merged:
    # skip instead of failing on the junk title below.
    if applied.get("musicbrainz_id") is None:
        pytest.skip("AcoustID unavailable at run time (network/rate limit?)")

    assert applied["title"] == "Scarlet Fire"
    assert "Otis McDonald" in artists.created
    assert "Music Track" not in artists.created
    assert albums.created and albums.created[0][0] == "Scarlet Fire"
# --- minimal in-memory adapters --------------------------------------------
def _pending_track() -> Track:
    """Return a fresh, not-yet-enriched ``Track`` pointing at the fixture.

    Ids are random, the title is the upload-time filename stem, and the file
    size is taken from the fixture on disk.
    """
    stamp = datetime.now(UTC)
    track_id = uuid.uuid4()
    placeholder_artist_id = uuid.uuid4()
    size_on_disk = FIXTURE.stat().st_size
    return Track(
        id=track_id,
        title="scarlet_fire_otis_mcdonald",  # the upload-time filename stem
        artist_id=placeholder_artist_id,
        album_id=None,
        storage_uri="tracks/sf/scarlet.mp3",
        file_format="mp3",
        file_size=size_on_disk,
        source="upload",
        source_id="sha-real",
        duration_seconds=None,
        genre=None,
        year=None,
        metadata_status="pending",
        metadata_error=None,
        enriched_at=None,
        created_at=stamp,
        updated_at=stamp,
    )
class _FixtureStorage:
    """Storage stand-in that resolves every key to the on-disk fixture file."""

    @asynccontextmanager
    async def as_local_path(self, _key: str) -> AsyncIterator[Path]:
        """Yield the fixture path; the requested key is ignored."""
        local_file = FIXTURE
        yield local_file
class _FakeTrackRepo:
    """In-memory track repository holding a single track.

    ``applied`` stays ``None`` until ``apply_enrichment`` is called, after
    which it holds the keyword arguments of the most recent call.
    """

    def __init__(self, track: Track) -> None:
        self.applied: dict[str, object] | None = None
        self._track = track

    async def get_by_id(self, _track_id: uuid.UUID) -> Track:
        """Return the single stored track, whatever id is requested."""
        return self._track

    async def apply_enrichment(self, _track_id: uuid.UUID, **kw: object) -> Track:
        """Record the enrichment fields for later assertions and return the track."""
        self.applied = kw
        return self._track
@dataclass
class _FakeArtistRepo:
    """In-memory artist repository that logs every requested name."""

    # Names passed to get_or_create, in call order.
    created: list[str] = field(default_factory=list)

    async def get_or_create(self, name: str) -> Artist:
        """Log ``name`` and return a brand-new ``Artist`` carrying it."""
        stamp = datetime.now(UTC)
        self.created.append(name)
        return Artist(
            id=uuid.uuid4(),
            name=name,
            created_at=stamp,
            updated_at=stamp,
        )
@dataclass
class _FakeAlbumRepo:
    """In-memory album repository that logs every requested album."""

    # (title, artist_id) pairs passed to get_or_create, in call order.
    created: list[tuple[str, uuid.UUID]] = field(default_factory=list)

    async def get_or_create(
        self,
        *,
        title: str,
        artist_id: uuid.UUID,
        year: int | None,
        musicbrainz_id: str | None,
    ) -> Album:
        """Log the request and return a new cover-less ``Album`` built from it."""
        stamp = datetime.now(UTC)
        request_key = (title, artist_id)
        self.created.append(request_key)
        return Album(
            id=uuid.uuid4(),
            title=title,
            artist_id=artist_id,
            year=year,
            cover_path=None,
            musicbrainz_id=musicbrainz_id,
            created_at=stamp,
            updated_at=stamp,
        )

    async def set_cover_path(self, _album_id: uuid.UUID, _cover_path: str) -> None:
        """Accept and discard a cover path; this fake stores no covers."""
        return None