feat(enrichment): tag-first metadata pipeline (§1D)
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled
Docker Build & Publish / build (push) Failing after 10m8s

Implements the §6.2 enrichment pipeline: embedded tags → Chromaprint
fingerprint → AcoustID lookup. Well-tagged files get correct
artist/album/title offline; the rest are identified via AcoustID
(which also yields a MusicBrainz recording id in one call).

- domain: AudioTags/Fingerprint/RecordingMatch value objects; ports
  AudioTagReader, AudioFingerprinter, AcoustIdClient; TrackRepository
  .apply_enrichment (gap-fill, never erases) + AlbumRepository.get_or_create
- infrastructure/metadata: MutagenTagReader, FpcalcFingerprinter,
  AcoustIdHttpClient (rich meta=recordings+releasegroups, throttled)
- application: MetadataEnrichmentService — tags preferred, AcoustID fills
  gaps; resolves artist/album; status enriched/failed; skips manual;
  every external step wrapped (graceful degradation)
- workers: enrich_task registered; enqueue_enrich is best-effort and
  deferred so the caller's txn commits before the worker reads the row
- wiring: upload enqueues after add; import returns imported_ids and
  enqueues post-commit (mid-scan would race the worker); manual
  POST /tracks/{id}/metadata/enrich endpoint
- deps: add mutagen (fpcalc/ffmpeg already in the image)

Tests: metadata service orchestration, AcoustID parser, tag helpers.
125 passed; mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-09 13:04:02 +03:00
parent 48e3418c7f
commit c72d19599a
24 changed files with 1934 additions and 763 deletions
+75
View File
@@ -0,0 +1,75 @@
"""Unit tests for the AcoustID response parser — pure, no network."""
from app.infrastructure.metadata.acoustid import _parse_best_match
def _payload_with_results(results: list[object]) -> dict[str, object]:
    """Wrap ``results`` in a response payload whose status is ``"ok"``."""
    payload: dict[str, object] = {"status": "ok"}
    payload["results"] = results
    return payload
def test_parses_full_recording() -> None:
    """A result carrying one complete recording fills every field of the match."""
    recording = {
        "id": "mb-rec-1",
        "title": "One More Time",
        "artists": [{"id": "a1", "name": "Daft Punk"}],
        "releasegroups": [{"id": "rg1", "title": "Discovery"}],
    }
    result = {"id": "acoustid-1", "score": 0.97, "recordings": [recording]}

    match = _parse_best_match(_payload_with_results([result]))

    assert match is not None
    # Identifiers: the AcoustID result id and the nested recording id.
    assert match.acoustid == "acoustid-1"
    assert match.recording_mbid == "mb-rec-1"
    # Descriptive fields taken from the recording, its artists and release groups.
    assert match.title == "One More Time"
    assert match.artist == "Daft Punk"
    assert match.album == "Discovery"
    assert match.score == 0.97
def test_picks_highest_score() -> None:
    """Given two results, the parser returns the one with the higher score."""
    weak = {"id": "low", "score": 0.40, "recordings": [{"id": "r-low", "title": "Low"}]}
    strong = {"id": "high", "score": 0.92, "recordings": [{"id": "r-high", "title": "High"}]}

    # The weaker result is listed first, so "first wins" would fail this test.
    best = _parse_best_match(_payload_with_results([weak, strong]))

    assert best is not None
    assert best.acoustid == "high"
    assert best.title == "High"
def test_result_without_recordings_still_returns_id() -> None:
    """A result with no ``recordings`` key still yields a match carrying its id."""
    bare_result = {"id": "acoustid-only", "score": 0.5}

    match = _parse_best_match(_payload_with_results([bare_result]))

    assert match is not None
    assert match.acoustid == "acoustid-only"
    # Without recordings there is nothing to take an MBID or title from.
    assert match.recording_mbid is None
    assert match.title is None
def test_error_status_returns_none() -> None:
    """A payload whose status is ``"error"`` produces no match."""
    match = _parse_best_match({"status": "error", "error": {"message": "bad"}})
    assert match is None
def test_empty_results_returns_none() -> None:
    """An ``"ok"`` payload with an empty results list produces no match."""
    empty_payload = _payload_with_results([])
    assert _parse_best_match(empty_payload) is None
def test_non_dict_payload_returns_none() -> None:
    """Payloads that are not dicts (a string, ``None``) produce no match."""
    from_string = _parse_best_match("nonsense")
    from_none = _parse_best_match(None)

    assert from_string is None
    assert from_none is None
+283
View File
@@ -0,0 +1,283 @@
"""Unit tests for MetadataEnrichmentService — DB-free, in-memory fakes.
Covers the §6.2 orchestration contract: tag-first merge, AcoustID fallback,
artist/album resolution, status transitions, and the hard invariants
(``manual`` untouched, graceful degradation, idempotent gap-fill).
"""
import datetime as dt
import uuid
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from pathlib import Path
import pytest
from app.application.metadata_service import MetadataEnrichmentService
from app.domain.entities import Artist, Track
from app.domain.entities.album import Album
from app.domain.entities.metadata import AudioTags, Fingerprint, RecordingMatch
# Module-level pytest marker: applies the ``asyncio`` mark to every test in this file.
pytestmark = pytest.mark.asyncio
# NOTE(review): presumably the display name of the placeholder artist; it is not
# referenced by any of the tests visible here — confirm whether it is still needed.
_UNKNOWN = "Unknown Artist"
def _track(*, metadata_status: str = "pending", title: str = "raw-stem") -> Track:
    """Build a ``Track`` fixture with fresh random ids and no album.

    Only ``metadata_status`` and ``title`` vary between tests; every other
    field is a fixed placeholder, and both timestamps share one "now" value.
    """
    timestamp = dt.datetime.now(dt.UTC)
    track_id = uuid.uuid4()
    # Stands in for the id of the "Unknown Artist" the track starts out with.
    unknown_artist_id = uuid.uuid4()
    return Track(
        id=track_id,
        title=title,
        artist_id=unknown_artist_id,
        album_id=None,
        storage_uri="tracks/aa/song.mp3",
        file_format="mp3",
        file_size=123,
        source="upload",
        source_id="deadbeef",
        duration_seconds=None,
        genre=None,
        year=None,
        metadata_status=metadata_status,
        created_at=timestamp,
        updated_at=timestamp,
    )
class FakeTrackRepo:
    """In-memory track repository that serves one preset track and records writes."""

    def __init__(self, track: Track | None) -> None:
        # ``applied`` stays None until apply_enrichment is called, which lets
        # tests assert that nothing was written.
        self.applied: dict[str, object] | None = None
        self._track = track

    async def get_by_id(self, track_id: uuid.UUID) -> Track | None:
        """Return the preset track (or ``None``) whatever ``track_id`` is."""
        return self._track

    async def apply_enrichment(self, track_id: uuid.UUID, **kw: object) -> Track:
        """Capture the keyword arguments of the write and return the preset track."""
        self.applied = kw
        stored = self._track
        return stored  # type: ignore[return-value]
class FakeArtistRepo:
    """In-memory artist repository that logs every requested name."""

    def __init__(self) -> None:
        # Names passed to get_or_create, in call order.
        self.created: list[str] = []

    async def get_or_create(self, name: str) -> Artist:
        """Record ``name`` and return a brand-new ``Artist`` carrying it."""
        self.created.append(name)
        timestamp = dt.datetime.now(dt.UTC)
        return Artist(
            id=uuid.uuid4(),
            name=name,
            created_at=timestamp,
            updated_at=timestamp,
        )
class FakeAlbumRepo:
    """In-memory album repository that logs every requested (title, artist) pair."""

    def __init__(self) -> None:
        # (title, artist_id) tuples passed to get_or_create, in call order.
        self.created: list[tuple[str, uuid.UUID]] = []

    async def get_or_create(
        self, *, title: str, artist_id: uuid.UUID, year: int | None, musicbrainz_id: str | None
    ) -> Album:
        """Record the request and return a brand-new ``Album`` with no cover."""
        request = (title, artist_id)
        self.created.append(request)
        timestamp = dt.datetime.now(dt.UTC)
        album = Album(
            id=uuid.uuid4(),
            title=title,
            artist_id=artist_id,
            year=year,
            cover_path=None,
            musicbrainz_id=musicbrainz_id,
            created_at=timestamp,
            updated_at=timestamp,
        )
        return album
class FakeStorage:
    """Storage stand-in that maps a key to a path under ``/tmp`` without touching disk."""

    @asynccontextmanager
    async def as_local_path(self, key: str) -> AsyncIterator[Path]:
        """Yield ``/tmp/<key>`` as the local path for ``key``; no file is created."""
        local_path = Path("/tmp", key)
        yield local_path
class FakeTagReader:
    """Tag reader stand-in that returns the same preset tags for any path."""

    def __init__(self, tags: AudioTags | None) -> None:
        self._tags = tags

    async def read(self, path: Path) -> AudioTags | None:
        """Return the preset tags (or ``None``); ``path`` is ignored."""
        preset = self._tags
        return preset
class FakeFingerprinter:
    """Fingerprinter stand-in with a preset result and a configurable availability flag."""

    def __init__(self, fp: Fingerprint | None, *, available: bool = True) -> None:
        self._available = available
        self._fp = fp

    def is_available(self) -> bool:
        """Report the availability flag given at construction."""
        return self._available

    async def calculate(self, path: Path) -> Fingerprint | None:
        """Return the preset fingerprint (or ``None``); ``path`` is ignored."""
        return self._fp
class FakeAcoustId:
    """AcoustID client stand-in that counts lookups and returns a preset match."""

    def __init__(self, match: RecordingMatch | None, *, available: bool = True) -> None:
        # Number of lookup() calls so far; tests assert on it.
        self.calls = 0
        self._available = available
        self._match = match

    def is_available(self) -> bool:
        """Report the availability flag given at construction."""
        return self._available

    async def lookup(self, fingerprint: Fingerprint) -> RecordingMatch | None:
        """Count the call and return the preset match; ``fingerprint`` is ignored."""
        self.calls = self.calls + 1
        return self._match
def _service(
    *,
    track: Track | None,
    tags: AudioTags | None = None,
    fp: Fingerprint | None = None,
    match: RecordingMatch | None = None,
    fp_available: bool = True,
    acoustid_available: bool = True,
) -> tuple[MetadataEnrichmentService, FakeTrackRepo, FakeArtistRepo, FakeAlbumRepo, FakeAcoustId]:
    """Wire a ``MetadataEnrichmentService`` to in-memory fakes.

    Returns the service together with the track, artist and album repositories
    and the AcoustID fake, in that order, so tests can inspect what was recorded.
    """
    track_repo = FakeTrackRepo(track)
    artist_repo = FakeArtistRepo()
    album_repo = FakeAlbumRepo()
    acoustid_client = FakeAcoustId(match, available=acoustid_available)
    fingerprinter = FakeFingerprinter(fp, available=fp_available)
    tag_reader = FakeTagReader(tags)
    storage = FakeStorage()

    enrichment = MetadataEnrichmentService(
        tracks=track_repo,  # type: ignore[arg-type]
        artists=artist_repo,  # type: ignore[arg-type]
        albums=album_repo,  # type: ignore[arg-type]
        storage=storage,  # type: ignore[arg-type]
        tag_reader=tag_reader,  # type: ignore[arg-type]
        fingerprinter=fingerprinter,  # type: ignore[arg-type]
        acoustid=acoustid_client,  # type: ignore[arg-type]
    )
    return enrichment, track_repo, artist_repo, album_repo, acoustid_client
async def test_tags_only_enriches_and_relinks_artist_and_album() -> None:
    """Embedded tags alone enrich the track and resolve its artist and album."""
    track = _track()
    embedded = AudioTags(
        title="Real Title",
        artist="Pink Floyd",
        album="The Wall",
        genre="Rock",
        year=1979,
        track_number=1,
        duration_seconds=222,
    )
    service, tracks, artists, albums, acoustid = _service(track=track, tags=embedded)

    outcome = await service.enrich(track.id)

    assert outcome.status == "enriched"
    # No fingerprint was supplied, so there is nothing to look up.
    assert acoustid.calls == 0
    assert "Pink Floyd" in artists.created
    assert albums.created
    assert albums.created[0][0] == "The Wall"

    written = tracks.applied
    assert written is not None
    expected_fields: dict[str, object] = {
        "title": "Real Title",
        "genre": "Rock",
        "year": 1979,
        "track_number": 1,
        "duration_seconds": 222,
        "metadata_status": "enriched",
    }
    for field, value in expected_fields.items():
        assert written[field] == value
async def test_manual_track_is_never_touched() -> None:
    """A track whose status is ``manual`` is skipped and nothing is written."""
    manual_track = _track(metadata_status="manual")
    service, tracks, *_ = _service(track=manual_track, tags=AudioTags(artist="X"))

    outcome = await service.enrich(manual_track.id)

    assert outcome.status == "skipped"
    # apply_enrichment was never called on the repository.
    assert tracks.applied is None
async def test_missing_track_is_a_clean_noop() -> None:
    """Enriching an id the repository cannot find is skipped without any write."""
    service, tracks, *_ = _service(track=None)
    unknown_id = uuid.uuid4()

    outcome = await service.enrich(unknown_id)

    assert outcome.status == "skipped"
    assert tracks.applied is None
async def test_nothing_found_marks_failed() -> None:
    """With neither tags nor a fingerprint the track is marked ``failed``."""
    track = _track()
    # Neither source of identity is available: no tags and no fingerprint.
    service, tracks, artists, albums, _acoustid = _service(track=track, tags=None, fp=None)

    outcome = await service.enrich(track.id)

    assert outcome.status == "failed"
    # No artist or album was resolved, so the original unknown artist stays.
    assert artists.created == []
    assert albums.created == []

    written = tracks.applied
    assert written is not None
    assert written["artist_id"] == track.artist_id  # fallback kept
    assert written["metadata_status"] == "failed"
async def test_acoustid_path_fills_when_tags_absent() -> None:
    """Without tags, a fingerprint plus an AcoustID match supplies the metadata."""
    track = _track()
    fingerprint = Fingerprint(fingerprint="AQAAxyz", duration_seconds=200)
    identified = RecordingMatch(
        acoustid="acoustid-uuid",
        score=0.95,
        recording_mbid="mb-recording-id",
        title="Identified Title",
        artist="Daft Punk",
        album="Discovery",
    )
    service, tracks, artists, _albums, acoustid = _service(
        track=track,
        tags=None,
        fp=fingerprint,
        match=identified,
    )

    outcome = await service.enrich(track.id)

    assert outcome.status == "enriched"
    assert outcome.matched_mbid == "mb-recording-id"
    assert acoustid.calls == 1

    written = tracks.applied
    assert written is not None
    assert written["title"] == "Identified Title"
    assert written["musicbrainz_id"] == "mb-recording-id"
    assert written["acoustid_fingerprint"] == "acoustid-uuid"
    assert "Daft Punk" in artists.created
async def test_tags_win_over_acoustid_for_overlapping_fields() -> None:
    """When tags and AcoustID both give title and artist, the tag values are used."""
    track = _track()
    embedded = AudioTags(title="Tagged Title", artist="Tagged Artist")
    fingerprint = Fingerprint(fingerprint="AQAA", duration_seconds=200)
    identified = RecordingMatch(
        acoustid="aid",
        score=0.9,
        recording_mbid="mbid",
        title="AcoustID Title",
        artist="AcoustID Artist",
    )
    service, tracks, artists, _albums, _acoustid = _service(
        track=track,
        tags=embedded,
        fp=fingerprint,
        match=identified,
    )

    await service.enrich(track.id)

    written = tracks.applied
    assert written is not None
    # Overlapping fields come from the tags...
    assert written["title"] == "Tagged Title"
    assert "Tagged Artist" in artists.created
    # ...while the MBID, which only AcoustID provides, is still captured.
    assert written["musicbrainz_id"] == "mbid"
async def test_fingerprint_skipped_when_acoustid_unavailable() -> None:
    """With AcoustID unavailable, tags still enrich and no lookup is attempted."""
    track = _track()
    fingerprint = Fingerprint(fingerprint="AQAA", duration_seconds=200)
    service, *_, acoustid = _service(
        track=track,
        tags=AudioTags(artist="Tagged"),
        fp=fingerprint,
        acoustid_available=False,
    )

    outcome = await service.enrich(track.id)

    # The lookup is never attempted, yet the tags alone are enough to enrich.
    assert acoustid.calls == 0
    assert outcome.status == "enriched"
+27
View File
@@ -0,0 +1,27 @@
"""Unit tests for the mutagen tag-parsing helpers — pure, no files."""
from app.infrastructure.metadata.tags import _first, _parse_track_number, _parse_year
def test_first_takes_head_of_list() -> None:
    """``_first`` returns the first list item or a bare string, else ``None``."""
    # Values that carry text: a multi-item list and a plain string.
    from_list = _first(["Pink Floyd", "other"])
    from_string = _first("Solo")
    assert from_list == "Pink Floyd"
    assert from_string == "Solo"

    # Values that carry nothing usable.
    assert _first([]) is None
    assert _first(None) is None
    assert _first([" "]) is None  # whitespace-only → None
def test_parse_year_extracts_four_digits() -> None:
    """``_parse_year`` returns the year from bare or date-like values, else ``None``."""
    # A bare year, a full date and a year-month, given as list or plain string.
    bare = _parse_year(["1979"])
    full_date = _parse_year(["1979-01-02"])
    year_month = _parse_year("2021-12")
    assert bare == 1979
    assert full_date == 1979
    assert year_month == 2021

    # Input without a year yields None.
    assert _parse_year(["no year"]) is None
    assert _parse_year(None) is None
def test_parse_track_number_handles_slash_form() -> None:
    """``_parse_track_number`` reads ``N`` and ``N/total`` forms, else ``None``."""
    # "number/total" keeps only the number; a bare number passes through.
    slash_in_list = _parse_track_number(["3/12"])
    bare_number = _parse_track_number(["7"])
    slash_in_string = _parse_track_number("1/10")
    assert slash_in_list == 3
    assert bare_number == 7
    assert slash_in_string == 1

    # A value starting with a letter, or no value at all, yields None.
    assert _parse_track_number(["A1"]) is None
    assert _parse_track_number(None) is None