feat(enrichment): tag-first metadata pipeline (§1D)
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled
Docker Build & Publish / build (push) Failing after 10m8s

Implements the §6.2 enrichment pipeline: embedded tags → Chromaprint
fingerprint → AcoustID lookup. Well-tagged files get correct
artist/album/title offline; the rest are identified via AcoustID
(which also yields a MusicBrainz recording id in one call).

- domain: AudioTags/Fingerprint/RecordingMatch value objects; ports
  AudioTagReader, AudioFingerprinter, AcoustIdClient; TrackRepository
  .apply_enrichment (gap-fill, never erases) + AlbumRepository.get_or_create
- infrastructure/metadata: MutagenTagReader, FpcalcFingerprinter,
  AcoustIdHttpClient (rich meta=recordings+releasegroups, throttled)
- application: MetadataEnrichmentService — tags preferred, AcoustID fills
  gaps; resolves artist/album; status enriched/failed; skips manual;
  every external step wrapped (graceful degradation)
- workers: enrich_task registered; enqueue_enrich is best-effort and
  deferred so the caller's txn commits before the worker reads the row
- wiring: upload enqueues after add; import returns imported_ids and
  enqueues post-commit (mid-scan would race the worker); manual
  POST /tracks/{id}/metadata/enrich endpoint
- deps: add mutagen (fpcalc/ffmpeg already in the image)

Tests: metadata service orchestration, AcoustID parser, tag helpers.
125 passed; mypy strict + ruff clean.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-09 13:04:02 +03:00
parent 48e3418c7f
commit c72d19599a
24 changed files with 1934 additions and 763 deletions
+16 -6
View File
@@ -9,7 +9,7 @@ must not abort the whole scan (graceful degradation).
import contextlib
import uuid
from dataclasses import dataclass
from dataclasses import dataclass, field
from app.core.logging import get_logger
from app.domain.ports import ArtistRepository, FileStorage, IndexableSource, TrackRepository
@@ -27,6 +27,9 @@ class ImportSummary:
imported: int
skipped: int
failed: int
# IDs of freshly imported tracks, for the caller to enqueue enrichment
# *after* its transaction commits (enqueuing mid-scan would race the worker).
imported_ids: list[uuid.UUID] = field(default_factory=list)
class LibraryImportService:
@@ -44,7 +47,8 @@ class LibraryImportService:
async def scan_and_import(
self, source: IndexableSource, *, added_by: uuid.UUID | None
) -> ImportSummary:
seen = imported = skipped = failed = 0
seen = skipped = failed = 0
imported_ids: list[uuid.UUID] = []
for file in source.scan():
seen += 1
try:
@@ -52,13 +56,18 @@ class LibraryImportService:
if existing is not None:
skipped += 1
continue
await self._import_one(source.name, file, added_by)
imported += 1
track_id = await self._import_one(source.name, file, added_by)
imported_ids.append(track_id)
except Exception:
failed += 1
log.warning("import_file_failed", source=source.name, source_id=file.source_id)
summary = ImportSummary(
source=source.name, seen=seen, imported=imported, skipped=skipped, failed=failed
source=source.name,
seen=seen,
imported=len(imported_ids),
skipped=skipped,
failed=failed,
imported_ids=imported_ids,
)
log.info(
"import_complete",
@@ -72,7 +81,7 @@ class LibraryImportService:
async def _import_one(
self, source_name: str, file: SourceFile, added_by: uuid.UUID | None
) -> None:
) -> uuid.UUID:
track_id = uuid.uuid4()
key = f"tracks/{str(track_id)[:2]}/{track_id}.{file.file_format}"
await self._storage.save_file(key, file.path)
@@ -94,3 +103,4 @@ class LibraryImportService:
with contextlib.suppress(Exception):
await self._storage.delete(key)
raise
return track_id
+174
View File
@@ -0,0 +1,174 @@
"""MetadataEnrichmentService — the §6.2 pipeline orchestrator.
Order (tag-first): embedded tags → Chromaprint fingerprint → AcoustID lookup.
Tags fix the common well-tagged case offline; AcoustID identifies the rest and
supplies a MusicBrainz id. The result updates the track and sets
``metadata_status`` to ``enriched`` (identity found) or ``failed`` (nothing).
Invariants (plan §6.2, CLAUDE.md):
- **Never touch ``manual``** — a user-edited track is returned untouched.
- **Graceful degradation** — every external step is wrapped; one failure (no
fpcalc, no API key, service down) degrades the result, never crashes.
- **Idempotent** — re-running only fills gaps; ``apply_enrichment`` never erases.
"""
import uuid
from dataclasses import dataclass
from app.core.logging import get_logger
from app.domain.entities.metadata import AudioTags, RecordingMatch
from app.domain.ports import (
AcoustIdClient,
AlbumRepository,
ArtistRepository,
AudioFingerprinter,
AudioTagReader,
FileStorage,
TrackRepository,
)
log = get_logger(__name__)
_UNKNOWN_ARTIST = "Unknown Artist"
@dataclass(frozen=True)
class EnrichmentResult:
track_id: uuid.UUID
status: str # "enriched" | "failed" | "skipped"
matched_mbid: str | None = None
class MetadataEnrichmentService:
    """Enrich one track from embedded tags and an AcoustID lookup.

    Embedded tags are preferred; the AcoustID match only fills what the tags
    leave empty. Tracks whose ``metadata_status`` is ``"manual"`` are skipped.
    Tag reading and identification are each wrapped so a failure in either
    degrades the result to ``None`` instead of raising.
    """

    def __init__(
        self,
        *,
        tracks: TrackRepository,
        artists: ArtistRepository,
        albums: AlbumRepository,
        storage: FileStorage,
        tag_reader: AudioTagReader,
        fingerprinter: AudioFingerprinter,
        acoustid: AcoustIdClient,
    ) -> None:
        """Store the collaborators; all arguments are keyword-only."""
        self._tracks = tracks
        self._artists = artists
        self._albums = albums
        self._storage = storage
        self._tag_reader = tag_reader
        self._fingerprinter = fingerprinter
        self._acoustid = acoustid

    async def enrich(self, track_id: uuid.UUID) -> EnrichmentResult:
        """Run the enrichment pipeline for ``track_id`` and persist the outcome.

        Returns:
            An ``EnrichmentResult`` whose status is ``"skipped"`` when the track
            does not exist or is marked ``"manual"``, ``"enriched"`` when a real
            artist name, an album or a MusicBrainz id was found, and
            ``"failed"`` otherwise.
        """
        track = await self._tracks.get_by_id(track_id)
        if track is None:
            log.info("enrich_track_missing", track_id=str(track_id))
            return EnrichmentResult(track_id=track_id, status="skipped")
        if track.metadata_status == "manual":
            log.info("enrich_skip_manual", track_id=str(track_id))
            return EnrichmentResult(track_id=track_id, status="skipped")

        tags = await self._read_local(track.storage_uri)
        match = await self._identify(track.storage_uri)

        # Merge sources: prefer embedded tags, fall back to the AcoustID match.
        # ``title`` falls back to the existing track title; the rest stay None
        # when neither source has them.
        tag_title = tags.title if tags else None
        tag_artist = tags.artist if tags else None
        tag_album = tags.album if tags else None

        title = _opt_str(tag_title, match.title if match else None) or track.title
        artist_name = _opt_str(tag_artist, match.artist if match else None)
        album_title = _opt_str(tag_album, match.album if match else None)
        year = _first_int(tags.year if tags else None, match.year if match else None)
        genre = tags.genre if tags else None
        track_number = tags.track_number if tags else None
        duration = _first_int(
            tags.duration_seconds if tags else None,
            track.duration_seconds,
        )
        bitrate = tags.bitrate if tags else None
        mbid = match.recording_mbid if match else None
        # NOTE(review): this is the match's ``acoustid`` value but it is stored
        # under ``acoustid_fingerprint`` below — confirm the column's meaning.
        acoustid_id = match.acoustid if match else None

        artist_id = await self._resolve_artist(artist_name, fallback=track.artist_id)
        album_id = await self._resolve_album(
            album_title,
            artist_id=artist_id,
            year=year,
            mbid=mbid,
        )

        # The placeholder artist is not an identification: ``_resolve_artist``
        # already treats it as "no name", so the status must agree with that.
        has_real_artist = bool(artist_name) and artist_name != _UNKNOWN_ARTIST
        identified = has_real_artist or album_id is not None or mbid is not None
        status = "enriched" if identified else "failed"

        await self._tracks.apply_enrichment(
            track_id,
            title=title,
            artist_id=artist_id,
            album_id=album_id,
            genre=genre,
            year=year,
            track_number=track_number,
            duration_seconds=duration,
            bitrate=bitrate,
            acoustid_fingerprint=acoustid_id,
            musicbrainz_id=mbid,
            metadata_status=status,
        )
        log.info("enrich_complete", track_id=str(track_id), status=status, mbid=mbid)
        return EnrichmentResult(track_id=track_id, status=status, matched_mbid=mbid)

    async def _read_local(self, storage_uri: str) -> AudioTags | None:
        """Read embedded tags from the stored file; ``None`` if the step fails."""
        try:
            async with self._storage.as_local_path(storage_uri) as path:
                return await self._tag_reader.read(path)
        except Exception as exc:
            # Deliberate best-effort: degrade to "no tags", but keep the cause
            # in the log so the failure can be diagnosed.
            log.warning(
                "enrich_tag_step_failed",
                storage_uri=storage_uri,
                error=repr(exc),
            )
            return None

    async def _identify(self, storage_uri: str) -> RecordingMatch | None:
        """Fingerprint the stored file and look it up via AcoustID.

        Returns ``None`` when either collaborator reports itself unavailable,
        when no fingerprint could be calculated, or when any step raises.
        """
        if not self._acoustid.is_available() or not self._fingerprinter.is_available():
            return None
        try:
            async with self._storage.as_local_path(storage_uri) as path:
                fingerprint = await self._fingerprinter.calculate(path)
                if fingerprint is None:
                    return None
                return await self._acoustid.lookup(fingerprint)
        except Exception as exc:
            # Deliberate best-effort: degrade to "no match", but keep the cause
            # in the log so the failure can be diagnosed.
            log.warning(
                "enrich_identify_step_failed",
                storage_uri=storage_uri,
                error=repr(exc),
            )
            return None

    async def _resolve_artist(self, name: str | None, *, fallback: uuid.UUID) -> uuid.UUID:
        """Return the id for ``name``, or ``fallback`` for an empty/placeholder name."""
        if not name or name == _UNKNOWN_ARTIST:
            return fallback
        artist = await self._artists.get_or_create(name)
        return artist.id

    async def _resolve_album(
        self,
        title: str | None,
        *,
        artist_id: uuid.UUID,
        year: int | None,
        mbid: str | None,
    ) -> uuid.UUID | None:
        """Return the id of the album for ``title``, or ``None`` without a title."""
        if not title:
            return None
        # NOTE(review): the caller passes the match's *recording* MBID here;
        # confirm the album repository expects that rather than a release or
        # release-group id.
        album = await self._albums.get_or_create(
            title=title,
            artist_id=artist_id,
            year=year,
            musicbrainz_id=mbid,
        )
        return album.id
def _opt_str(*values: str | None) -> str | None:
for value in values:
if value:
return value
return None
def _first_int(*values: int | None) -> int | None:
for value in values:
if value is not None:
return value
return None
+7 -1
View File
@@ -5,6 +5,7 @@ import hashlib
import os
import tempfile
import uuid
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Protocol
@@ -14,6 +15,8 @@ import anyio
from app.domain.entities.user import User
from app.domain.ports import ArtistRepository, FileStorage, TrackRepository
EnrichEnqueuer = Callable[[uuid.UUID], Awaitable[None]]
class UploadFileProtocol(Protocol):
filename: str | None
@@ -49,11 +52,13 @@ class UploadService:
artists: ArtistRepository,
storage: FileStorage,
tmp_dir: Path | None = None,
enqueue_enrich: EnrichEnqueuer | None = None,
) -> None:
self._tracks = tracks
self._artists = artists
self._storage = storage
self._tmp_dir = tmp_dir
self._enqueue_enrich = enqueue_enrich
async def handle_upload(
self,
@@ -105,7 +110,8 @@ class UploadService:
await self._storage.delete(key)
raise
# TODO(1D): enqueue metadata enrichment task
if self._enqueue_enrich is not None:
await self._enqueue_enrich(track.id)
return UploadResult(
track_id=track.id,