feat(enrichment): record status/errors and trust high-confidence AcoustID
Docker Build & Publish / build (push) Has been cancelled
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled

Two related gaps surfaced from "uploaded a track, nothing changed / no status":

- A track could stay stuck on `pending` forever (an unexpected worker error
  rolled back the run without recording anything), and `failed` carried no
  reason. Add `tracks.metadata_error` + `tracks.enriched_at` (migration), stamp
  the outcome in apply_enrichment, add TrackRepository.mark_enrichment_failed,
wrap the enrich_track worker task to persist crashes as `failed` in a fresh session, and emit a
  human-readable no-match reason. Expose metadata_error/enriched_at in TrackOut.

- The tag-first merge let junk embedded tags (e.g. "Music Track"/"Sound_13958")
  override even a 0.99-confidence AcoustID match. Add acoustid_trust_score
  (default 0.85): above it the acoustic identity wins for title/artist/album/
  year, tags are fallback; below it, tag-first as before.

Add a license-free real-file fixture (Scarlet Fire / Otis McDonald) whose junk
tags AcoustID overrides, with an always-on tag-reader test plus fpcalc/AcoustID/
network-gated identity + full-pipeline tests (skip on host, run in the container).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-13 13:29:08 +03:00
parent 30cb8901f2
commit 73d7da440f
17 changed files with 468 additions and 33 deletions
+2
View File
@@ -17,6 +17,8 @@ class TrackOut(BaseModel):
file_format: str
file_size: int
metadata_status: str
metadata_error: str | None
enriched_at: dt.datetime | None
source: str
has_cover: bool
created_at: dt.datetime
+2
View File
@@ -42,6 +42,8 @@ async def _build_track_out(
file_format=t.file_format,
file_size=t.file_size,
metadata_status=t.metadata_status,
metadata_error=t.metadata_error,
enriched_at=t.enriched_at,
source=t.source,
has_cover=bool(t.album_id and albums.get(t.album_id) and albums[t.album_id].cover_path),
created_at=t.created_at,
+37 -7
View File
@@ -58,6 +58,7 @@ class MetadataEnrichmentService:
acoustid: AcoustIdClient,
cover_extractor: CoverArtExtractor | None = None,
cover_provider: CoverArtProvider | None = None,
acoustid_trust_score: float = 0.85,
) -> None:
self._tracks = tracks
self._artists = artists
@@ -68,6 +69,7 @@ class MetadataEnrichmentService:
self._acoustid = acoustid
self._cover_extractor = cover_extractor
self._cover_provider = cover_provider
self._acoustid_trust_score = acoustid_trust_score
async def enrich(self, track_id: uuid.UUID) -> EnrichmentResult:
track = await self._tracks.get_by_id(track_id)
@@ -81,16 +83,31 @@ class MetadataEnrichmentService:
tags = await self._read_local(track.storage_uri)
match = await self._identify(track.storage_uri)
# Merge sources: prefer embedded tags, fall back to the AcoustID match.
# ``title`` is guaranteed non-None by the existing track title; the rest
# stay None when neither source has them.
# Merge order is tag-first by default — embedded tags fix the common
# well-tagged offline case. But a *high-confidence* AcoustID match is the
# more trustworthy identity (downloaded files routinely carry junk tags
# like "Music Track"/"Sound_12345"), so above the trust threshold the
# acoustic match wins for the identity fields and tags become fallback.
tag_title = tags.title if tags else None
tag_artist = tags.artist if tags else None
tag_album = tags.album if tags else None
title = _opt_str(tag_title, match.title if match else None) or track.title
artist_name = _opt_str(tag_artist, match.artist if match else None)
album_title = _opt_str(tag_album, match.album if match else None)
year = _first_int(tags.year if tags else None, match.year if match else None)
match_title = match.title if match else None
match_artist = match.artist if match else None
match_album = match.album if match else None
match_year = match.year if match else None
tag_year = tags.year if tags else None
trust_match = match is not None and match.score >= self._acoustid_trust_score
if trust_match:
title = _opt_str(match_title, tag_title) or track.title
artist_name = _opt_str(match_artist, tag_artist)
album_title = _opt_str(match_album, tag_album)
year = _first_int(match_year, tag_year)
else:
title = _opt_str(tag_title, match_title) or track.title
artist_name = _opt_str(tag_artist, match_artist)
album_title = _opt_str(tag_album, match_album)
year = _first_int(tag_year, match_year)
genre = tags.genre if tags else None
track_number = tags.track_number if tags else None
duration = _first_int(
@@ -114,6 +131,9 @@ class MetadataEnrichmentService:
identified = bool(artist_name) or album_id is not None or mbid is not None
status = "enriched" if identified else "failed"
# On a clean "no identity" outcome, record *why* so the UI shows a reason
# rather than a bare "failed". A successful run clears any prior error.
metadata_error = None if identified else self._no_match_reason()
await self._tracks.apply_enrichment(
track_id,
@@ -128,10 +148,20 @@ class MetadataEnrichmentService:
acoustid_fingerprint=acoustid_id,
musicbrainz_id=mbid,
metadata_status=status,
metadata_error=metadata_error,
)
log.info("enrich_complete", track_id=str(track_id), status=status, mbid=mbid)
return EnrichmentResult(track_id=track_id, status=status, matched_mbid=mbid)
def _no_match_reason(self) -> str:
"""Explain a ``failed`` (no-identity) run in terms a user can act on:
which optional identification step was unavailable, if any."""
if not self._fingerprinter.is_available():
return "No metadata match: audio fingerprinting (fpcalc) is unavailable."
if not self._acoustid.is_available():
return "No metadata match: AcoustID lookup is unavailable (no API key)."
return "No metadata match found in tags or AcoustID."
async def _read_local(self, storage_uri: str) -> AudioTags | None:
try:
async with self._storage.as_local_path(storage_uri) as path:
+4
View File
@@ -90,6 +90,10 @@ class Settings(BaseSettings):
ml_service_url: str | None = None
acoustid_api_key: SecretStr | None = None
acoustid_api_url: str = "https://api.acoustid.org/v2/lookup"
# Above this AcoustID match score, trust the acoustic identification over
# embedded file tags (which are frequently junk on downloaded files —
# e.g. "Music Track" / "Sound_12345"). Below it, keep the tag-first merge.
acoustid_trust_score: float = 0.85
# MusicBrainz/AcoustID require a meaningful User-Agent identifying the
# application and a way to contact its maintainer (see
# https://musicbrainz.org/doc/XML_Web_Service/Rate_Limiting). Self-hosted
+2
View File
@@ -28,5 +28,7 @@ class Track:
genre: str | None
year: int | None
metadata_status: str
metadata_error: str | None
enriched_at: dt.datetime | None
created_at: dt.datetime
updated_at: dt.datetime
+9 -2
View File
@@ -172,11 +172,18 @@ class TrackRepository(Protocol):
acoustid_fingerprint: str | None,
musicbrainz_id: str | None,
metadata_status: str,
metadata_error: str | None = None,
) -> Track:
"""Persist auto-enrichment results. Nullable fields are filled only when
a non-``None`` value is supplied (re-enrich never erases prior data);
``title``/``artist_id``/``metadata_status`` are always written. Callers
must not invoke this for ``metadata_status == 'manual'`` tracks."""
``title``/``artist_id``/``metadata_status`` are always written, and the
run's outcome (``metadata_error`` + completion time) is always stamped.
Callers must not invoke this for ``metadata_status == 'manual'`` tracks."""
...
async def mark_enrichment_failed(self, track_id: uuid.UUID, *, error: str) -> None:
    """Record that an enrichment run crashed unexpectedly.

    Sets the track's status to ``failed`` and stores ``error`` as the reason.

    Args:
        track_id: Identifier of the track whose enrichment run crashed.
        error: Human-readable failure reason to persist (keyword-only).

    A no-op for ``manual`` or missing tracks.
    """
    ...
+12 -1
View File
@@ -6,9 +6,10 @@
imports/downloads stay idempotent (plan §4, §6.1).
"""
import datetime as dt
import uuid
from sqlalchemy import ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy import DateTime, ForeignKey, Integer, String, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column
from app.infrastructure.db.base import Base
@@ -63,6 +64,16 @@ class TrackModel(UUIDPrimaryKeyMixin, TimestampMixin, Base):
nullable=False,
default=MetadataStatus.PENDING.value,
)
# Human-readable reason the last enrichment run set ``failed`` (no match, or
# an unexpected worker error). ``None`` once a run succeeds. Surfaced in the
# UI so a stuck/failed track is diagnosable, not silent.
metadata_error: Mapped[str | None] = mapped_column(String(2048), nullable=True)
# When the last enrichment run finished (success or failure). ``None`` while
# still ``pending`` — lets the UI distinguish "queued/running" from "done".
enriched_at: Mapped[dt.datetime | None] = mapped_column(
DateTime(timezone=True),
nullable=True,
)
added_by: Mapped[uuid.UUID | None] = mapped_column(
ForeignKey("users.id", ondelete="SET NULL"),
@@ -39,6 +39,8 @@ def _track_to_entity(row: TrackModel) -> Track:
genre=row.genre,
year=row.year,
metadata_status=row.metadata_status,
metadata_error=row.metadata_error,
enriched_at=row.enriched_at,
created_at=row.created_at,
updated_at=row.updated_at,
)
@@ -38,6 +38,8 @@ def _track_to_entity(row: TrackModel) -> Track:
genre=row.genre,
year=row.year,
metadata_status=row.metadata_status,
metadata_error=row.metadata_error,
enriched_at=row.enriched_at,
created_at=row.created_at,
updated_at=row.updated_at,
)
@@ -1,5 +1,6 @@
"""Track repository — adapter over ``AsyncSession``."""
import datetime as dt
import uuid
from sqlalchemy import func, select
@@ -26,6 +27,8 @@ def _to_entity(row: TrackModel) -> Track:
genre=row.genre,
year=row.year,
metadata_status=row.metadata_status,
metadata_error=row.metadata_error,
enriched_at=row.enriched_at,
created_at=row.created_at,
updated_at=row.updated_at,
)
@@ -189,6 +192,7 @@ class SqlAlchemyTrackRepository:
acoustid_fingerprint: str | None,
musicbrainz_id: str | None,
metadata_status: str,
metadata_error: str | None = None,
) -> Track:
row = await self._session.get(TrackModel, track_id)
if row is None:
@@ -197,6 +201,10 @@ class SqlAlchemyTrackRepository:
row.title = title
row.artist_id = artist_id
row.metadata_status = metadata_status
# A finished run always stamps outcome: clear/set the reason and mark the
# completion time so the UI can tell "still pending" from "done/failed".
row.metadata_error = metadata_error
row.enriched_at = dt.datetime.now(dt.UTC)
# Nullable extras: fill gaps only — never erase data a prior run found.
if album_id is not None:
row.album_id = album_id
@@ -217,3 +225,16 @@ class SqlAlchemyTrackRepository:
await self._session.flush()
await self._session.refresh(row)
return _to_entity(row)
async def mark_enrichment_failed(self, track_id: uuid.UUID, *, error: str) -> None:
    """Record that an enrichment run crashed (unexpected exception).

    Sets ``metadata_status`` to ``failed``, stores ``error`` as the reason,
    stamps ``enriched_at`` with the current UTC time, then flushes. The write
    goes through this repository's own session, so the caller must supply a
    fresh one: the crashed run's transaction has already been rolled back.

    ``error`` is clipped to the width of the ``metadata_error`` column. An
    oversized exception message would otherwise be rejected by backends that
    enforce VARCHAR length, and the failure record itself would be lost.

    Args:
        track_id: Identifier of the track whose enrichment run crashed.
        error: Human-readable failure reason (keyword-only).

    A missing track, or one whose status is ``manual``, is a clean no-op.
    """
    # Mirrors ``TrackModel.metadata_error`` (``String(2048)``).
    max_error_length = 2048
    row = await self._session.get(TrackModel, track_id)
    if row is None or row.metadata_status == "manual":
        return
    row.metadata_status = "failed"
    row.metadata_error = error[:max_error_length]
    row.enriched_at = dt.datetime.now(dt.UTC)
    await self._session.flush()
+26 -13
View File
@@ -42,19 +42,32 @@ async def enrich_track(_ctx: dict[str, Any], *, track_id: str) -> dict[str, Any]
base_url=settings.coverart_base_url,
)
async with session_scope() as session:
service = MetadataEnrichmentService(
tracks=SqlAlchemyTrackRepository(session),
artists=SqlAlchemyArtistRepository(session),
albums=SqlAlchemyAlbumRepository(session),
storage=get_file_storage(),
tag_reader=MutagenTagReader(),
fingerprinter=FpcalcFingerprinter(settings.fpcalc_path),
acoustid=acoustid,
cover_extractor=MutagenCoverExtractor(),
cover_provider=cover_provider,
)
result = await service.enrich(uuid.UUID(track_id))
tid = uuid.UUID(track_id)
try:
async with session_scope() as session:
service = MetadataEnrichmentService(
tracks=SqlAlchemyTrackRepository(session),
artists=SqlAlchemyArtistRepository(session),
albums=SqlAlchemyAlbumRepository(session),
storage=get_file_storage(),
tag_reader=MutagenTagReader(),
fingerprinter=FpcalcFingerprinter(settings.fpcalc_path),
acoustid=acoustid,
cover_extractor=MutagenCoverExtractor(),
cover_provider=cover_provider,
acoustid_trust_score=settings.acoustid_trust_score,
)
result = await service.enrich(tid)
except Exception as exc:
# The run's own transaction rolled back, leaving the track stuck at
# ``pending``. Record the failure in a fresh session so the UI shows a
# ``failed`` status with a reason instead of a silent, endless spinner.
log.exception("enrich_failed", track_id=track_id)
async with session_scope() as session:
await SqlAlchemyTrackRepository(session).mark_enrichment_failed(
tid, error=f"Enrichment crashed: {type(exc).__name__}: {exc}"
)
return {"track_id": track_id, "status": "failed", "mbid": None}
return {
"track_id": str(result.track_id),