feat(sources): YouTube Music search + download pipeline (§1C/§1E)
Docker Build & Publish / build (push) Successful in 2m39s
Docker Build & Publish / push (push) Failing after 36s
Docker Build & Publish / Prune old image versions (push) Has been skipped

Pluggable fetch source: ytmusicapi search + yt-dlp download (cookies-file guard), DownloadJob entity/repo + DownloadService, download_task worker with exponential-backoff retries, and wired /search, /sources/{source}/search, and /downloads endpoints. Adds youtube_enabled/cookies config, yt-dlp+ytmusicapi deps, and the download_jobs.track_id migration. Snapshot also bundles in-progress storage/tracks/acoustid edits.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-14 14:04:33 +03:00
parent ea880edd57
commit 78007461e1
32 changed files with 2645 additions and 819 deletions
@@ -35,3 +35,9 @@ class DownloadJobModel(UUIDPrimaryKeyMixin, TimestampMixin, Base):
progress: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
retry_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
# Set once the download finishes and the track is imported — lets the UI
# link a completed job to its library track.
track_id: Mapped[uuid.UUID | None] = mapped_column(
ForeignKey("tracks.id", ondelete="SET NULL"),
nullable=True,
)
@@ -2,6 +2,9 @@
from app.infrastructure.db.repositories.album_repository import SqlAlchemyAlbumRepository
from app.infrastructure.db.repositories.artist_repository import SqlAlchemyArtistRepository
from app.infrastructure.db.repositories.download_job_repository import (
SqlAlchemyDownloadJobRepository,
)
from app.infrastructure.db.repositories.history_repository import SqlAlchemyHistoryRepository
from app.infrastructure.db.repositories.like_repository import SqlAlchemyLikeRepository
from app.infrastructure.db.repositories.playlist_repository import SqlAlchemyPlaylistRepository
@@ -14,6 +17,7 @@ from app.infrastructure.db.repositories.user_repository import SqlAlchemyUserRep
__all__ = [
"SqlAlchemyAlbumRepository",
"SqlAlchemyArtistRepository",
"SqlAlchemyDownloadJobRepository",
"SqlAlchemyHistoryRepository",
"SqlAlchemyLikeRepository",
"SqlAlchemyPlaylistRepository",
@@ -0,0 +1,164 @@
"""Download job repository — adapter over ``AsyncSession`` (plan §6.1)."""
import datetime as dt
import uuid
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
from app.domain.entities.download import DownloadJob
from app.infrastructure.db.models.download_job import DownloadJobModel
from app.infrastructure.db.models.enums import DownloadStatus
# Status values of jobs that have not finished yet. ``get_active_for_source``
# filters on these to find an in-flight download for the same source item.
_ACTIVE_STATUSES = tuple(
    state.value
    for state in (
        DownloadStatus.QUEUED,
        DownloadStatus.DOWNLOADING,
        DownloadStatus.ENRICHING,
    )
)
def _to_entity(row: DownloadJobModel) -> DownloadJob:
    """Copy a ``DownloadJobModel`` row into a ``DownloadJob`` entity.

    Each listed attribute is read from ``row`` and passed to ``DownloadJob``
    as a keyword argument of the same name.
    """
    mirrored = (
        "id",
        "source",
        "source_id",
        "query",
        "requested_by",
        "status",
        "progress",
        "error_message",
        "retry_count",
        "track_id",
        "created_at",
        "updated_at",
    )
    return DownloadJob(**{name: getattr(row, name) for name in mirrored})
class SqlAlchemyDownloadJobRepository:
    """Download-job persistence on top of an ``AsyncSession``.

    Mutating methods flush the session; nothing in this class commits.
    Rows are converted to ``DownloadJob`` entities via ``_to_entity`` before
    being returned.
    """

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    async def add(
        self,
        *,
        source: str,
        source_id: str | None,
        query: str | None,
        requested_by: uuid.UUID | None,
    ) -> DownloadJob:
        """Insert a job in the ``QUEUED`` state (progress 0, no retries) and return it."""
        model = DownloadJobModel(
            source=source,
            source_id=source_id,
            query=query,
            requested_by=requested_by,
            status=DownloadStatus.QUEUED.value,
            progress=0.0,
            retry_count=0,
        )
        session = self._session
        session.add(model)
        await session.flush()
        # Reload the row after the flush before mapping it to an entity.
        await session.refresh(model)
        return _to_entity(model)

    async def get_by_id(self, job_id: uuid.UUID) -> DownloadJob | None:
        """Return the job with primary key ``job_id``, or ``None`` if absent."""
        found = await self._session.get(DownloadJobModel, job_id)
        if found is None:
            return None
        return _to_entity(found)

    async def get_active_for_source(self, source: str, source_id: str) -> DownloadJob | None:
        """Return the newest job for ``(source, source_id)`` whose status is active.

        "Active" means the status is one of ``_ACTIVE_STATUSES``. Returns
        ``None`` when there is no such job.
        """
        newest_active = (
            select(DownloadJobModel)
            .where(
                DownloadJobModel.source == source,
                DownloadJobModel.source_id == source_id,
                DownloadJobModel.status.in_(_ACTIVE_STATUSES),
            )
            .order_by(DownloadJobModel.created_at.desc())
            .limit(1)
        )
        result = await self._session.execute(newest_active)
        found = result.scalar_one_or_none()
        return None if found is None else _to_entity(found)

    async def list(
        self,
        *,
        requested_by: uuid.UUID | None,
        status: str | None,
        limit: int,
        offset: int,
    ) -> list[DownloadJob]:
        """Return one page of jobs, newest first.

        ``requested_by`` and ``status`` are optional equality filters; a
        ``None`` value leaves that column unfiltered.
        """
        stmt = select(DownloadJobModel)
        optional_filters = (
            (requested_by, DownloadJobModel.requested_by),
            (status, DownloadJobModel.status),
        )
        for wanted, column in optional_filters:
            if wanted is not None:
                stmt = stmt.where(column == wanted)
        stmt = stmt.order_by(DownloadJobModel.created_at.desc())
        stmt = stmt.limit(limit).offset(offset)
        result = await self._session.execute(stmt)
        return [_to_entity(model) for model in result.scalars().all()]

    async def count(self, *, requested_by: uuid.UUID | None, status: str | None) -> int:
        """Count jobs using the same optional filters as ``list``."""
        stmt = select(func.count()).select_from(DownloadJobModel)
        optional_filters = (
            (requested_by, DownloadJobModel.requested_by),
            (status, DownloadJobModel.status),
        )
        for wanted, column in optional_filters:
            if wanted is not None:
                stmt = stmt.where(column == wanted)
        result = await self._session.execute(stmt)
        return result.scalar_one()

    async def set_status(
        self,
        job_id: uuid.UUID,
        *,
        status: str,
        error_message: str | None = None,
        track_id: uuid.UUID | None = None,
    ) -> None:
        """Move a job to ``status``; a missing job is silently ignored.

        ``error_message`` is overwritten on every call, so a transition made
        without a message clears a reason left by an earlier failed attempt.
        ``track_id`` is only written when one is supplied. Reaching ``DONE``
        also sets progress to 1.0.
        """
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return
        model.status = status
        model.error_message = error_message
        if track_id is not None:
            model.track_id = track_id
        finished = status == DownloadStatus.DONE.value
        if finished:
            model.progress = 1.0
        await self._session.flush()

    async def set_progress(self, job_id: uuid.UUID, progress: float) -> None:
        """Store ``progress`` clamped into ``[0.0, 1.0]``; no-op for a missing job."""
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return
        capped = min(1.0, progress)
        model.progress = max(0.0, capped)
        await self._session.flush()

    async def increment_retry(self, job_id: uuid.UUID) -> int:
        """Add one to the job's retry counter and return the new value.

        Returns 0 when the job does not exist.
        """
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return 0
        model.retry_count = model.retry_count + 1
        await self._session.flush()
        return model.retry_count

    async def delete(self, job_id: uuid.UUID) -> None:
        """Delete the job if it exists; otherwise do nothing."""
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return
        await self._session.delete(model)
        await self._session.flush()

    async def failure_rate(self, source: str, *, since: dt.datetime) -> float:
        """Return the fraction of ``source`` jobs created at/after ``since`` that failed.

        Both counts come from a single query. The result is 0.0 when no job
        matches the window.
        """
        failed_count = func.count().filter(
            DownloadJobModel.status == DownloadStatus.FAILED.value
        )
        stmt = (
            select(func.count(), failed_count)
            .select_from(DownloadJobModel)
            .where(
                DownloadJobModel.source == source,
                DownloadJobModel.created_at >= since,
            )
        )
        total, failed = (await self._session.execute(stmt)).one()
        if not total:
            return 0.0
        return failed / total
+1 -1
View File
@@ -78,7 +78,7 @@ class AcoustIdHttpClient:
)
resp.raise_for_status()
return resp.json() # type: ignore[no-any-return]
except (httpx.HTTPError, ValueError):
except httpx.HTTPError, ValueError:
log.warning("acoustid_lookup_failed")
return None
+27 -2
View File
@@ -2,16 +2,18 @@
Built from settings at the composition root. Only sources that are configured
are registered (e.g. ``local`` appears only when ``LOCAL_MEDIA_IMPORT_PATH`` is
set), so enumeration reflects what the instance can actually use.
set; ``youtube`` only when ``YOUTUBE_ENABLED``), so enumeration reflects what the
instance can actually use.
"""
from typing import cast
from app.core.config import Settings
from app.domain.errors import NotFoundError, ValidationError
from app.domain.ports import IndexableSource, SourceBackend
from app.domain.ports import FetchableSource, IndexableSource, SearchableSource, SourceBackend
from app.domain.sources import SourceInfo
from app.infrastructure.sources.local_folder import LocalFolderSource
from app.infrastructure.sources.youtube import YouTubeMusicSource
class SourceRegistry:
@@ -30,6 +32,22 @@ class SourceRegistry:
raise ValidationError(f"Source {name!r} cannot be indexed.")
return cast(IndexableSource, backend)
def searchable(self, name: str) -> SearchableSource:
    """Return the backend registered as ``name``, typed as searchable.

    The backend is looked up with ``self.get``. Capability is detected by
    the presence of a ``search`` attribute; without it a
    ``ValidationError`` is raised.
    """
    candidate = self.get(name)
    if hasattr(candidate, "search"):
        return cast(SearchableSource, candidate)
    raise ValidationError(f"Source {name!r} cannot be searched.")
def fetchable(self, name: str) -> FetchableSource:
    """Return the backend registered as ``name``, typed as fetchable.

    The backend is looked up with ``self.get``. Capability is detected by
    the presence of a ``fetch`` attribute; without it a ``ValidationError``
    is raised.
    """
    candidate = self.get(name)
    if hasattr(candidate, "fetch"):
        return cast(FetchableSource, candidate)
    raise ValidationError(f"Source {name!r} cannot download.")
def searchables(self) -> list[SearchableSource]:
    """Return every registered backend that has a ``search`` attribute.

    Intended for cross-source search; order follows ``self._by_name``.
    """
    capable = (backend for backend in self._by_name.values() if hasattr(backend, "search"))
    return [cast(SearchableSource, backend) for backend in capable]
def infos(self) -> list[SourceInfo]:
    """Return ``info()`` of every registered backend, in ``self._by_name`` order."""
    registered = self._by_name.values()
    return [each.info() for each in registered]
@@ -38,4 +56,11 @@ def build_source_registry(settings: Settings) -> SourceRegistry:
backends: list[SourceBackend] = []
if settings.local_media_import_path is not None:
backends.append(LocalFolderSource(settings.local_media_import_path))
if settings.youtube_enabled:
backends.append(
YouTubeMusicSource(
cookies_path=settings.youtube_cookies_path,
tmp_dir=settings.upload_tmp_dir,
)
)
return SourceRegistry(backends)
+207
View File
@@ -0,0 +1,207 @@
"""``youtube`` source — YouTube Music search + download (plan §5).
A *fetch* source: it searches YouTube Music (via ``ytmusicapi``, which returns
clean song/artist/album/duration rows) and downloads the chosen item with
``yt-dlp``. The two libraries are synchronous, so every call is bounced to a
worker thread (``anyio.to_thread``); the sync yt-dlp progress hook bridges back
to the async progress callback via ``anyio.from_thread``.
Both libraries are optional dependencies — if either is missing the source is
simply *unavailable* (it never crashes import or the registry; graceful
degradation per CLAUDE.md). The audio stream is stored **as-is** (YouTube serves
lossy Opus/AAC; re-encoding would be lossy→lossy, plan §6.6).
``source_id`` is the YouTube ``videoId`` — stable, so a re-download of the same
id is idempotent and dedups against an existing track.
"""
import functools
import tempfile
from collections.abc import Callable
from pathlib import Path
from typing import Any
import anyio
from app.core.logging import get_logger
from app.domain.ports import ProgressCallback
from app.domain.sources import (
KIND_FETCH,
DownloadResult,
RawMetadata,
SearchResult,
SourceInfo,
)
from app.infrastructure.db.models.enums import TrackSource
log = get_logger(__name__)
# Functions a caller may inject for testing (defaults do the real library work).
# ``SearchFn`` is invoked as ``(query, limit)`` and returns the raw result rows;
# each row is later mapped by ``YouTubeMusicSource._to_result``.
SearchFn = Callable[[str, int], list[dict[str, Any]]]
# (video_id, tmp_dir, progress_hook, cookies_path) -> normalized download dict
# ``YouTubeMusicSource.fetch`` indexes that dict with "filepath" and
# "file_format" (both required) and reads "bitrate" and "title" via ``.get``.
DownloadFn = Callable[[str, Path, Callable[[dict[str, Any]], None], Path | None], dict[str, Any]]
def _libs_available() -> bool:
try:
import yt_dlp # noqa: F401
import ytmusicapi # noqa: F401
except ImportError:
return False
return True
def _watch_url(video_id: str) -> str:
return f"https://music.youtube.com/watch?v={video_id}"
class YouTubeMusicSource:
    """YouTube Music backend offering search and download.

    Implements :class:`app.domain.ports.SearchableSource` and
    :class:`~app.domain.ports.FetchableSource`. The search and download
    functions are synchronous and are run in worker threads.
    """

    name = TrackSource.YOUTUBE.value

    def __init__(
        self,
        *,
        cookies_path: Path | None = None,
        tmp_dir: Path | None = None,
        search_fn: SearchFn | None = None,
        download_fn: DownloadFn | None = None,
    ) -> None:
        self._cookies_path = cookies_path
        self._tmp_dir = tmp_dir
        self._search_fn = search_fn if search_fn else _default_search
        self._download_fn = download_fn if download_fn else _default_download
        # Injecting either function marks the instance as self-contained, so
        # availability no longer depends on the optional libraries.
        self._injected = not (search_fn is None and download_fn is None)

    def info(self) -> SourceInfo:
        """Describe this source, including its current availability."""
        usable = self.is_available()
        return SourceInfo(
            name=self.name,
            label="YouTube Music",
            kind=KIND_FETCH,
            available=usable,
        )

    def is_available(self) -> bool:
        """Return ``True`` when a function was injected or both libraries import."""
        if self._injected:
            return True
        return _libs_available()

    async def search(self, query: str, *, limit: int) -> list[SearchResult]:
        """Search for ``query`` and return the rows that map to a result.

        A blank query returns ``[]`` without calling the search function.
        Any exception raised by the search call is logged as a warning and
        also yields ``[]`` (graceful degradation).
        """
        needle = query.strip()
        if not needle:
            return []
        try:
            lookup = functools.partial(self._search_fn, needle, limit)
            rows = await anyio.to_thread.run_sync(lookup)
        except Exception:
            # No results / service down → degrade to empty (plan §5, CLAUDE.md).
            log.warning("ytm_search_failed", query=needle)
            return []
        results: list[SearchResult] = []
        for row in rows:
            mapped = self._to_result(row)
            if mapped is not None:
                results.append(mapped)
        return results

    async def fetch(
        self, source_id: str, *, on_progress: ProgressCallback | None = None
    ) -> DownloadResult:
        """Download ``source_id`` in a worker thread and describe the file.

        Files go to the configured tmp dir, or the system temp dir when none
        was given. Progress reported through ``on_progress`` never exceeds
        0.99. Exceptions from the download function are not caught here.
        """
        scratch_dir = self._tmp_dir or Path(tempfile.gettempdir())

        def report(event: dict[str, Any]) -> None:
            if on_progress is None:
                return
            if event.get("status") != "downloading":
                return
            expected = event.get("total_bytes") or event.get("total_bytes_estimate")
            received = event.get("downloaded_bytes")
            if not expected or received is None:
                return
            # Cap below 1.0 — the job only reaches 1.0 once stored + imported.
            fraction = min(received / expected, 0.99)
            # Bridge sync hook (worker thread) → async callback (event loop).
            anyio.from_thread.run(on_progress, fraction)

        job = functools.partial(
            self._download_fn, source_id, scratch_dir, report, self._cookies_path
        )
        info = await anyio.to_thread.run_sync(job)
        path = Path(info["filepath"])
        stat = await anyio.Path(path).stat()
        return DownloadResult(
            source_id=source_id,
            path=path,
            file_format=info["file_format"],
            file_size=stat.st_size,
            bitrate=info.get("bitrate"),
            suggested_title=info.get("title") or source_id,
        )

    async def get_metadata(self, source_id: str) -> RawMetadata | None:
        """Always return ``None``: no dedicated metadata lookup is implemented.

        Search results already carry a usable title/artist and canonical
        metadata comes from enrichment (§6.2), so this refinement is skipped.
        """
        return None

    def _to_result(self, row: dict[str, Any]) -> SearchResult | None:
        """Map one raw search row to a ``SearchResult``.

        Returns ``None`` for a row without a ``videoId``. Artist names are
        joined with ``", "``; the last thumbnail in the list is used.
        """
        video_id = row.get("videoId")
        if not video_id:
            return None  # non-playable row (e.g. a video without audio id)

        names = [entry["name"] for entry in (row.get("artists") or []) if entry.get("name")]
        artist = ", ".join(names) or None

        album_field = row.get("album")
        album = album_field.get("name") if isinstance(album_field, dict) else None

        thumbnails = row.get("thumbnails") or []
        if thumbnails:
            thumbnail = thumbnails[-1].get("url")
        else:
            thumbnail = None

        return SearchResult(
            source=self.name,
            source_id=str(video_id),
            title=row.get("title") or "Unknown",
            artist=artist,
            album=album,
            duration_seconds=row.get("duration_seconds"),
            thumbnail_url=thumbnail,
            raw=row,
        )
def _default_search(query: str, limit: int) -> list[dict[str, Any]]:
    """Search YouTube Music for songs via ``ytmusicapi``.

    Runs in a worker thread. The import is local so a missing library only
    fails when a search is actually attempted. The rows are sliced to
    ``limit`` after the call.
    """
    from ytmusicapi import YTMusic

    # Unauthenticated client: public search needs no login.
    client = YTMusic()
    hits: list[dict[str, Any]] = client.search(query, filter="songs", limit=limit)
    return hits[:limit]
def _default_download(
    video_id: str,
    tmp_dir: Path,
    progress_hook: Callable[[dict[str, Any]], None],
    cookies_path: Path | None,
) -> dict[str, Any]:
    """Download the best audio stream for ``video_id`` with yt-dlp.

    Runs in a worker thread. Stores the original stream (no transcode —
    plan §6.3/§6.6).

    Args:
        video_id: YouTube video id to download.
        tmp_dir: Directory the file is written to, as ``<id>.<ext>``.
        progress_hook: Callback registered in yt-dlp's ``progress_hooks``.
        cookies_path: Optional cookies file; used only if it exists on disk.

    Returns:
        A dict with ``filepath`` (``Path``), ``file_format`` (lower-cased
        extension, ``"m4a"`` if the file has none), ``bitrate`` (``int`` or
        ``None``) and ``title``.

    Raises:
        Whatever yt-dlp raises for a failed extraction or download; nothing
        is caught here.
    """
    from yt_dlp import YoutubeDL

    opts: dict[str, Any] = {
        "format": "bestaudio/best",
        "outtmpl": str(tmp_dir / "%(id)s.%(ext)s"),
        "quiet": True,
        "no_warnings": True,
        "noprogress": True,
        # Exactly one item is expected per job: never expand a URL that also
        # carries a playlist reference into a multi-item download.
        "noplaylist": True,
        "progress_hooks": [progress_hook],
    }
    # Use cookies only when the file is actually present: the path can be set
    # unconditionally (e.g. a mounted volume that may be empty) and downloads
    # still work without it — cookies just unlock age/region-restricted items.
    if cookies_path is not None and cookies_path.is_file():
        opts["cookiefile"] = str(cookies_path)

    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(_watch_url(video_id), download=True)
        # Prefer the path yt-dlp reports for the finished download; fall back
        # to the template-derived name when that entry is missing.
        finished = info.get("requested_downloads") or []
        reported = finished[0].get("filepath") if finished else None
        filepath = Path(reported) if reported else Path(ydl.prepare_filename(info))

    abr = info.get("abr")
    return {
        "filepath": filepath,
        "file_format": filepath.suffix.lstrip(".").lower() or "m4a",
        "bitrate": int(abr) if abr else None,
        "title": info.get("title"),
    }