feat(sources): YouTube Music search + download pipeline (§1C/§1E)

Pluggable fetch source: ytmusicapi search + yt-dlp download (cookies-file guard), DownloadJob entity/repo + DownloadService, download_task worker with exponential-backoff retries, and wired /search, /sources/{source}/search, and /downloads endpoints. Adds youtube_enabled/cookies config, yt-dlp+ytmusicapi deps, and the download_jobs.track_id migration. Snapshot also bundles in-progress storage/tracks/acoustid edits.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-14 14:04:33 +03:00
parent ea880edd57
commit 78007461e1
32 changed files with 2645 additions and 819 deletions
@@ -35,3 +35,9 @@ class DownloadJobModel(UUIDPrimaryKeyMixin, TimestampMixin, Base):
    progress: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    retry_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
+    # Set once the download finishes and the track is imported — lets the UI
+    # link a completed job to its library track.
+    track_id: Mapped[uuid.UUID | None] = mapped_column(
+        ForeignKey("tracks.id", ondelete="SET NULL"),
+        nullable=True,
+    )
@@ -2,6 +2,9 @@

 from app.infrastructure.db.repositories.album_repository import SqlAlchemyAlbumRepository
 from app.infrastructure.db.repositories.artist_repository import SqlAlchemyArtistRepository
+from app.infrastructure.db.repositories.download_job_repository import (
+    SqlAlchemyDownloadJobRepository,
+)
 from app.infrastructure.db.repositories.history_repository import SqlAlchemyHistoryRepository
 from app.infrastructure.db.repositories.like_repository import SqlAlchemyLikeRepository
 from app.infrastructure.db.repositories.playlist_repository import SqlAlchemyPlaylistRepository
@@ -14,6 +17,7 @@ from app.infrastructure.db.repositories.user_repository import SqlAlchemyUserRep
 __all__ = [
    "SqlAlchemyAlbumRepository",
    "SqlAlchemyArtistRepository",
+    "SqlAlchemyDownloadJobRepository",
    "SqlAlchemyHistoryRepository",
    "SqlAlchemyLikeRepository",
    "SqlAlchemyPlaylistRepository",
@@ -0,0 +1,164 @@
+"""Download job repository — adapter over ``AsyncSession`` (plan §6.1)."""
+
+import datetime as dt
+import uuid
+
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.domain.entities.download import DownloadJob
+from app.infrastructure.db.models.download_job import DownloadJobModel
+from app.infrastructure.db.models.enums import DownloadStatus
+
+# Jobs that are not yet finished — used to dedup an in-flight download.
+# Holds the enums' ``.value``s; the tuple is passed to
+# ``DownloadJobModel.status.in_(...)`` in ``get_active_for_source``.
+_ACTIVE_STATUSES = (
+    DownloadStatus.QUEUED.value,
+    DownloadStatus.DOWNLOADING.value,
+    DownloadStatus.ENRICHING.value,
+)
+
+
+def _to_entity(row: DownloadJobModel) -> DownloadJob:
+    """Map an ORM row to the ``DownloadJob`` domain entity.
+
+    A plain field-by-field copy: every attribute is passed through unchanged,
+    so repository methods hand back entities instead of ORM model objects.
+    """
+    return DownloadJob(
+        id=row.id,
+        source=row.source,
+        source_id=row.source_id,
+        query=row.query,
+        requested_by=row.requested_by,
+        status=row.status,
+        progress=row.progress,
+        error_message=row.error_message,
+        retry_count=row.retry_count,
+        track_id=row.track_id,
+        created_at=row.created_at,
+        updated_at=row.updated_at,
+    )
+
+
+class SqlAlchemyDownloadJobRepository:
+    """Download-job persistence on top of a SQLAlchemy ``AsyncSession``.
+
+    Rows are read and written as ``DownloadJobModel`` and handed back to
+    callers as ``DownloadJob`` entities. Mutating methods ``flush()`` but never
+    commit. NOTE(review): committing is presumably the caller's job — confirm.
+    """
+
+    def __init__(self, session: AsyncSession) -> None:
+        self._session = session
+
+    async def add(
+        self,
+        *,
+        source: str,
+        source_id: str | None,
+        query: str | None,
+        requested_by: uuid.UUID | None,
+    ) -> DownloadJob:
+        """Insert a new job and return it as an entity.
+
+        The row always starts as ``QUEUED`` with ``progress=0.0`` and
+        ``retry_count=0``; only the four keyword arguments come from the caller.
+        """
+        row = DownloadJobModel(
+            source=source,
+            source_id=source_id,
+            query=query,
+            requested_by=requested_by,
+            status=DownloadStatus.QUEUED.value,
+            progress=0.0,
+            retry_count=0,
+        )
+        self._session.add(row)
+        await self._session.flush()
+        # Reload the row after the flush — presumably to pick up generated
+        # columns (id, timestamps) before mapping; confirm against the mixins.
+        await self._session.refresh(row)
+        return _to_entity(row)
+
+    async def get_by_id(self, job_id: uuid.UUID) -> DownloadJob | None:
+        """Return the job with primary key ``job_id``, or ``None`` if absent."""
+        row = await self._session.get(DownloadJobModel, job_id)
+        return _to_entity(row) if row is not None else None
+
+    async def get_active_for_source(self, source: str, source_id: str) -> DownloadJob | None:
+        """Return the newest unfinished job for ``(source, source_id)``.
+
+        "Unfinished" means the status is one of ``_ACTIVE_STATUSES``. When
+        several match, the most recently created one wins; ``None`` if no
+        such job exists.
+        """
+        row = (
+            await self._session.execute(
+                select(DownloadJobModel)
+                .where(
+                    DownloadJobModel.source == source,
+                    DownloadJobModel.source_id == source_id,
+                    DownloadJobModel.status.in_(_ACTIVE_STATUSES),
+                )
+                .order_by(DownloadJobModel.created_at.desc())
+                # limit(1) guarantees scalar_one_or_none() never sees 2+ rows.
+                .limit(1)
+            )
+        ).scalar_one_or_none()
+        return _to_entity(row) if row is not None else None
+
+    # NOTE(review): this method name shadows the builtin ``list`` for the rest
+    # of the class body; a later method annotated ``list[...]`` here could
+    # resolve to this function instead — none does today.
+    async def list(
+        self,
+        *,
+        requested_by: uuid.UUID | None,
+        status: str | None,
+        limit: int,
+        offset: int,
+    ) -> list[DownloadJob]:
+        """Return one page of jobs, newest first.
+
+        ``requested_by`` and ``status`` are optional filters: passing ``None``
+        skips that filter rather than matching NULL.
+        """
+        stmt = select(DownloadJobModel)
+        if requested_by is not None:
+            stmt = stmt.where(DownloadJobModel.requested_by == requested_by)
+        if status is not None:
+            stmt = stmt.where(DownloadJobModel.status == status)
+        stmt = stmt.order_by(DownloadJobModel.created_at.desc()).limit(limit).offset(offset)
+        rows = (await self._session.execute(stmt)).scalars().all()
+        return [_to_entity(r) for r in rows]
+
+    async def count(self, *, requested_by: uuid.UUID | None, status: str | None) -> int:
+        """Count jobs using the same optional filters as :meth:`list`."""
+        stmt = select(func.count()).select_from(DownloadJobModel)
+        if requested_by is not None:
+            stmt = stmt.where(DownloadJobModel.requested_by == requested_by)
+        if status is not None:
+            stmt = stmt.where(DownloadJobModel.status == status)
+        return (await self._session.execute(stmt)).scalar_one()
+
+    async def set_status(
+        self,
+        job_id: uuid.UUID,
+        *,
+        status: str,
+        error_message: str | None = None,
+        track_id: uuid.UUID | None = None,
+    ) -> None:
+        """Move a job to ``status``; silently does nothing if the job is gone.
+
+        ``track_id`` is only written when provided (a ``None`` never clears an
+        existing link). Reaching ``DONE`` also forces ``progress`` to 1.0.
+        """
+        row = await self._session.get(DownloadJobModel, job_id)
+        if row is None:
+            return
+        row.status = status
+        # ``error_message`` is always written: a successful transition clears a
+        # stale reason from an earlier failed attempt.
+        row.error_message = error_message
+        if track_id is not None:
+            row.track_id = track_id
+        if status == DownloadStatus.DONE.value:
+            row.progress = 1.0
+        await self._session.flush()
+
+    async def set_progress(self, job_id: uuid.UUID, progress: float) -> None:
+        """Store ``progress`` clamped to ``[0.0, 1.0]``; no-op for a missing job."""
+        row = await self._session.get(DownloadJobModel, job_id)
+        if row is None:
+            return
+        row.progress = max(0.0, min(1.0, progress))
+        await self._session.flush()
+
+    async def increment_retry(self, job_id: uuid.UUID) -> int:
+        """Bump ``retry_count`` by one and return the new value.
+
+        Returns 0 when the job does not exist. The increment is a
+        read-modify-write on the loaded row, not a SQL-side ``UPDATE``.
+        """
+        row = await self._session.get(DownloadJobModel, job_id)
+        if row is None:
+            return 0
+        row.retry_count += 1
+        await self._session.flush()
+        return row.retry_count
+
+    async def delete(self, job_id: uuid.UUID) -> None:
+        """Delete the job if it exists; a missing job is not an error."""
+        row = await self._session.get(DownloadJobModel, job_id)
+        if row is not None:
+            await self._session.delete(row)
+            await self._session.flush()
+
+    async def failure_rate(self, source: str, *, since: dt.datetime) -> float:
+        """Return the share of ``source`` jobs created at/after ``since`` that FAILED.
+
+        Both counts come from a single query: a plain ``count()`` for the total
+        and a filtered ``count()`` for rows whose status is ``FAILED``. Yields
+        0.0 when there are no jobs in the window (avoids dividing by zero).
+        """
+        total, failed = (
+            await self._session.execute(
+                select(
+                    func.count(),
+                    func.count().filter(DownloadJobModel.status == DownloadStatus.FAILED.value),
+                )
+                .select_from(DownloadJobModel)
+                .where(
+                    DownloadJobModel.source == source,
+                    DownloadJobModel.created_at >= since,
+                )
+            )
+        ).one()
+        return (failed / total) if total else 0.0
@@ -78,7 +78,7 @@ class AcoustIdHttpClient:
                )
            resp.raise_for_status()
            return resp.json()  # type: ignore[no-any-return]
-        except (httpx.HTTPError, ValueError):
+        except httpx.HTTPError, ValueError:
            log.warning("acoustid_lookup_failed")
            return None

@@ -2,16 +2,18 @@

 Built from settings at the composition root. Only sources that are configured
 are registered (e.g. ``local`` appears only when ``LOCAL_MEDIA_IMPORT_PATH`` is
-set), so enumeration reflects what the instance can actually use.
+set; ``youtube`` only when ``YOUTUBE_ENABLED``), so enumeration reflects what the
+instance can actually use.
 """

 from typing import cast

 from app.core.config import Settings
 from app.domain.errors import NotFoundError, ValidationError
-from app.domain.ports import IndexableSource, SourceBackend
+from app.domain.ports import FetchableSource, IndexableSource, SearchableSource, SourceBackend
 from app.domain.sources import SourceInfo
 from app.infrastructure.sources.local_folder import LocalFolderSource
+from app.infrastructure.sources.youtube import YouTubeMusicSource


 class SourceRegistry:
@@ -30,6 +32,22 @@ class SourceRegistry:
            raise ValidationError(f"Source {name!r} cannot be indexed.")
        return cast(IndexableSource, backend)

+    def searchable(self, name: str) -> SearchableSource:
+        """Return the backend registered as ``name`` as a ``SearchableSource``.
+
+        Capability is detected by duck typing (presence of a ``search``
+        attribute); ``cast`` only informs the type checker.
+
+        Raises:
+            ValidationError: the backend has no ``search`` attribute.
+        """
+        # NOTE(review): ``self.get`` is defined outside this hunk; it presumably
+        # raises for an unknown name — confirm.
+        backend = self.get(name)
+        if not hasattr(backend, "search"):
+            raise ValidationError(f"Source {name!r} cannot be searched.")
+        return cast(SearchableSource, backend)
+
+    def fetchable(self, name: str) -> FetchableSource:
+        """Return the backend registered as ``name`` as a ``FetchableSource``.
+
+        Capability is detected by duck typing (presence of a ``fetch``
+        attribute); ``cast`` only informs the type checker.
+
+        Raises:
+            ValidationError: the backend has no ``fetch`` attribute.
+        """
+        # NOTE(review): ``self.get`` is defined outside this hunk; it presumably
+        # raises for an unknown name — confirm.
+        backend = self.get(name)
+        if not hasattr(backend, "fetch"):
+            raise ValidationError(f"Source {name!r} cannot download.")
+        return cast(FetchableSource, backend)
+
+    def searchables(self) -> list[SearchableSource]:
+        """Every registered source that supports search (for cross-source search)."""
+        # Same duck-typed check as ``searchable``: a backend qualifies when it
+        # exposes a ``search`` attribute. Order follows ``self._by_name``.
+        return [cast(SearchableSource, b) for b in self._by_name.values() if hasattr(b, "search")]
+
    def infos(self) -> list[SourceInfo]:
        return [backend.info() for backend in self._by_name.values()]

@@ -38,4 +56,11 @@ def build_source_registry(settings: Settings) -> SourceRegistry:
    backends: list[SourceBackend] = []
    if settings.local_media_import_path is not None:
        backends.append(LocalFolderSource(settings.local_media_import_path))
+    if settings.youtube_enabled:
+        backends.append(
+            YouTubeMusicSource(
+                cookies_path=settings.youtube_cookies_path,
+                tmp_dir=settings.upload_tmp_dir,
+            )
+        )
    return SourceRegistry(backends)
@@ -0,0 +1,207 @@
+"""``youtube`` source — YouTube Music search + download (plan §5).
+
+A *fetch* source: it searches YouTube Music (via ``ytmusicapi``, which returns
+clean song/artist/album/duration rows) and downloads the chosen item with
+``yt-dlp``. The two libraries are synchronous, so every call is bounced to a
+worker thread (``anyio.to_thread``); the sync yt-dlp progress hook bridges back
+to the async progress callback via ``anyio.from_thread``.
+
+Both libraries are optional dependencies — if either is missing the source is
+simply *unavailable* (it never crashes import or the registry; graceful
+degradation per CLAUDE.md). The audio stream is stored **as-is** (YouTube serves
+lossy Opus/AAC; re-encoding would be lossy→lossy, plan §6.6).
+
+``source_id`` is the YouTube ``videoId`` — stable, so a re-download of the same
+id is idempotent and dedups against an existing track.
+"""
+
+import functools
+import tempfile
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+
+import anyio
+
+from app.core.logging import get_logger
+from app.domain.ports import ProgressCallback
+from app.domain.sources import (
+    KIND_FETCH,
+    DownloadResult,
+    RawMetadata,
+    SearchResult,
+    SourceInfo,
+)
+from app.infrastructure.db.models.enums import TrackSource
+
+log = get_logger(__name__)
+
+# Functions a caller may inject for testing (defaults do the real library work).
+# Both are synchronous; the source runs them in a worker thread.
+# (query, limit) -> raw result rows
+SearchFn = Callable[[str, int], list[dict[str, Any]]]
+# (video_id, tmp_dir, progress_hook, cookies_path) -> normalized download dict
+# The dict is read for "filepath" and "file_format" (required) plus the
+# optional "bitrate" and "title" keys.
+DownloadFn = Callable[[str, Path, Callable[[dict[str, Any]], None], Path | None], dict[str, Any]]
+
+
+def _libs_available() -> bool:
+    """Return True only when both ``yt_dlp`` and ``ytmusicapi`` can be imported.
+
+    The imports are attempted lazily inside the function so a missing optional
+    dependency surfaces as ``False`` here instead of failing module import.
+    """
+    try:
+        import yt_dlp  # noqa: F401
+        import ytmusicapi  # noqa: F401
+    except ImportError:
+        return False
+    return True
+
+
+def _watch_url(video_id: str) -> str:
+    """Build the music.youtube.com watch URL for ``video_id`` (inserted verbatim)."""
+    return f"https://music.youtube.com/watch?v={video_id}"
+
+
+class YouTubeMusicSource:
+    """Implements :class:`app.domain.ports.SearchableSource` and
+    :class:`~app.domain.ports.FetchableSource`."""
+
+    name = TrackSource.YOUTUBE.value
+
+    def __init__(
+        self,
+        *,
+        cookies_path: Path | None = None,
+        tmp_dir: Path | None = None,
+        search_fn: SearchFn | None = None,
+        download_fn: DownloadFn | None = None,
+    ) -> None:
+        """Configure the source.
+
+        Args:
+            cookies_path: Forwarded unchanged to the download function.
+            tmp_dir: Download directory; ``fetch`` falls back to the system
+                temp dir when this is ``None``.
+            search_fn: Replacement for the ytmusicapi-backed search.
+            download_fn: Replacement for the yt-dlp-backed download.
+        """
+        self._cookies_path = cookies_path
+        self._tmp_dir = tmp_dir
+        self._search_fn = search_fn or _default_search
+        self._download_fn = download_fn or _default_download
+        # Only the real library path needs the deps; an injected fn is self-contained.
+        # Note the flag is set when *either* function is injected.
+        self._injected = search_fn is not None or download_fn is not None
+
+    def info(self) -> SourceInfo:
+        """Describe this source as a ``KIND_FETCH`` backend with its current availability."""
+        return SourceInfo(
+            name=self.name,
+            label="YouTube Music",
+            kind=KIND_FETCH,
+            available=self.is_available(),
+        )
+
+    def is_available(self) -> bool:
+        """Return True when a function was injected, else whether both libs import."""
+        return True if self._injected else _libs_available()
+
+    async def search(self, query: str, *, limit: int) -> list[SearchResult]:
+        """Search for ``query`` and return at most what the search function yields.
+
+        A blank query short-circuits to ``[]``. The synchronous search function
+        runs in a worker thread; any exception it raises is logged and turned
+        into ``[]``. Rows that ``_to_result`` rejects (no ``videoId``) are
+        dropped.
+        """
+        query = query.strip()
+        if not query:
+            return []
+        try:
+            rows = await anyio.to_thread.run_sync(functools.partial(self._search_fn, query, limit))
+        except Exception:
+            # No results / service down → degrade to empty (plan §5, CLAUDE.md).
+            log.warning("ytm_search_failed", query=query)
+            return []
+        # NOTE(review): the mapping below runs outside the ``try``, so an
+        # unexpectedly shaped row would raise instead of degrading to empty.
+        return [r for r in (self._to_result(row) for row in rows) if r is not None]
+
+    async def fetch(
+        self, source_id: str, *, on_progress: ProgressCallback | None = None
+    ) -> DownloadResult:
+        """Download ``source_id`` and describe the resulting file.
+
+        The download function runs in a worker thread and reports through
+        ``hook``; its exceptions are not caught here and propagate to the
+        caller. The returned dict must contain ``"filepath"`` and
+        ``"file_format"``; ``"bitrate"`` and ``"title"`` are optional, and
+        ``suggested_title`` falls back to ``source_id``.
+        """
+        tmp_dir = self._tmp_dir or Path(tempfile.gettempdir())
+
+        def hook(d: dict[str, Any]) -> None:
+            # Ignore every event except "downloading", and skip entirely when
+            # the caller did not ask for progress.
+            if on_progress is None or d.get("status") != "downloading":
+                return
+            total = d.get("total_bytes") or d.get("total_bytes_estimate")
+            done = d.get("downloaded_bytes")
+            # Unknown or zero total → no fraction can be computed.
+            if not total or done is None:
+                return
+            # Cap below 1.0 — the job only reaches 1.0 once stored + imported.
+            frac = min(done / total, 0.99)
+            # Bridge sync hook (worker thread) → async callback (event loop).
+            anyio.from_thread.run(on_progress, frac)
+
+        def _run() -> dict[str, Any]:
+            return self._download_fn(source_id, tmp_dir, hook, self._cookies_path)
+
+        info = await anyio.to_thread.run_sync(_run)
+        path = Path(info["filepath"])
+        # Size is read from disk rather than trusted from the download dict.
+        stat = await anyio.Path(path).stat()
+        return DownloadResult(
+            source_id=source_id,
+            path=path,
+            file_format=info["file_format"],
+            file_size=stat.st_size,
+            bitrate=info.get("bitrate"),
+            suggested_title=info.get("title") or source_id,
+        )
+
+    async def get_metadata(self, source_id: str) -> RawMetadata | None:
+        """Always return ``None``; no per-item metadata lookup is implemented."""
+        # The search result already carries a usable title/artist, and the
+        # canonical metadata comes from enrichment (§6.2). A dedicated lookup is
+        # an optional refinement — skipped for now (returns None gracefully).
+        return None
+
+    def _to_result(self, row: dict[str, Any]) -> SearchResult | None:
+        """Convert one raw search row into a ``SearchResult``.
+
+        Returns ``None`` for a row without a truthy ``videoId``. Artist names
+        are joined with ``", "``; the album name is taken only when ``album``
+        is a dict; a missing title becomes ``"Unknown"``. The untouched row is
+        kept in ``raw``.
+        """
+        video_id = row.get("videoId")
+        if not video_id:
+            return None  # non-playable row (e.g. a video without audio id)
+        artists = row.get("artists") or []
+        # Assumes each artist entry is a dict with an optional "name" key.
+        artist = ", ".join(a["name"] for a in artists if a.get("name")) or None
+        album = (row.get("album") or {}).get("name") if isinstance(row.get("album"), dict) else None
+        thumbnails = row.get("thumbnails") or []
+        # Takes the last thumbnail — presumably the largest; TODO confirm ordering.
+        thumbnail = thumbnails[-1].get("url") if thumbnails else None
+        return SearchResult(
+            source=self.name,
+            source_id=str(video_id),
+            title=row.get("title") or "Unknown",
+            artist=artist,
+            album=album,
+            duration_seconds=row.get("duration_seconds"),
+            thumbnail_url=thumbnail,
+            raw=row,
+        )
+
+
+def _default_search(query: str, limit: int) -> list[dict[str, Any]]:
+    """Real ytmusicapi search (songs only). Runs in a worker thread.
+
+    A fresh ``YTMusic`` client is created on every call. ``limit`` is passed to
+    the library and then enforced again by slicing the returned list.
+    """
+    # Imported lazily so the module still imports when ytmusicapi is absent.
+    from ytmusicapi import YTMusic
+
+    yt = YTMusic()  # unauthenticated: public search needs no login
+    results: list[dict[str, Any]] = yt.search(query, filter="songs", limit=limit)
+    return results[:limit]
+
+
+def _default_download(
+    video_id: str,
+    tmp_dir: Path,
+    progress_hook: Callable[[dict[str, Any]], None],
+    cookies_path: Path | None,
+) -> dict[str, Any]:
+    """Real yt-dlp download of the best audio stream. Runs in a worker thread.
+
+    Stores the original stream (no transcode — plan §6.3/§6.6). Returns a
+    normalized dict the adapter maps to :class:`DownloadResult`.
+
+    Args:
+        video_id: YouTube video id; turned into a watch URL via ``_watch_url``.
+        tmp_dir: Directory the file is written into as ``<id>.<ext>``.
+        progress_hook: Registered as the only yt-dlp progress hook.
+        cookies_path: Optional cookies file; ignored unless it is an existing file.
+
+    Returns:
+        A dict with ``"filepath"`` (``Path``), ``"file_format"`` (lower-cased
+        extension, ``"m4a"`` when the file has none), ``"bitrate"`` (``int`` or
+        ``None``) and ``"title"``.
+    """
+    # Imported lazily so the module still imports when yt-dlp is absent.
+    from yt_dlp import YoutubeDL
+
+    opts: dict[str, Any] = {
+        "format": "bestaudio/best",
+        "outtmpl": str(tmp_dir / "%(id)s.%(ext)s"),
+        "quiet": True,
+        "no_warnings": True,
+        "noprogress": True,
+        "progress_hooks": [progress_hook],
+    }
+    # Use cookies only when the file is actually present: the path can be set
+    # unconditionally (e.g. a mounted volume that may be empty) and downloads
+    # still work without it — cookies just unlock age/region-restricted items.
+    if cookies_path is not None and cookies_path.is_file():
+        opts["cookiefile"] = str(cookies_path)
+
+    with YoutubeDL(opts) as ydl:
+        info = ydl.extract_info(_watch_url(video_id), download=True)
+        # Re-derive the on-disk path from the same output template.
+        filepath = Path(ydl.prepare_filename(info))
+
+    # ``abr`` is yt-dlp's audio bitrate field — presumably kbit/s; TODO confirm.
+    abr = info.get("abr")
+    return {
+        "filepath": filepath,
+        "file_format": filepath.suffix.lstrip(".").lower() or "m4a",
+        "bitrate": int(abr) if abr else None,
+        "title": info.get("title"),
+    }