feat(sources): YouTube Music search + download pipeline (§1C/§1E)

Adds a pluggable fetch source — ytmusicapi search plus yt-dlp download (with a cookies-file guard) — along with the DownloadJob entity/repo, DownloadService, and a download_task worker with exponential-backoff retries, and wires up the /search, /sources/{source}/search, and /downloads endpoints. Also adds the youtube_enabled/cookies config settings, the yt-dlp and ytmusicapi dependencies, and the download_jobs.track_id migration. This snapshot also bundles in-progress storage/tracks/acoustid edits.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-14 14:04:33 +03:00
parent ea880edd57
commit 78007461e1
32 changed files with 2645 additions and 819 deletions
@@ -79,9 +79,7 @@ async def _create_test_db_if_missing() -> None:
    except Exception:
        return
    try:
-        exists = await conn.fetchval(
-            "SELECT 1 FROM pg_database WHERE datname = $1", _TEST_DB_NAME
-        )
+        exists = await conn.fetchval("SELECT 1 FROM pg_database WHERE datname = $1", _TEST_DB_NAME)
        if not exists:
            # CREATE DATABASE can't run inside a transaction; asyncpg's implicit
            # autocommit on a bare connection handles that.
@@ -0,0 +1,213 @@
+"""Unit tests for DownloadService — DB-free, in-memory fakes."""
+
+import datetime as dt
+import uuid
+from pathlib import Path
+
+import pytest
+from app.application.download_service import DownloadService
+from app.domain.entities import Artist, Track
+from app.domain.entities.download import DownloadJob
+from app.domain.sources import DownloadResult
+
+pytestmark = pytest.mark.asyncio
+
+
+class FakeArtistRepo:
+    async def get_or_create(self, name: str) -> Artist:
+        now = dt.datetime.now(dt.UTC)
+        return Artist(id=uuid.uuid4(), name=name, created_at=now, updated_at=now)
+
+
+class FakeTrackRepo:
+    def __init__(self) -> None:
+        self.by_source: dict[tuple[str, str], Track] = {}
+        self.added: list[Track] = []
+
+    async def get_by_source(self, source: str, source_id: str) -> Track | None:
+        return self.by_source.get((source, source_id))
+
+    async def add(self, **kw: object) -> Track:
+        now = dt.datetime.now(dt.UTC)
+        track = Track(
+            id=kw["id"],  # type: ignore[arg-type]
+            title=str(kw["title"]),
+            artist_id=kw["artist_id"],  # type: ignore[arg-type]
+            album_id=None,
+            storage_uri=str(kw["storage_uri"]),
+            file_format=str(kw["file_format"]),
+            file_size=int(kw["file_size"]),  # type: ignore[call-overload]
+            source=str(kw["source"]),
+            source_id=str(kw["source_id"]),
+            duration_seconds=None,
+            genre=None,
+            year=None,
+            track_number=None,
+            metadata_status=str(kw["metadata_status"]),
+            metadata_error=None,
+            enriched_at=None,
+            created_at=now,
+            updated_at=now,
+        )
+        self.by_source[(track.source, track.source_id)] = track
+        self.added.append(track)
+        return track
+
+
+class FakeStorage:
+    def __init__(self) -> None:
+        self.saved: dict[str, Path] = {}
+        self.deleted: list[str] = []
+
+    async def save_file(self, key: str, src_path: Path) -> int:
+        self.saved[key] = src_path
+        return 1
+
+    async def delete(self, key: str) -> None:
+        self.deleted.append(key)
+
+
+class FakeJobRepo:
+    def __init__(self) -> None:
+        self.jobs: dict[uuid.UUID, DownloadJob] = {}
+        self.active: dict[tuple[str, str], DownloadJob] = {}
+
+    def _make(self, **kw: object) -> DownloadJob:
+        now = dt.datetime.now(dt.UTC)
+        return DownloadJob(
+            id=uuid.uuid4(),
+            source=str(kw["source"]),
+            source_id=kw.get("source_id"),  # type: ignore[arg-type]
+            query=kw.get("query"),  # type: ignore[arg-type]
+            requested_by=kw.get("requested_by"),  # type: ignore[arg-type]
+            status="queued",
+            progress=0.0,
+            error_message=None,
+            retry_count=0,
+            track_id=None,
+            created_at=now,
+            updated_at=now,
+        )
+
+    async def add(self, **kw: object) -> DownloadJob:
+        job = self._make(**kw)
+        self.jobs[job.id] = job
+        return job
+
+    async def get_by_id(self, job_id: uuid.UUID) -> DownloadJob | None:
+        return self.jobs.get(job_id)
+
+    async def get_active_for_source(self, source: str, source_id: str) -> DownloadJob | None:
+        return self.active.get((source, source_id))
+
+    async def set_status(self, job_id: uuid.UUID, **kw: object) -> None: ...
+
+    async def delete(self, job_id: uuid.UUID) -> None:
+        self.jobs.pop(job_id, None)
+
+
+def _service(
+    *, jobs: FakeJobRepo, tracks: FakeTrackRepo, storage: FakeStorage, enqueued: list[uuid.UUID]
+) -> DownloadService:
+    async def enqueue_download(job_id: uuid.UUID) -> None:
+        enqueued.append(job_id)
+
+    return DownloadService(
+        jobs=jobs,  # type: ignore[arg-type]
+        tracks=tracks,  # type: ignore[arg-type]
+        artists=FakeArtistRepo(),  # type: ignore[arg-type]
+        storage=storage,  # type: ignore[arg-type]
+        enqueue_download=enqueue_download,
+    )
+
+
+def _track(source: str, source_id: str) -> Track:
+    now = dt.datetime.now(dt.UTC)
+    return Track(
+        id=uuid.uuid4(),
+        title="t",
+        artist_id=uuid.uuid4(),
+        album_id=None,
+        storage_uri="k",
+        file_format="mp3",
+        file_size=1,
+        source=source,
+        source_id=source_id,
+        duration_seconds=None,
+        genre=None,
+        year=None,
+        track_number=None,
+        metadata_status="pending",
+        metadata_error=None,
+        enriched_at=None,
+        created_at=now,
+        updated_at=now,
+    )
+
+
+async def test_request_dedups_against_library() -> None:
+    jobs, tracks, storage, enq = FakeJobRepo(), FakeTrackRepo(), FakeStorage(), []
+    tracks.by_source[("youtube", "abc")] = _track("youtube", "abc")
+    svc = _service(jobs=jobs, tracks=tracks, storage=storage, enqueued=enq)
+
+    result = await svc.request(source="youtube", source_id="abc", query=None, requested_by=None)
+
+    assert result.already_in_library is True
+    assert result.track_id is not None
+    assert result.job is None
+    assert enq == []  # nothing enqueued
+
+
+async def test_request_returns_existing_active_job() -> None:
+    jobs, tracks, storage, enq = FakeJobRepo(), FakeTrackRepo(), FakeStorage(), []
+    existing = await jobs.add(source="youtube", source_id="abc", query=None, requested_by=None)
+    jobs.active[("youtube", "abc")] = existing
+    svc = _service(jobs=jobs, tracks=tracks, storage=storage, enqueued=enq)
+
+    result = await svc.request(source="youtube", source_id="abc", query=None, requested_by=None)
+
+    assert result.already_in_library is False
+    assert result.job is not None
+    assert result.job.id == existing.id
+    assert enq == []  # not re-enqueued
+
+
+async def test_request_creates_and_enqueues_new_job() -> None:
+    jobs, tracks, storage, enq = FakeJobRepo(), FakeTrackRepo(), FakeStorage(), []
+    svc = _service(jobs=jobs, tracks=tracks, storage=storage, enqueued=enq)
+
+    result = await svc.request(
+        source="youtube", source_id="abc", query="bohemian", requested_by=None
+    )
+
+    assert result.already_in_library is False
+    assert result.job is not None
+    assert enq == [result.job.id]
+
+
+async def test_store_result_imports_and_cleans_temp(tmp_path: Path) -> None:
+    jobs, tracks, storage, enq = FakeJobRepo(), FakeTrackRepo(), FakeStorage(), []
+    svc = _service(jobs=jobs, tracks=tracks, storage=storage, enqueued=enq)
+
+    audio = tmp_path / "abc.webm"
+    audio.write_bytes(b"audio" * 20)
+    result = DownloadResult(
+        source_id="abc",
+        path=audio,
+        file_format="m4a",
+        file_size=100,
+        bitrate=160,
+        suggested_title="Bohemian Rhapsody",
+    )
+
+    track_id = await svc.store_result(source="youtube", result=result, requested_by=None)
+
+    assert len(tracks.added) == 1
+    stored = tracks.added[0]
+    assert stored.id == track_id
+    assert stored.source == "youtube"
+    assert stored.source_id == "abc"
+    assert stored.metadata_status == "pending"
+    assert stored.title == "Bohemian Rhapsody"
+    assert len(storage.saved) == 1
+    assert not audio.exists()  # temp file removed
@@ -0,0 +1,252 @@
+"""Integration tests for downloads + external search.
+
+Requires a reachable Postgres; skips otherwise. The download worker task is
+invoked directly (no Redis needed) against a fake fetch source, so the full
+DB + storage import path is covered without touching the network.
+"""
+
+import asyncio
+import os
+from collections.abc import AsyncIterator
+from pathlib import Path
+from typing import Any
+
+import pytest
+from app.core.config import get_settings
+from app.domain.sources import KIND_FETCH, DownloadResult, SearchResult, SourceInfo
+from app.infrastructure.db import Base, dispose_engine, get_engine, session_scope
+from app.infrastructure.db.repositories import (
+    SqlAlchemyRefreshTokenRepository,
+    SqlAlchemyUserRepository,
+)
+from app.infrastructure.sources.registry import SourceRegistry
+from asgi_lifespan import LifespanManager
+from httpx import ASGITransport, AsyncClient
+
+pytestmark = pytest.mark.asyncio
+
+_db_reachable_cache: bool | None = None
+
+
+async def _db_reachable() -> bool:
+    global _db_reachable_cache
+    if _db_reachable_cache is not None:
+        return _db_reachable_cache
+    from sqlalchemy import text
+
+    try:
+        async with asyncio.timeout(3):
+            async with get_engine().connect() as conn:
+                await conn.execute(text("SELECT 1"))
+        _db_reachable_cache = True
+    except Exception:
+        _db_reachable_cache = False
+    return _db_reachable_cache
+
+
+class FakeFetchSource:
+    """A searchable + fetchable source that writes a local file (no network)."""
+
+    name = "youtube"
+
+    def __init__(self, tmp_dir: Path) -> None:
+        self._tmp_dir = tmp_dir
+
+    def info(self) -> SourceInfo:
+        return SourceInfo(name=self.name, label="YouTube Music", kind=KIND_FETCH, available=True)
+
+    def is_available(self) -> bool:
+        return True
+
+    async def search(self, query: str, *, limit: int) -> list[SearchResult]:
+        return [
+            SearchResult(
+                source=self.name,
+                source_id="vid-1",
+                title=f"{query} song",
+                artist="Some Artist",
+                album="Some Album",
+                duration_seconds=200,
+                thumbnail_url="http://img/large.jpg",
+            )
+        ]
+
+    async def fetch(self, source_id: str, *, on_progress: Any = None) -> DownloadResult:
+        path = self._tmp_dir / f"{source_id}.m4a"
+        path.write_bytes(b"downloaded audio bytes" * 8)
+        if on_progress is not None:
+            await on_progress(0.5)
+        return DownloadResult(
+            source_id=source_id,
+            path=path,
+            file_format="webm",
+            file_size=path.stat().st_size,
+            bitrate=160,
+            suggested_title=f"Title for {source_id}",
+        )
+
+    async def get_metadata(self, source_id: str) -> None:
+        return None
+
+
+@pytest.fixture
+async def api(tmp_path: Path) -> AsyncIterator[AsyncClient]:
+    if not await _db_reachable():
+        pytest.skip("Postgres not reachable — integration test skipped.")
+
+    media = tmp_path / "media"
+    media.mkdir()
+    os.environ["MEDIA_PATH"] = str(media)
+    get_settings.cache_clear()
+
+    import app.infrastructure.storage.provider as _storage_provider
+
+    _storage_provider._storage = None
+
+    try:
+        async with get_engine().begin() as conn:
+            await conn.run_sync(Base.metadata.drop_all)
+            await conn.run_sync(Base.metadata.create_all)
+
+        from app.application.user_service import UserService
+        from app.core.security import Argon2PasswordHasher
+
+        async with session_scope() as session:
+            await UserService(
+                users=SqlAlchemyUserRepository(session),
+                refresh_tokens=SqlAlchemyRefreshTokenRepository(session),
+                hasher=Argon2PasswordHasher(),
+            ).create_user(username="admin", password="adminpass1", is_superuser=True)
+
+        from app.api.deps import get_source_registry
+        from app.main import create_app
+
+        app = create_app()
+        # Inject a fake fetch source so search/download never hit the network.
+        fake_registry = SourceRegistry([FakeFetchSource(tmp_path / "dl")])  # type: ignore[list-item]
+        (tmp_path / "dl").mkdir()
+        app.dependency_overrides[get_source_registry] = lambda: fake_registry
+
+        async with LifespanManager(app):
+            transport = ASGITransport(app=app)
+            async with AsyncClient(transport=transport, base_url="http://test") as client:
+                yield client
+
+        async with get_engine().begin() as conn:
+            await conn.run_sync(Base.metadata.drop_all)
+        await dispose_engine()
+    finally:
+        _storage_provider._storage = None
+        os.environ.pop("MEDIA_PATH", None)
+        get_settings.cache_clear()
+
+
+async def _login(api: AsyncClient) -> str:
+    resp = await api.post(
+        "/api/v1/auth/login", json={"username": "admin", "password": "adminpass1"}
+    )
+    assert resp.status_code == 200
+    return str(resp.json()["access_token"])
+
+
+async def test_search_aggregates_fetch_sources(api: AsyncClient) -> None:
+    token = await _login(api)
+    headers = {"Authorization": f"Bearer {token}"}
+
+    resp = await api.get("/api/v1/search", params={"q": "queen"}, headers=headers)
+    assert resp.status_code == 200
+    body = resp.json()
+    assert body["searched_sources"] == ["youtube"]
+    assert len(body["results"]) == 1
+    hit = body["results"][0]
+    assert hit["source"] == "youtube"
+    assert hit["source_id"] == "vid-1"
+    assert hit["title"] == "queen song"
+
+
+async def test_source_scoped_search(api: AsyncClient) -> None:
+    token = await _login(api)
+    headers = {"Authorization": f"Bearer {token}"}
+    resp = await api.get("/api/v1/sources/youtube/search", params={"q": "abba"}, headers=headers)
+    assert resp.status_code == 200
+    assert resp.json()["results"][0]["title"] == "abba song"
+
+
+async def test_download_create_list_and_complete(
+    api: AsyncClient, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    token = await _login(api)
+    headers = {"Authorization": f"Bearer {token}"}
+
+    # Request a download — Redis is absent, so enqueue degrades but the job persists.
+    create = await api.post(
+        "/api/v1/downloads",
+        json={"source": "youtube", "source_id": "vid-1", "query": "queen"},
+        headers=headers,
+    )
+    assert create.status_code == 202
+    body = create.json()
+    assert body["already_in_library"] is False
+    job_id = body["job"]["id"]
+    assert body["job"]["status"] == "queued"
+
+    # It shows up in the listing.
+    listing = await api.get("/api/v1/downloads", headers=headers)
+    assert listing.status_code == 200
+    assert any(j["id"] == job_id for j in listing.json()["items"])
+
+    # A duplicate request returns the same in-flight job, not a new one.
+    dup = await api.post(
+        "/api/v1/downloads",
+        json={"source": "youtube", "source_id": "vid-1"},
+        headers=headers,
+    )
+    assert dup.json()["job"]["id"] == job_id
+
+    # Run the worker task directly (bypasses Redis) with the fake fetch source.
+    import app.workers.tasks.download_task as dl_task
+
+    worker_dl = tmp_path / "worker-dl"
+    worker_dl.mkdir()
+    fake = SourceRegistry([FakeFetchSource(worker_dl)])  # type: ignore[list-item]
+    monkeypatch.setattr(dl_task, "build_source_registry", lambda _settings: fake)
+
+    result = await dl_task.download_track({}, job_id=job_id)
+    assert result["status"] == "done"
+    track_id = result["track_id"]
+
+    # The job is now done and linked to the imported track.
+    got = await api.get(f"/api/v1/downloads/{job_id}", headers=headers)
+    assert got.json()["status"] == "done"
+    assert got.json()["track_id"] == track_id
+
+    # The imported track streams back.
+    stream = await api.get(f"/api/v1/stream/{track_id}", headers=headers)
+    assert stream.status_code == 200
+    assert len(stream.content) > 0
+
+    # A new request for the same item now dedups against the library.
+    again = await api.post(
+        "/api/v1/downloads",
+        json={"source": "youtube", "source_id": "vid-1"},
+        headers=headers,
+    )
+    assert again.json()["already_in_library"] is True
+    assert again.json()["track_id"] == track_id
+
+
+async def test_cancel_download(api: AsyncClient) -> None:
+    token = await _login(api)
+    headers = {"Authorization": f"Bearer {token}"}
+    create = await api.post(
+        "/api/v1/downloads",
+        json={"source": "youtube", "source_id": "vid-cancel"},
+        headers=headers,
+    )
+    job_id = create.json()["job"]["id"]
+
+    cancel = await api.delete(f"/api/v1/downloads/{job_id}", headers=headers)
+    assert cancel.status_code == 204
+
+    got = await api.get(f"/api/v1/downloads/{job_id}", headers=headers)
+    assert got.status_code == 404
@@ -48,12 +48,17 @@ def test_info_reports_kind_and_availability(tmp_path: Path) -> None:


 def test_registry_registers_local_when_path_set(tmp_path: Path) -> None:
-    registry = build_source_registry(_settings(local_media_import_path=tmp_path))
+    # Disable youtube to isolate the local-source registration under test.
+    registry = build_source_registry(
+        _settings(local_media_import_path=tmp_path, youtube_enabled=False)
+    )
    names = {info.name for info in registry.infos()}
    assert names == {"local"}
    assert registry.indexable("local").is_available() is True


 def test_registry_empty_when_path_unset() -> None:
-    registry = build_source_registry(_settings(local_media_import_path=None))
+    registry = build_source_registry(
+        _settings(local_media_import_path=None, youtube_enabled=False)
+    )
    assert registry.infos() == []
@@ -107,9 +107,7 @@ async def test_track_out_includes_genre_year_track_number(api: AsyncClient) -> N
    token = await _login(api)
    track_id = await _upload(api, token)

-    resp = await api.get(
-        f"/api/v1/tracks/{track_id}", headers={"Authorization": f"Bearer {token}"}
-    )
+    resp = await api.get(f"/api/v1/tracks/{track_id}", headers={"Authorization": f"Bearer {token}"})
    assert resp.status_code == 200, resp.text
    body = resp.json()
    assert "genre" in body
@@ -0,0 +1,135 @@
+"""Unit tests for YouTubeMusicSource + registry (no network, injected libs)."""
+
+from pathlib import Path
+from typing import Any
+
+import pytest
+from app.core.config import Settings
+from app.domain.sources import KIND_FETCH
+from app.infrastructure.sources.registry import build_source_registry
+from app.infrastructure.sources.youtube import YouTubeMusicSource
+
+pytestmark = pytest.mark.asyncio
+
+
+def _song_row(**overrides: Any) -> dict[str, Any]:
+    row: dict[str, Any] = {
+        "videoId": "abc123",
+        "title": "Bohemian Rhapsody",
+        "artists": [{"name": "Queen", "id": "a1"}],
+        "album": {"name": "A Night at the Opera", "id": "al1"},
+        "duration_seconds": 354,
+        "thumbnails": [
+            {"url": "http://img/small.jpg", "width": 60, "height": 60},
+            {"url": "http://img/large.jpg", "width": 240, "height": 240},
+        ],
+    }
+    row.update(overrides)
+    return row
+
+
+def _settings(**overrides: object) -> Settings:
+    return Settings(**overrides)  # type: ignore[arg-type]
+
+
+async def test_search_maps_ytmusic_rows() -> None:
+    source = YouTubeMusicSource(search_fn=lambda q, limit: [_song_row()])
+    [result] = await source.search("queen", limit=10)
+
+    assert result.source == "youtube"
+    assert result.source_id == "abc123"
+    assert result.title == "Bohemian Rhapsody"
+    assert result.artist == "Queen"
+    assert result.album == "A Night at the Opera"
+    assert result.duration_seconds == 354
+    assert result.thumbnail_url == "http://img/large.jpg"  # last (largest)
+
+
+async def test_search_joins_multiple_artists_and_tolerates_missing_fields() -> None:
+    row = _song_row(
+        artists=[{"name": "Queen"}, {"name": "David Bowie"}],
+        album=None,
+        thumbnails=[],
+        duration_seconds=None,
+    )
+    source = YouTubeMusicSource(search_fn=lambda q, limit: [row])
+    [result] = await source.search("under pressure", limit=10)
+
+    assert result.artist == "Queen, David Bowie"
+    assert result.album is None
+    assert result.thumbnail_url is None
+    assert result.duration_seconds is None
+
+
+async def test_search_drops_rows_without_video_id() -> None:
+    rows = [_song_row(), _song_row(videoId=None), _song_row(videoId="xyz")]
+    source = YouTubeMusicSource(search_fn=lambda q, limit: rows)
+    results = await source.search("q", limit=10)
+    assert [r.source_id for r in results] == ["abc123", "xyz"]
+
+
+async def test_search_empty_query_short_circuits() -> None:
+    called = False
+
+    def _search(q: str, limit: int) -> list[dict[str, Any]]:
+        nonlocal called
+        called = True
+        return []
+
+    source = YouTubeMusicSource(search_fn=_search)
+    assert await source.search("   ", limit=10) == []
+    assert called is False
+
+
+async def test_search_degrades_to_empty_on_error() -> None:
+    def _boom(q: str, limit: int) -> list[dict[str, Any]]:
+        raise RuntimeError("service down")
+
+    source = YouTubeMusicSource(search_fn=_boom)
+    assert await source.search("q", limit=10) == []
+
+
+async def test_fetch_maps_download_result(tmp_path: Path) -> None:
+    audio = tmp_path / "abc123.m4a"
+    audio.write_bytes(b"opus-bytes" * 10)
+
+    def _download(video_id: str, tmp_dir: Path, hook: Any, cookies: Path | None) -> dict[str, Any]:
+        return {
+            "filepath": audio,
+            "file_format": "m4a",
+            "bitrate": 160,
+            "title": "Bohemian Rhapsody",
+        }
+
+    source = YouTubeMusicSource(download_fn=_download)
+    result = await source.fetch("abc123")
+
+    assert result.source_id == "abc123"
+    assert result.path == audio
+    assert result.file_format == "m4a"
+    assert result.file_size == len(b"opus-bytes" * 10)
+    assert result.bitrate == 160
+    assert result.suggested_title == "Bohemian Rhapsody"
+
+
+async def test_info_and_availability_with_injected_fn() -> None:
+    source = YouTubeMusicSource(search_fn=lambda q, limit: [])
+    info = source.info()
+    assert info.name == "youtube"
+    assert info.kind == KIND_FETCH
+    assert info.available is True  # injected fn → treated as available
+
+
+async def test_registry_registers_youtube_when_enabled() -> None:
+    registry = build_source_registry(_settings(youtube_enabled=True))
+    names = {info.name for info in registry.infos()}
+    assert "youtube" in names
+    # youtube is searchable + fetchable, not indexable
+    assert registry.searchable("youtube").name == "youtube"
+    assert registry.fetchable("youtube").name == "youtube"
+
+
+async def test_registry_omits_youtube_when_disabled() -> None:
+    registry = build_source_registry(_settings(youtube_enabled=False))
+    names = {info.name for info in registry.infos()}
+    assert "youtube" not in names