feat(sources): YouTube Music search + download pipeline (§1C/§1E)
Pluggable fetch source: ytmusicapi search + yt-dlp download (cookies-file guard), DownloadJob entity/repo + DownloadService, download_task worker with exponential-backoff retries, and wired /search, /sources/{source}/search, and /downloads endpoints. Adds youtube_enabled/cookies config, yt-dlp+ytmusicapi deps, and the download_jobs.track_id migration. Snapshot also bundles in-progress storage/tracks/acoustid edits.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -35,3 +35,9 @@ class DownloadJobModel(UUIDPrimaryKeyMixin, TimestampMixin, Base):
|
||||
progress: Mapped[float] = mapped_column(Float, nullable=False, default=0.0)
|
||||
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
retry_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
|
||||
# Set once the download finishes and the track is imported — lets the UI
|
||||
# link a completed job to its library track.
|
||||
track_id: Mapped[uuid.UUID | None] = mapped_column(
|
||||
ForeignKey("tracks.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
)
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
|
||||
from app.infrastructure.db.repositories.album_repository import SqlAlchemyAlbumRepository
|
||||
from app.infrastructure.db.repositories.artist_repository import SqlAlchemyArtistRepository
|
||||
from app.infrastructure.db.repositories.download_job_repository import (
|
||||
SqlAlchemyDownloadJobRepository,
|
||||
)
|
||||
from app.infrastructure.db.repositories.history_repository import SqlAlchemyHistoryRepository
|
||||
from app.infrastructure.db.repositories.like_repository import SqlAlchemyLikeRepository
|
||||
from app.infrastructure.db.repositories.playlist_repository import SqlAlchemyPlaylistRepository
|
||||
@@ -14,6 +17,7 @@ from app.infrastructure.db.repositories.user_repository import SqlAlchemyUserRep
|
||||
__all__ = [
|
||||
"SqlAlchemyAlbumRepository",
|
||||
"SqlAlchemyArtistRepository",
|
||||
"SqlAlchemyDownloadJobRepository",
|
||||
"SqlAlchemyHistoryRepository",
|
||||
"SqlAlchemyLikeRepository",
|
||||
"SqlAlchemyPlaylistRepository",
|
||||
|
||||
@@ -0,0 +1,164 @@
|
||||
"""Download job repository — adapter over ``AsyncSession`` (plan §6.1)."""
|
||||
|
||||
import datetime as dt
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.domain.entities.download import DownloadJob
|
||||
from app.infrastructure.db.models.download_job import DownloadJobModel
|
||||
from app.infrastructure.db.models.enums import DownloadStatus
|
||||
|
||||
# Status values of jobs that have not finished yet (queued, downloading or
# enriching). Used to find an in-flight job when deduplicating a download.
_ACTIVE_STATUSES = tuple(
    member.value
    for member in (
        DownloadStatus.QUEUED,
        DownloadStatus.DOWNLOADING,
        DownloadStatus.ENRICHING,
    )
)
|
||||
|
||||
|
||||
def _to_entity(row: DownloadJobModel) -> DownloadJob:
    """Convert a ``DownloadJobModel`` ORM row into a ``DownloadJob`` entity.

    Each listed attribute is read from ``row`` and passed to ``DownloadJob``
    under the same name, unchanged.
    """
    copied_fields = (
        "id",
        "source",
        "source_id",
        "query",
        "requested_by",
        "status",
        "progress",
        "error_message",
        "retry_count",
        "track_id",
        "created_at",
        "updated_at",
    )
    return DownloadJob(**{field: getattr(row, field) for field in copied_fields})
|
||||
|
||||
|
||||
class SqlAlchemyDownloadJobRepository:
    """Download-job persistence on top of an ``AsyncSession``.

    Read methods hand back ``DownloadJob`` entities rather than ORM rows.
    Write methods flush the session; nothing in this class commits.
    """

    def __init__(self, session: AsyncSession) -> None:
        self._session = session

    async def add(
        self,
        *,
        source: str,
        source_id: str | None,
        query: str | None,
        requested_by: uuid.UUID | None,
    ) -> DownloadJob:
        """Insert a new job in the ``QUEUED`` state and return it as an entity."""
        session = self._session
        model = DownloadJobModel(
            source=source,
            source_id=source_id,
            query=query,
            requested_by=requested_by,
            # A fresh job always starts queued, with no progress and no retries.
            status=DownloadStatus.QUEUED.value,
            progress=0.0,
            retry_count=0,
        )
        session.add(model)
        await session.flush()
        # Re-read the row after the flush — presumably so database-populated
        # columns (id, timestamps) are loaded before mapping; confirm on the model.
        await session.refresh(model)
        return _to_entity(model)

    async def get_by_id(self, job_id: uuid.UUID) -> DownloadJob | None:
        """Return the job with primary key ``job_id``, or ``None`` if absent."""
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return None
        return _to_entity(model)

    async def get_active_for_source(self, source: str, source_id: str) -> DownloadJob | None:
        """Return the newest unfinished job for ``(source, source_id)``, if any.

        "Unfinished" means the status is one of ``_ACTIVE_STATUSES``.
        """
        newest_active = (
            select(DownloadJobModel)
            .where(
                DownloadJobModel.source == source,
                DownloadJobModel.source_id == source_id,
                DownloadJobModel.status.in_(_ACTIVE_STATUSES),
            )
            .order_by(DownloadJobModel.created_at.desc())
            .limit(1)
        )
        result = await self._session.execute(newest_active)
        model = result.scalar_one_or_none()
        if model is None:
            return None
        return _to_entity(model)

    async def list(
        self,
        *,
        requested_by: uuid.UUID | None,
        status: str | None,
        limit: int,
        offset: int,
    ) -> list[DownloadJob]:
        """Return one page of jobs, newest first.

        ``requested_by`` and ``status`` each narrow the result only when they
        are not ``None``.
        """
        criteria = []
        if requested_by is not None:
            criteria.append(DownloadJobModel.requested_by == requested_by)
        if status is not None:
            criteria.append(DownloadJobModel.status == status)

        selection = select(DownloadJobModel)
        for criterion in criteria:
            selection = selection.where(criterion)
        page = selection.order_by(DownloadJobModel.created_at.desc()).limit(limit).offset(offset)

        result = await self._session.execute(page)
        return [_to_entity(model) for model in result.scalars().all()]

    async def count(self, *, requested_by: uuid.UUID | None, status: str | None) -> int:
        """Count jobs using the same optional filters as :meth:`list`."""
        criteria = []
        if requested_by is not None:
            criteria.append(DownloadJobModel.requested_by == requested_by)
        if status is not None:
            criteria.append(DownloadJobModel.status == status)

        tally = select(func.count()).select_from(DownloadJobModel)
        for criterion in criteria:
            tally = tally.where(criterion)

        result = await self._session.execute(tally)
        return result.scalar_one()

    async def set_status(
        self,
        job_id: uuid.UUID,
        *,
        status: str,
        error_message: str | None = None,
        track_id: uuid.UUID | None = None,
    ) -> None:
        """Move a job to ``status``; does nothing when the job does not exist.

        ``track_id`` is stored only when one is given. Reaching ``DONE`` also
        sets ``progress`` to ``1.0``.
        """
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return

        model.status = status
        # Written unconditionally: a transition that passes no message wipes
        # whatever reason an earlier failed attempt left behind.
        model.error_message = error_message
        if track_id is not None:
            model.track_id = track_id
        if status == DownloadStatus.DONE.value:
            model.progress = 1.0

        await self._session.flush()

    async def set_progress(self, job_id: uuid.UUID, progress: float) -> None:
        """Store ``progress`` limited to ``[0.0, 1.0]``; no-op for an unknown job."""
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return

        capped = min(1.0, progress)
        model.progress = max(0.0, capped)
        await self._session.flush()

    async def increment_retry(self, job_id: uuid.UUID) -> int:
        """Add one to the job's retry counter and return the new value.

        Returns ``0`` when the job does not exist.
        """
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return 0

        model.retry_count = model.retry_count + 1
        await self._session.flush()
        return model.retry_count

    async def delete(self, job_id: uuid.UUID) -> None:
        """Delete the job if it exists; an unknown id is silently ignored."""
        model = await self._session.get(DownloadJobModel, job_id)
        if model is None:
            return

        await self._session.delete(model)
        await self._session.flush()

    async def failure_rate(self, source: str, *, since: dt.datetime) -> float:
        """Return the share of ``source`` jobs created at or after ``since`` that failed.

        The result is ``failed / total``, or ``0.0`` when no job matches.
        """
        is_failed = DownloadJobModel.status == DownloadStatus.FAILED.value
        tally = (
            select(
                func.count(),
                func.count().filter(is_failed),
            )
            .select_from(DownloadJobModel)
            .where(
                DownloadJobModel.source == source,
                DownloadJobModel.created_at >= since,
            )
        )
        result = await self._session.execute(tally)
        total, failed = result.one()
        if not total:
            return 0.0
        return failed / total
|
||||
@@ -78,7 +78,7 @@ class AcoustIdHttpClient:
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json() # type: ignore[no-any-return]
|
||||
except (httpx.HTTPError, ValueError):
|
||||
except httpx.HTTPError, ValueError:
|
||||
log.warning("acoustid_lookup_failed")
|
||||
return None
|
||||
|
||||
|
||||
@@ -2,16 +2,18 @@
|
||||
|
||||
Built from settings at the composition root. Only sources that are configured
|
||||
are registered (e.g. ``local`` appears only when ``LOCAL_MEDIA_IMPORT_PATH`` is
|
||||
set), so enumeration reflects what the instance can actually use.
|
||||
set; ``youtube`` only when ``YOUTUBE_ENABLED``), so enumeration reflects what the
|
||||
instance can actually use.
|
||||
"""
|
||||
|
||||
from typing import cast
|
||||
|
||||
from app.core.config import Settings
|
||||
from app.domain.errors import NotFoundError, ValidationError
|
||||
from app.domain.ports import IndexableSource, SourceBackend
|
||||
from app.domain.ports import FetchableSource, IndexableSource, SearchableSource, SourceBackend
|
||||
from app.domain.sources import SourceInfo
|
||||
from app.infrastructure.sources.local_folder import LocalFolderSource
|
||||
from app.infrastructure.sources.youtube import YouTubeMusicSource
|
||||
|
||||
|
||||
class SourceRegistry:
|
||||
@@ -30,6 +32,22 @@ class SourceRegistry:
|
||||
raise ValidationError(f"Source {name!r} cannot be indexed.")
|
||||
return cast(IndexableSource, backend)
|
||||
|
||||
def searchable(self, name: str) -> SearchableSource:
    """Return the backend registered as ``name``, typed as a searchable source.

    The backend is looked up through ``self.get``. A backend with no
    ``search`` attribute is rejected with ``ValidationError``.
    """
    backend = self.get(name)
    if hasattr(backend, "search"):
        return cast(SearchableSource, backend)
    raise ValidationError(f"Source {name!r} cannot be searched.")
|
||||
|
||||
def fetchable(self, name: str) -> FetchableSource:
    """Return the backend registered as ``name``, typed as a fetchable source.

    The backend is looked up through ``self.get``. A backend with no
    ``fetch`` attribute is rejected with ``ValidationError``.
    """
    backend = self.get(name)
    if hasattr(backend, "fetch"):
        return cast(FetchableSource, backend)
    raise ValidationError(f"Source {name!r} cannot download.")
|
||||
|
||||
def searchables(self) -> list[SearchableSource]:
    """Return every registered backend that has a ``search`` attribute.

    Backends are returned in the iteration order of ``self._by_name``.
    """
    found: list[SearchableSource] = []
    for backend in self._by_name.values():
        if hasattr(backend, "search"):
            found.append(cast(SearchableSource, backend))
    return found
|
||||
|
||||
def infos(self) -> list[SourceInfo]:
    """Return ``info()`` of every registered backend, in registration-map order."""
    collected: list[SourceInfo] = []
    for backend in self._by_name.values():
        collected.append(backend.info())
    return collected
|
||||
|
||||
@@ -38,4 +56,11 @@ def build_source_registry(settings: Settings) -> SourceRegistry:
|
||||
backends: list[SourceBackend] = []
|
||||
if settings.local_media_import_path is not None:
|
||||
backends.append(LocalFolderSource(settings.local_media_import_path))
|
||||
if settings.youtube_enabled:
|
||||
backends.append(
|
||||
YouTubeMusicSource(
|
||||
cookies_path=settings.youtube_cookies_path,
|
||||
tmp_dir=settings.upload_tmp_dir,
|
||||
)
|
||||
)
|
||||
return SourceRegistry(backends)
|
||||
|
||||
@@ -0,0 +1,207 @@
|
||||
"""``youtube`` source — YouTube Music search + download (plan §5).
|
||||
|
||||
A *fetch* source: it searches YouTube Music (via ``ytmusicapi``, which returns
|
||||
clean song/artist/album/duration rows) and downloads the chosen item with
|
||||
``yt-dlp``. The two libraries are synchronous, so every call is bounced to a
|
||||
worker thread (``anyio.to_thread``); the sync yt-dlp progress hook bridges back
|
||||
to the async progress callback via ``anyio.from_thread``.
|
||||
|
||||
Both libraries are optional dependencies — if either is missing the source is
|
||||
simply *unavailable* (it never crashes import or the registry; graceful
|
||||
degradation per CLAUDE.md). The audio stream is stored **as-is** (YouTube serves
|
||||
lossy Opus/AAC; re-encoding would be lossy→lossy, plan §6.6).
|
||||
|
||||
``source_id`` is the YouTube ``videoId`` — stable, so a re-download of the same
|
||||
id is idempotent and dedups against an existing track.
|
||||
"""
|
||||
|
||||
import functools
|
||||
import tempfile
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import anyio
|
||||
|
||||
from app.core.logging import get_logger
|
||||
from app.domain.ports import ProgressCallback
|
||||
from app.domain.sources import (
|
||||
KIND_FETCH,
|
||||
DownloadResult,
|
||||
RawMetadata,
|
||||
SearchResult,
|
||||
SourceInfo,
|
||||
)
|
||||
from app.infrastructure.db.models.enums import TrackSource
|
||||
|
||||
log = get_logger(__name__)
|
||||
|
||||
# Functions a caller may inject for testing (defaults do the real library work).
|
||||
SearchFn = Callable[[str, int], list[dict[str, Any]]]
|
||||
# (video_id, tmp_dir, progress_hook, cookies_path) -> normalized download dict
|
||||
DownloadFn = Callable[[str, Path, Callable[[dict[str, Any]], None], Path | None], dict[str, Any]]
|
||||
|
||||
|
||||
def _libs_available() -> bool:
|
||||
try:
|
||||
import yt_dlp # noqa: F401
|
||||
import ytmusicapi # noqa: F401
|
||||
except ImportError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _watch_url(video_id: str) -> str:
|
||||
return f"https://music.youtube.com/watch?v={video_id}"
|
||||
|
||||
|
||||
class YouTubeMusicSource:
    """YouTube Music backend offering search and download.

    Implements :class:`app.domain.ports.SearchableSource` and
    :class:`~app.domain.ports.FetchableSource`. The search and download
    callables are synchronous and are run in a worker thread.
    """

    name = TrackSource.YOUTUBE.value

    def __init__(
        self,
        *,
        cookies_path: Path | None = None,
        tmp_dir: Path | None = None,
        search_fn: SearchFn | None = None,
        download_fn: DownloadFn | None = None,
    ) -> None:
        """Store configuration; fall back to the real library callables.

        ``search_fn`` / ``download_fn`` replace the default implementations
        when supplied (the module describes them as test injection points).
        """
        self._cookies_path = cookies_path
        self._tmp_dir = tmp_dir
        self._search_fn = search_fn if search_fn else _default_search
        self._download_fn = download_fn if download_fn else _default_download
        # With either callable injected the source reports itself available
        # without probing for the optional libraries.
        self._injected = not (search_fn is None and download_fn is None)

    def info(self) -> SourceInfo:
        """Describe this source, including its current availability."""
        available = self.is_available()
        return SourceInfo(
            name=self.name,
            label="YouTube Music",
            kind=KIND_FETCH,
            available=available,
        )

    def is_available(self) -> bool:
        """Return ``True`` when a callable was injected or both libraries import."""
        if self._injected:
            return True
        return _libs_available()

    async def search(self, query: str, *, limit: int) -> list[SearchResult]:
        """Search for ``query`` and return the rows that map to a result.

        A blank query, or any exception raised by the search callable, yields
        an empty list. Rows without a ``videoId`` are dropped.
        """
        term = query.strip()
        if not term:
            return []

        run_search = functools.partial(self._search_fn, term, limit)
        try:
            rows = await anyio.to_thread.run_sync(run_search)
        except Exception:
            # No results / service down → degrade to empty (plan §5, CLAUDE.md).
            log.warning("ytm_search_failed", query=term)
            return []

        results: list[SearchResult] = []
        for row in rows:
            mapped = self._to_result(row)
            if mapped is not None:
                results.append(mapped)
        return results

    async def fetch(
        self, source_id: str, *, on_progress: ProgressCallback | None = None
    ) -> DownloadResult:
        """Download ``source_id`` and describe the resulting file.

        The download callable runs in a worker thread. While it reports
        ``"downloading"`` events with known byte counts, ``on_progress`` (if
        given) is invoked with a fraction capped at ``0.99``.
        """
        target_dir = self._tmp_dir or Path(tempfile.gettempdir())

        def report(event: dict[str, Any]) -> None:
            if on_progress is None:
                return
            if event.get("status") != "downloading":
                return
            expected = event.get("total_bytes") or event.get("total_bytes_estimate")
            received = event.get("downloaded_bytes")
            if received is None or not expected:
                return
            # Cap below 1.0 — the job only reaches 1.0 once stored + imported.
            fraction = min(received / expected, 0.99)
            # This hook runs in the worker thread; hand the awaitable callback
            # back to the event loop.
            anyio.from_thread.run(on_progress, fraction)

        def download() -> dict[str, Any]:
            return self._download_fn(source_id, target_dir, report, self._cookies_path)

        downloaded = await anyio.to_thread.run_sync(download)
        path = Path(downloaded["filepath"])
        size = (await anyio.Path(path).stat()).st_size
        return DownloadResult(
            source_id=source_id,
            path=path,
            file_format=downloaded["file_format"],
            file_size=size,
            bitrate=downloaded.get("bitrate"),
            suggested_title=downloaded.get("title") or source_id,
        )

    async def get_metadata(self, source_id: str) -> RawMetadata | None:
        """Always return ``None`` — no dedicated metadata lookup is performed.

        The search result already carries a usable title/artist, and the
        canonical metadata comes from enrichment (§6.2); a separate lookup is
        an optional refinement that is skipped for now.
        """
        return None

    def _to_result(self, row: dict[str, Any]) -> SearchResult | None:
        """Map one raw search row to a ``SearchResult``.

        Returns ``None`` for a row with no ``videoId``. Artist names are
        joined with ``", "``; the last thumbnail entry supplies the URL.
        """
        video_id = row.get("videoId")
        if not video_id:
            return None  # non-playable row (e.g. a video without audio id)

        artist_names = [a["name"] for a in (row.get("artists") or []) if a.get("name")]
        artist = ", ".join(artist_names) or None

        album_row = row.get("album")
        album = album_row.get("name") if isinstance(album_row, dict) else None

        thumbnails = row.get("thumbnails") or []
        thumbnail = thumbnails[-1].get("url") if thumbnails else None

        return SearchResult(
            source=self.name,
            source_id=str(video_id),
            title=row.get("title") or "Unknown",
            artist=artist,
            album=album,
            duration_seconds=row.get("duration_seconds"),
            thumbnail_url=thumbnail,
            raw=row,
        )
|
||||
|
||||
|
||||
def _default_search(query: str, limit: int) -> list[dict[str, Any]]:
    """Search YouTube Music for songs matching ``query`` using ``ytmusicapi``.

    Blocking; meant to be called from a worker thread. The returned list is
    cut to ``limit`` entries — presumably because the library may hand back
    more rows than requested (confirm against ytmusicapi).
    """
    from ytmusicapi import YTMusic

    # No credentials are passed: public search needs no login.
    client = YTMusic()
    hits: list[dict[str, Any]] = client.search(query, filter="songs", limit=limit)
    return hits[:limit]
|
||||
|
||||
|
||||
def _default_download(
    video_id: str,
    tmp_dir: Path,
    progress_hook: Callable[[dict[str, Any]], None],
    cookies_path: Path | None,
) -> dict[str, Any]:
    """Download the best audio stream of one video with yt-dlp (blocking).

    Meant to run in a worker thread. The original stream is stored as-is (no
    transcode — plan §6.3/§6.6).

    Args:
        video_id: YouTube ``videoId`` to download.
        tmp_dir: Directory the file is written to, as ``<id>.<ext>``.
        progress_hook: Callback registered in yt-dlp's ``progress_hooks``.
        cookies_path: Optional cookies file; used only if the file exists.

    Returns:
        A dict with ``filepath`` (``Path``), ``file_format`` (lower-cased
        extension, ``"m4a"`` when the file has none), ``bitrate`` (``int`` or
        ``None``) and ``title``.
    """
    from yt_dlp import YoutubeDL

    opts: dict[str, Any] = {
        "format": "bestaudio/best",
        "outtmpl": str(tmp_dir / "%(id)s.%(ext)s"),
        "quiet": True,
        "no_warnings": True,
        "noprogress": True,
        # Exactly one file is expected back. Without this, a watch URL that
        # ends up carrying a ``list=`` parameter makes yt-dlp expand the whole
        # playlist instead of fetching the single video.
        "noplaylist": True,
        "progress_hooks": [progress_hook],
    }
    # Use cookies only when the file is actually present: the path can be set
    # unconditionally (e.g. a mounted volume that may be empty) and downloads
    # still work without it — cookies just unlock age/region-restricted items.
    if cookies_path is not None and cookies_path.is_file():
        opts["cookiefile"] = str(cookies_path)

    with YoutubeDL(opts) as ydl:
        info = ydl.extract_info(_watch_url(video_id), download=True)
        # Ask yt-dlp which path the output template resolved to for this item.
        filepath = Path(ydl.prepare_filename(info))

    abr = info.get("abr")
    return {
        "filepath": filepath,
        "file_format": filepath.suffix.lstrip(".").lower() or "m4a",
        "bitrate": int(abr) if abr else None,
        "title": info.get("title"),
    }
|
||||
Reference in New Issue
Block a user