feat(sources): local_folder source backend + import pipeline
Docker Build & Publish / build (push) Has been cancelled
Docker Build & Publish / push (push) Has been cancelled
Docker Build & Publish / Prune old image versions (push) Has been cancelled

First ingest path beyond manual upload (plan §1C). Source abstraction +
the first concrete backend, so a homelab can index an existing library.

- domain: SourceBackend/IndexableSource ports + SourceInfo/SourceFile shapes
- infrastructure/sources: LocalFolderSource (walks a mounted dir, idempotent
  source_id = relative path) + registry built from settings
- application: LibraryImportService — batch sibling of UploadService; dedup on
  (source, source_id), copy into storage, minimal track (metadata_status=pending,
  enrichment fills the rest in 1D), per-file failures isolated
- workers: scan_local_folder arq task (registered) + enqueue helper (503 if
  Redis down)
- api: GET /sources, POST /sources/{source}/scan (admin, enqueues), /health
- config: LOCAL_MEDIA_IMPORT_PATH; README + .env.example documented
- tests: scanner, registry, import service (fakes) + DB-gated sources API path

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Senko-san
2026-06-08 20:02:09 +03:00
parent 551afbab13
commit 48e3418c7f
19 changed files with 800 additions and 11 deletions
+9
View File
@@ -35,6 +35,7 @@ from app.infrastructure.db.repositories import (
SqlAlchemyTrackRepository,
SqlAlchemyUserRepository,
)
from app.infrastructure.sources.registry import SourceRegistry, build_source_registry
from app.infrastructure.storage.provider import get_file_storage
@@ -70,6 +71,14 @@ def get_subsonic_cipher() -> SubsonicCipher:
return SubsonicPasswordCipher(get_settings().subsonic_secret_key.get_secret_value())
@lru_cache
def get_source_registry() -> SourceRegistry:
return build_source_registry(get_settings())
SourceRegistryDep = Annotated[SourceRegistry, Depends(get_source_registry)]
# -- request-scoped services ---------------------------------------------------
def get_auth_service(session: SessionDep) -> AuthService:
return AuthService(
+29
View File
@@ -0,0 +1,29 @@
"""Schemas for the source endpoints."""
from pydantic import BaseModel
from app.domain.sources import SourceInfo
class SourceInfoOut(BaseModel):
name: str
label: str
kind: str
available: bool
@classmethod
def from_entity(cls, info: SourceInfo) -> SourceInfoOut:
return cls(name=info.name, label=info.label, kind=info.kind, available=info.available)
class ScanResponse(BaseModel):
"""Result of enqueuing a source scan."""
source: str
job_id: str
status: str = "queued"
class SourceHealthOut(BaseModel):
name: str
available: bool
+30 -5
View File
@@ -1,19 +1,44 @@
"""External source endpoints (yt-dlp etc.)."""
"""External source endpoints: enumerate sources and trigger imports.
Listing/health are read-only (any authenticated user). Scanning a source is an
admin action and runs in a worker — the endpoint only enqueues it.
"""
from typing import Any
from fastapi import APIRouter
from app.api.deps import CurrentUser, SourceRegistryDep, SuperUser
from app.api.schemas.source import ScanResponse, SourceHealthOut, SourceInfoOut
from app.domain.errors import DependencyUnavailableError
from app.workers.queue import enqueue
router = APIRouter(prefix="/sources", tags=["sources"])
@router.get("")
async def list_sources() -> Any: ...
async def list_sources(_: CurrentUser, registry: SourceRegistryDep) -> list[SourceInfoOut]:
return [SourceInfoOut.from_entity(info) for info in registry.infos()]
@router.get("/{source}/search")
async def search_source(source: str) -> Any: ...
@router.post("/{source}/scan")
async def scan_source(source: str, user: SuperUser, registry: SourceRegistryDep) -> ScanResponse:
backend = registry.indexable(source) # 404 if unknown, 422 if not indexable
if not backend.is_available():
raise DependencyUnavailableError(f"Source {source!r} is not available.")
job_id = await enqueue("scan_local_folder", source=source, added_by=str(user.id))
return ScanResponse(source=source, job_id=job_id)
@router.get("/{source}/health")
async def source_health(source: str) -> Any: ...
async def source_health(
source: str, _: CurrentUser, registry: SourceRegistryDep
) -> SourceHealthOut:
backend = registry.get(source) # 404 if unknown
return SourceHealthOut(name=backend.name, available=backend.is_available())
@router.get("/{source}/search")
async def search_source(source: str, _: CurrentUser) -> Any:
# Search is for fetch-style sources (youtube, …) — not yet implemented.
...
+96
View File
@@ -0,0 +1,96 @@
"""LibraryImportService — imports files discovered by an indexable source.
Batch sibling of :class:`UploadService`: for each discovered file it dedups on
``(source, source_id)``, copies the file into managed storage, creates a minimal
track (artist ``Unknown Artist``, ``metadata_status=pending``), and leaves the
rest to enrichment (plan §6.2). Per-file failures are isolated — one bad file
must not abort the whole scan (graceful degradation).
"""
import contextlib
import uuid
from dataclasses import dataclass
from app.core.logging import get_logger
from app.domain.ports import ArtistRepository, FileStorage, IndexableSource, TrackRepository
from app.domain.sources import SourceFile
log = get_logger(__name__)
_UNKNOWN_ARTIST = "Unknown Artist"
@dataclass(frozen=True)
class ImportSummary:
source: str
seen: int
imported: int
skipped: int
failed: int
class LibraryImportService:
def __init__(
self,
*,
tracks: TrackRepository,
artists: ArtistRepository,
storage: FileStorage,
) -> None:
self._tracks = tracks
self._artists = artists
self._storage = storage
async def scan_and_import(
self, source: IndexableSource, *, added_by: uuid.UUID | None
) -> ImportSummary:
seen = imported = skipped = failed = 0
for file in source.scan():
seen += 1
try:
existing = await self._tracks.get_by_source(source.name, file.source_id)
if existing is not None:
skipped += 1
continue
await self._import_one(source.name, file, added_by)
imported += 1
except Exception:
failed += 1
log.warning("import_file_failed", source=source.name, source_id=file.source_id)
summary = ImportSummary(
source=source.name, seen=seen, imported=imported, skipped=skipped, failed=failed
)
log.info(
"import_complete",
source=summary.source,
seen=summary.seen,
imported=summary.imported,
skipped=summary.skipped,
failed=summary.failed,
)
return summary
async def _import_one(
self, source_name: str, file: SourceFile, added_by: uuid.UUID | None
) -> None:
track_id = uuid.uuid4()
key = f"tracks/{str(track_id)[:2]}/{track_id}.{file.file_format}"
await self._storage.save_file(key, file.path)
try:
artist = await self._artists.get_or_create(_UNKNOWN_ARTIST)
await self._tracks.add(
id=track_id,
title=file.suggested_title,
artist_id=artist.id,
storage_uri=key,
file_format=file.file_format,
file_size=file.file_size,
source=source_name,
source_id=file.source_id,
metadata_status="pending",
added_by=added_by,
)
except Exception:
with contextlib.suppress(Exception):
await self._storage.delete(key)
raise
+5
View File
@@ -58,6 +58,11 @@ class Settings(BaseSettings):
storage_backend: Literal["local", "s3"] = "local"
upload_tmp_dir: Path | None = None
# -- sources ----------------------------------------------------------
# Mounted folder the ``local`` source indexes (copies into managed storage).
# Unset → the local source is simply not registered.
local_media_import_path: Path | None = None
# -- S3 storage (deferred; set storage_backend="s3" to use) ----------
s3_endpoint_url: str | None = None
s3_bucket: str | None = None
+20 -1
View File
@@ -7,7 +7,7 @@ are bound to these ports at the composition root (``app.api.deps``).
import datetime as dt
import uuid
from collections.abc import AsyncIterator
from collections.abc import AsyncIterator, Iterator
from contextlib import AbstractAsyncContextManager
from pathlib import Path
from typing import Protocol
@@ -23,6 +23,7 @@ from app.domain.entities import (
User,
)
from app.domain.entities.track import Artist, Track
from app.domain.sources import SourceFile, SourceInfo
from app.domain.tokens import IssuedToken, TokenClaims, TokenType
@@ -221,3 +222,21 @@ class HistoryRepository(Protocol):
self, *, user_id: uuid.UUID, limit: int, offset: int
) -> list[PlayHistoryEntry]: ...
async def count(self, *, user_id: uuid.UUID) -> int: ...
class SourceBackend(Protocol):
"""A registered source of tracks (mounted folder, YouTube, …).
``name`` is the stable identifier used in URLs and stored on ``track.source``.
"""
name: str
def info(self) -> SourceInfo: ...
def is_available(self) -> bool: ...
class IndexableSource(SourceBackend, Protocol):
"""A source that enumerates files already on disk (e.g. the local folder)."""
def scan(self) -> Iterator[SourceFile]: ...
+39
View File
@@ -0,0 +1,39 @@
"""Source-backend value objects — framework-free.
A *source* is a place tracks come from (a mounted folder, YouTube, an upload).
Backends are driven adapters (``app.infrastructure.sources``); these are the
shapes they speak in, and the ports they satisfy live in ``app.domain.ports``.
The first backend, ``local``, is *indexable*: it enumerates files already on
disk. Concrete metadata (artist/album/tags) is intentionally **not** resolved
here — a source yields a file plus a minimal title; enrichment (plan §6.2) fills
the rest later, so this stays a thin discovery layer (CLAUDE.md: no duplicated
business logic)."""
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True, slots=True)
class SourceInfo:
"""Describes a registered source for enumeration / health (UI, admin)."""
name: str
label: str
kind: str # "indexable" (more kinds — search/download — arrive with youtube)
available: bool
@dataclass(frozen=True, slots=True)
class SourceFile:
"""A single importable file discovered by an indexable source.
``source_id`` is stable per source (the local backend uses the path relative
to its root) so re-scans are idempotent — already-imported files are skipped.
"""
source_id: str
path: Path
suggested_title: str
file_format: str
file_size: int
+1
View File
@@ -0,0 +1 @@
"""Source backends — driven adapters that discover/fetch tracks."""
@@ -0,0 +1,60 @@
"""``local`` source — indexes audio files from a mounted folder.
Walks a configured root directory and yields each audio file as a
:class:`SourceFile`. It does **not** parse tags or resolve artist/album — that's
enrichment's job (plan §6.2); this stays a thin discovery layer. ``source_id``
is the path relative to the root, so re-scans are idempotent.
"""
import os
from collections.abc import Iterator
from pathlib import Path
from app.domain.sources import SourceFile, SourceInfo
from app.infrastructure.db.models.enums import TrackSource
# Extensions we treat as audio. Mirrors the formats StreamingService serves.
_AUDIO_EXTENSIONS = frozenset(
{"mp3", "flac", "m4a", "aac", "ogg", "opus", "wav", "wma", "aiff", "aif", "alac"}
)
class LocalFolderSource:
"""Implements :class:`app.domain.ports.IndexableSource`."""
name = TrackSource.LOCAL.value
def __init__(self, root: Path) -> None:
self._root = root
def info(self) -> SourceInfo:
return SourceInfo(
name=self.name,
label="Local folder",
kind="indexable",
available=self.is_available(),
)
def is_available(self) -> bool:
return self._root.is_dir()
def scan(self) -> Iterator[SourceFile]:
if not self.is_available():
return
for dirpath, _dirnames, filenames in os.walk(self._root):
for filename in sorted(filenames):
ext = Path(filename).suffix.lower().lstrip(".")
if ext not in _AUDIO_EXTENSIONS:
continue
path = Path(dirpath) / filename
try:
size = path.stat().st_size
except OSError:
continue # vanished/unreadable between walk and stat → skip
yield SourceFile(
source_id=path.relative_to(self._root).as_posix(),
path=path,
suggested_title=path.stem or "Unknown",
file_format=ext,
file_size=size,
)
+41
View File
@@ -0,0 +1,41 @@
"""Source registry — selection + enumeration of configured backends.
Built from settings at the composition root. Only sources that are configured
are registered (e.g. ``local`` appears only when ``LOCAL_MEDIA_IMPORT_PATH`` is
set), so enumeration reflects what the instance can actually use.
"""
from typing import cast
from app.core.config import Settings
from app.domain.errors import NotFoundError, ValidationError
from app.domain.ports import IndexableSource, SourceBackend
from app.domain.sources import SourceInfo
from app.infrastructure.sources.local_folder import LocalFolderSource
class SourceRegistry:
def __init__(self, backends: list[SourceBackend]) -> None:
self._by_name = {backend.name: backend for backend in backends}
def get(self, name: str) -> SourceBackend:
backend = self._by_name.get(name)
if backend is None:
raise NotFoundError(f"Source {name!r} is not configured.")
return backend
def indexable(self, name: str) -> IndexableSource:
backend = self.get(name)
if not hasattr(backend, "scan"):
raise ValidationError(f"Source {name!r} cannot be indexed.")
return cast(IndexableSource, backend)
def infos(self) -> list[SourceInfo]:
return [backend.info() for backend in self._by_name.values()]
def build_source_registry(settings: Settings) -> SourceRegistry:
backends: list[SourceBackend] = []
if settings.local_media_import_path is not None:
backends.append(LocalFolderSource(settings.local_media_import_path))
return SourceRegistry(backends)
+2 -5
View File
@@ -10,6 +10,7 @@ from arq.connections import RedisSettings
from app.core.config import get_settings
from app.core.logging import configure_logging, get_logger
from app.workers.tasks.import_task import scan_local_folder
log = get_logger("worker")
@@ -24,12 +25,8 @@ async def shutdown(_ctx: dict[str, Any]) -> None:
log.info("worker_shutdown")
async def _noop(_ctx: dict[str, Any]) -> None:
pass
class WorkerSettings:
functions: ClassVar[list[Any]] = [_noop] # populated as tasks are implemented
functions: ClassVar[list[Any]] = [scan_local_folder]
on_startup = startup
on_shutdown = shutdown
max_jobs = get_settings().max_parallel_downloads
+30
View File
@@ -0,0 +1,30 @@
"""Enqueue helper — submit a job to the arq queue from the request cycle.
A short-lived pool per call keeps things simple (enqueues are rare, admin-driven
actions). Redis being down degrades to a clean 503 rather than a crash
(graceful degradation)."""
from typing import Any
from arq import create_pool
from arq.connections import RedisSettings
from app.core.config import get_settings
from app.domain.errors import DependencyUnavailableError
async def enqueue(function: str, **kwargs: Any) -> str:
"""Enqueue ``function`` by name, returning the job id. Raises
:class:`DependencyUnavailableError` if the queue can't be reached."""
settings = get_settings()
try:
pool = await create_pool(RedisSettings.from_dsn(str(settings.redis_url)))
except Exception as exc:
raise DependencyUnavailableError("Task queue (Redis) is unavailable.") from exc
try:
job = await pool.enqueue_job(function, **kwargs)
finally:
await pool.aclose()
if job is None:
raise DependencyUnavailableError("Could not enqueue job.")
return str(job.job_id)
+1
View File
@@ -0,0 +1 @@
"""arq task functions. Registered in ``app.workers.arq_worker.WorkerSettings``."""
+46
View File
@@ -0,0 +1,46 @@
"""arq task: scan an indexable source and import its files into the library.
Heavy work (directory walk + file copies) belongs off the request cycle
(CLAUDE.md). The HTTP endpoint enqueues this; the worker runs it with its own
transactional session.
"""
import uuid
from typing import Any
from app.application.import_service import LibraryImportService
from app.core.config import get_settings
from app.core.logging import get_logger
from app.infrastructure.db import session_scope
from app.infrastructure.db.repositories import (
SqlAlchemyArtistRepository,
SqlAlchemyTrackRepository,
)
from app.infrastructure.sources.registry import build_source_registry
from app.infrastructure.storage.provider import get_file_storage
log = get_logger("worker.import")
async def scan_local_folder(
_ctx: dict[str, Any], *, source: str = "local", added_by: str | None = None
) -> dict[str, Any]:
registry = build_source_registry(get_settings())
backend = registry.indexable(source)
actor = uuid.UUID(added_by) if added_by else None
async with session_scope() as session:
service = LibraryImportService(
tracks=SqlAlchemyTrackRepository(session),
artists=SqlAlchemyArtistRepository(session),
storage=get_file_storage(),
)
summary = await service.scan_and_import(backend, added_by=actor)
return {
"source": summary.source,
"seen": summary.seen,
"imported": summary.imported,
"skipped": summary.skipped,
"failed": summary.failed,
}