fix(import): decouple scan from HTTP connection, prevent duplicate imports

- Add scan_manager: background asyncio task + Redis event store so scans
  survive UI navigation; SSE stream reads from Redis and is reconnectable
- Replace SSE-tied scan endpoint with POST /nc-scan/start + GET /nc-scan/stream
- Fix frontend: AbortController + useEffect cleanup cancels stream on unmount
  without stopping the server-side scan
- Add unique constraint on audio_versions.nc_file_path (migration 0009) to
  prevent duplicate imports from concurrent scans; handle IntegrityError
  gracefully in nc_scan with rollback + skip
- Fix API health check: use plain python instead of uv (not in dev image)
- Optimize Taskfile: fix duplicate dev:restart, add dev:fresh/dev:rebuild/
  dev:status, migrate uses run --rm, check includes typecheck

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mistral Vibe
2026-04-12 22:35:55 +02:00
parent b2d6b4d113
commit 15bc51603b
8 changed files with 349 additions and 147 deletions

View File

@@ -0,0 +1,36 @@
"""Add unique constraint on audio_versions.nc_file_path.
Prevents duplicate imports when concurrent scans race on the same file.
Revision ID: 0009_av_nc_path_uq
Revises: 0008_drop_nc_columns
Create Date: 2026-04-12
"""
from alembic import op

# Revision identifiers, used by Alembic to order migrations.
revision = "0009_av_nc_path_uq"
down_revision = "0008_drop_nc_columns"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Deduplicate audio_versions.nc_file_path, then enforce uniqueness.

    Duplicate rows must be removed before the constraint can be created;
    the earliest-uploaded version is kept for each path.
    """
    # DISTINCT ON is PostgreSQL-specific: for each nc_file_path it selects the
    # first row in ORDER BY order (oldest uploaded_at), and every other row
    # sharing that path is deleted.
    dedupe_sql = """
    DELETE FROM audio_versions
    WHERE id NOT IN (
        SELECT DISTINCT ON (nc_file_path) id
        FROM audio_versions
        ORDER BY nc_file_path, uploaded_at ASC
    )
    """
    op.execute(dedupe_sql)
    op.create_unique_constraint(
        "uq_audio_version_nc_file_path",
        "audio_versions",
        ["nc_file_path"],
    )
def downgrade() -> None:
    """Drop the uniqueness guarantee, allowing duplicate nc_file_path rows again."""
    op.drop_constraint(
        "uq_audio_version_nc_file_path",
        "audio_versions",
        type_="unique",
    )

View File

@@ -277,7 +277,7 @@ class AudioVersion(Base):
)
version_number: Mapped[int] = mapped_column(Integer, nullable=False)
label: Mapped[str | None] = mapped_column(String(255))
nc_file_path: Mapped[str] = mapped_column(Text, nullable=False)
nc_file_path: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
nc_file_etag: Mapped[str | None] = mapped_column(String(255))
cdn_hls_base: Mapped[str | None] = mapped_column(Text)
waveform_url: Mapped[str | None] = mapped_column(Text)

View File

@@ -1,3 +1,4 @@
import asyncio
import json
import logging
import uuid
@@ -8,8 +9,7 @@ from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from rehearsalhub.config import get_settings
from rehearsalhub.db.engine import get_session, get_session_factory
from rehearsalhub.queue.redis_queue import flush_pending_pushes
from rehearsalhub.db.engine import get_session
from rehearsalhub.db.models import Member
from rehearsalhub.dependencies import get_current_member
from rehearsalhub.repositories.band import BandRepository
@@ -21,6 +21,7 @@ from rehearsalhub.schemas.comment import SongCommentCreate, SongCommentRead
from rehearsalhub.schemas.song import SongCreate, SongRead, SongUpdate
from rehearsalhub.services.band import BandService
from rehearsalhub.services.nc_scan import scan_band_folder
from rehearsalhub.services.scan_manager import get_events, is_scanning, start_scan
from rehearsalhub.services.song import SongService
from rehearsalhub.storage.factory import StorageFactory
@@ -175,45 +176,60 @@ async def _get_band_and_assert_member(
return band
@router.get("/bands/{band_id}/nc-scan/stream")
async def scan_nextcloud_stream(
@router.post("/bands/{band_id}/nc-scan/start", status_code=202)
async def scan_nextcloud_start(
band_id: uuid.UUID,
session: AsyncSession = Depends(get_session),
current_member: Member = Depends(_member_from_request),
):
"""
SSE endpoint: streams scan progress as newline-delimited JSON events.
Each event is a JSON object on its own line.
Accepts ?token= for EventSource clients that can't set headers.
Start a background scan. Returns 202 immediately; progress is streamed via
/nc-scan/stream. Returns 409 if a scan is already running for this band.
"""
band = await _get_band_and_assert_member(band_id, current_member, session)
bs = await BandStorageRepository(session).get_active_for_band(band_id)
band_folder = (bs.root_path if bs and bs.root_path else None) or f"bands/{band.slug}/"
member_id = current_member.id
settings = get_settings()
try:
await start_scan(band_id, band_folder, current_member.id, get_settings())
except LookupError as exc:
raise HTTPException(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=str(exc))
except RuntimeError:
raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail="Scan already in progress")
return {"status": "started"}
@router.get("/bands/{band_id}/nc-scan/stream")
async def scan_nextcloud_stream(
band_id: uuid.UUID,
cursor: int = Query(default=0, ge=0),
session: AsyncSession = Depends(get_session),
current_member: Member = Depends(_member_from_request),
):
"""
Stream scan events as newline-delimited JSON. Reads from Redis so this
endpoint is independent of the scan's lifecycle — safe to reconnect after
navigating away. Pass ?cursor=N to resume from event index N.
"""
await _get_band_and_assert_member(band_id, current_member, session)
async def event_generator():
async with get_session_factory()() as db:
try:
storage = await StorageFactory.create(db, band_id, settings)
async for event in scan_band_folder(db, storage, band_id, band_folder, member_id):
yield json.dumps(event) + "\n"
if event.get("type") in ("song", "session"):
await db.commit()
await flush_pending_pushes(db)
except LookupError as exc:
yield json.dumps({"type": "error", "message": str(exc)}) + "\n"
except Exception:
log.exception("SSE scan error for band %s", band_id)
yield json.dumps({"type": "error", "message": "Scan failed due to an internal error."}) + "\n"
finally:
await db.commit()
await flush_pending_pushes(db)
idx = cursor
while True:
events = await get_events(band_id, start=idx)
for event in events:
yield json.dumps(event) + "\n"
idx += 1
if event.get("type") in ("done", "error"):
return
if not events:
scanning = await is_scanning(band_id)
if not scanning:
return
await asyncio.sleep(0.3)
return StreamingResponse(
event_generator(),
media_type="application/x-ndjson",
)
return StreamingResponse(event_generator(), media_type="application/x-ndjson")
@router.post("/bands/{band_id}/nc-scan", response_model=NcScanResult)

View File

@@ -12,6 +12,7 @@ import logging
from collections.abc import AsyncGenerator
from pathlib import Path
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from rehearsalhub.repositories.audio_version import AudioVersionRepository
@@ -179,6 +180,13 @@ async def scan_band_folder(
)
yield {"type": "song", "song": read.model_dump(mode="json"), "is_new": is_new}
except IntegrityError:
# Unique constraint on nc_file_path — another concurrent scan already
# imported this file. Roll back the savepoint and treat as skipped.
await db_session.rollback()
log.debug("scan: concurrent import collision on '%s', skipping", nc_file_path)
skipped += 1
yield {"type": "skipped", "path": nc_file_path, "reason": "already imported"}
except Exception as exc:
log.error("Failed to import '%s': %s", nc_file_path, exc, exc_info=True)
skipped += 1

View File

@@ -0,0 +1,126 @@
"""Background scan manager.
Runs nc_scan.scan_band_folder as an asyncio task independent of any HTTP
connection. Events are pushed to a Redis list so the SSE endpoint can read
them whether or not the original requester is still connected.
Redis keys (all expire after EVENTS_TTL_SECONDS):
scan:{band_id}:status — "running" | "done" | "failed" (string)
scan:{band_id}:events — list of JSON-encoded event dicts (rpush / lrange)
"""
from __future__ import annotations
import asyncio
import contextlib
import json
import logging
import uuid
import redis.asyncio as aioredis
from rehearsalhub.config import get_settings
log = logging.getLogger(__name__)
# How long status/event keys live in Redis after their last refresh.
# NOTE(review): a scan running longer than this could see its "running" status
# expire mid-scan — confirm scans always finish well within the TTL.
EVENTS_TTL_SECONDS = 3600  # events visible for 1 hour after scan completes
# NOTE(review): these key templates appear unused — _status_key()/_events_key()
# below build the same keys with f-strings. Consider removing one of the two.
_STATUS_KEY = "scan:{band_id}:status"
_EVENTS_KEY = "scan:{band_id}:events"
# In-process task registry — prevents duplicate scans within the same worker pod.
_running: dict[str, asyncio.Task] = {}
def _status_key(band_id: uuid.UUID) -> str:
return f"scan:{band_id}:status"
def _events_key(band_id: uuid.UUID) -> str:
return f"scan:{band_id}:events"
async def _get_redis() -> aioredis.Redis:
    """Open a fresh async Redis client configured from app settings.

    decode_responses=True means all values come back as str, not bytes.
    Callers are responsible for closing the client with ``aclose()``.
    """
    settings = get_settings()
    return aioredis.from_url(settings.redis_url, decode_responses=True)
async def is_scanning(band_id: uuid.UUID) -> bool:
    """Return True if a scan for *band_id* is currently marked "running".

    Opens a short-lived Redis connection; the connection is closed even when
    the status lookup raises (the original leaked it on error).
    """
    r = await _get_redis()
    try:
        status = await r.get(_status_key(band_id))
    finally:
        await r.aclose()
    return status == "running"
async def get_events(band_id: uuid.UUID, start: int = 0) -> list[dict]:
    """Return scan events from index *start* onwards (0-based).

    Events are stored as JSON strings in a Redis list. Entries that fail to
    decode are skipped rather than aborting the read; any non-JSON failure
    now propagates instead of being silently swallowed.

    NOTE(review): skipping an undecodable entry shifts the returned list
    relative to Redis list indices — a caller advancing a cursor by one per
    returned event would re-read the bad slot. Confirm entries are always
    valid JSON (they are written by json.dumps in _run_scan).
    """
    r = await _get_redis()
    try:
        raw = await r.lrange(_events_key(band_id), start, -1)
    finally:
        # Close the connection even if lrange raises (original leaked it).
        await r.aclose()
    events: list[dict] = []
    for item in raw:
        # Narrowed from suppress(Exception): only malformed JSON is skipped.
        with contextlib.suppress(json.JSONDecodeError):
            events.append(json.loads(item))
    return events
async def start_scan(
    band_id: uuid.UUID,
    band_folder: str,
    member_id: uuid.UUID,
    settings,
) -> None:
    """Launch a background scan task for *band_id*.

    Clears any previous event list, marks the scan "running" in Redis, and
    registers the task in the in-process registry.

    Raises:
        RuntimeError: if a scan for this band is already running in this worker.
    """
    key = str(band_id)
    existing = _running.get(key)
    if existing and not existing.done():
        raise RuntimeError("Scan already in progress")
    # Reset prior state in Redis before the new task starts emitting events.
    r = await _get_redis()
    try:
        await r.delete(_events_key(band_id))
        await r.set(_status_key(band_id), "running", ex=EVENTS_TTL_SECONDS)
    finally:
        # Close the connection even on error (original leaked it on failure).
        await r.aclose()
    # Re-check after the awaits above: another coroutine could have started a
    # scan while we were talking to Redis. The check, create_task and registry
    # assignment below are synchronous, so no second scan can slip in between.
    existing = _running.get(key)
    if existing and not existing.done():
        raise RuntimeError("Scan already in progress")
    task = asyncio.create_task(_run_scan(band_id, band_folder, member_id, settings))
    _running[key] = task
async def _run_scan(
    band_id: uuid.UUID,
    band_folder: str,
    member_id: uuid.UUID,
    settings,
) -> None:
    """Execute the scan for *band_id*, pushing progress events to Redis.

    Runs as a detached asyncio task independent of any HTTP connection:
    commits after each imported song/session event, flushes queued pushes,
    and records a terminal "done"/"failed" status. The Redis client is always
    closed and the task-registry entry always cleared.
    """
    # Imported lazily so module import stays cheap and cycle-free.
    from rehearsalhub.db.engine import get_session_factory
    from rehearsalhub.queue.redis_queue import flush_pending_pushes
    from rehearsalhub.services.nc_scan import scan_band_folder
    from rehearsalhub.storage.factory import StorageFactory

    r = await _get_redis()
    events_key = _events_key(band_id)
    status_key = _status_key(band_id)

    async def push(event: dict) -> None:
        # Append the event and refresh the TTL so an active scan's events
        # never expire mid-stream.
        await r.rpush(events_key, json.dumps(event))
        await r.expire(events_key, EVENTS_TTL_SECONDS)

    try:
        async with get_session_factory()() as db:
            storage = await StorageFactory.create(db, band_id, settings)
            async for event in scan_band_folder(db, storage, band_id, band_folder, member_id):
                await push(event)
                # Commit incrementally so a crash keeps already-imported rows.
                if event.get("type") in ("song", "session"):
                    await db.commit()
                    await flush_pending_pushes(db)
            await db.commit()
            await flush_pending_pushes(db)
        await r.set(status_key, "done", ex=EVENTS_TTL_SECONDS)
        log.info("Background scan completed for band %s", band_id)
    except Exception:
        log.exception("Background scan failed for band %s", band_id)
        # Best-effort error report: if Redis itself is the failure cause,
        # these calls would raise again and mask the cleanup below.
        with contextlib.suppress(Exception):
            await push({"type": "error", "message": "Scan failed due to an internal error."})
            await r.set(status_key, "failed", ex=EVENTS_TTL_SECONDS)
    finally:
        await r.aclose()
        _running.pop(str(band_id), None)