fix: resolve job-not-found race and YYMMDD scan folder structure

Race condition (worker "Job not found in DB"): - RedisJobQueue.enqueue() was pushing job IDs to Redis immediately after flush() but before the API transaction committed, so the worker would read an ID that didn't exist yet in the DB from its own session. - Fix: defer the Redis rpush until after session.commit() via a pending-push list drained by get_session() after each successful commit. - Worker: drain stale Redis queue entries on startup to clear any IDs left over from previously uncommitted transactions. - Worker: add 3-attempt retry with 200ms sleep when a job is not found, as a safety net for any remaining propagation edge cases. NC scan folder structure (YYMMDD rehearsal subfolders): - Previously used dir_name as song title for all files in a subdirectory, meaning every file got the folder name (e.g. "231015") as its title. - Fix: derive song title from Path(sub_rel).stem so each audio file gets its own name; use the file's parent path as nc_folder for version grouping. - Rehearsal folder name stored in song.notes as "Rehearsal: YYMMDD". - Added structured logging throughout the scan: entries found, per-folder file counts, skip/create/import decisions, and final summary count. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 21:58:46 +01:00
parent f7be1b994d
commit b28472c32f
4 changed files with 103 additions and 40 deletions
--- a/worker/src/worker/main.py
+++ b/worker/src/worker/main.py
@@ -116,6 +116,13 @@ async def main() -> None:
    session_factory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
    redis = aioredis.from_url(settings.redis_url, decode_responses=True)

+    # Drain stale job IDs left in Redis from previous runs whose API transactions
+    # were never committed (e.g. crashed processes).
+    stale = await redis.llen(settings.job_queue_key)
+    if stale:
+        log.warning("Draining %d stale job IDs from Redis queue before starting", stale)
+        await redis.delete(settings.job_queue_key)
+
    log.info("Worker started. Listening for jobs on %s", settings.job_queue_key)

    while True:
@@ -125,11 +132,21 @@ async def main() -> None:
                continue
            _, raw_id = result
            job_id = uuid.UUID(raw_id)
+            log.info("Dequeued job %s", job_id)

            async with session_factory() as session:
-                job = await session.get(JobModel, job_id)
+                # Brief retry: the deferred Redis push fires right after API commit,
+                # so a tiny propagation delay is still possible.
+                job = None
+                for _attempt in range(3):
+                    job = await session.get(JobModel, job_id)
+                    if job is not None:
+                        break
+                    await asyncio.sleep(0.2)
+                    await session.expire_all()
+
                if job is None:
-                    log.warning("Job %s not found in DB", job_id)
+                    log.warning("Job %s not found in DB after retries — discarding", job_id)
                    continue

                job.status = "running"
@@ -139,18 +156,20 @@ async def main() -> None:

                handler = HANDLERS.get(job.type)
                if handler is None:
-                    log.error("Unknown job type: %s", job.type)
+                    log.error("Job %s has unknown type '%s' — marking failed", job_id, job.type)
                    job.status = "failed"
                    job.error = f"Unknown job type: {job.type}"
                    job.finished_at = datetime.now(timezone.utc)
                    await session.commit()
                    continue

+                log.info("Running job %s type=%s payload=%s", job_id, job.type, job.payload)
                try:
                    await handler(job.payload, session, settings)
                    job.status = "done"
                    job.finished_at = datetime.now(timezone.utc)
                    await session.commit()
+                    log.info("Job %s done", job_id)
                except Exception as exc:
                    log.exception("Job %s failed: %s", job_id, exc)
                    job.status = "failed"