"""Async background worker for post-turn jobs (T22). The turn flow records a ``memory_written`` event synchronously on the request path so the timeline updates immediately. Significance scoring is a separate classifier round-trip that we don't want to block on, so the turn handler enqueues a :class:`SignificanceJob` here and the worker drains the queue out-of-band. A single :class:`BackgroundWorker` is started/stopped via FastAPI lifespan in :mod:`chat.app`. The worker owns its own ``asyncio.Queue`` and runs exactly one task that pulls jobs off the queue, calls :func:`chat.services.significance.compute_significance`, and writes ``memory_significance_set`` (and on score 3, ``memory_pin_changed``) events. Each job opens its own DB connection — workers and request handlers don't share connections. Failures inside ``_process`` are logged and swallowed: a flaky classifier shouldn't take down the worker. Tests can disable enqueue() by setting ``BackgroundWorker.enabled = False`` (e.g. in the existing turn-flow fixture, which doesn't have a usable LLM key for the lifespan-managed factory). """ from __future__ import annotations import asyncio import logging from dataclasses import dataclass from typing import Callable from chat.config import Settings from chat.db.connection import open_db from chat.eventlog.log import append_and_apply from chat.llm.client import LLMClient from chat.services.backup import ( prune_backups, should_take_backup, take_backup, ) from chat.services.significance import compute_significance from chat.services.snapshot import ( prune_periodic_snapshots, should_take_periodic_snapshot, take_snapshot, ) # T32: tick-loop wake interval. 60s gives a single backup window per # target hour with plenty of slack: should_take_backup's 23h freshness # guard prevents back-to-back runs. BACKUP_TICK_INTERVAL_SECONDS = 60.0 log = logging.getLogger(__name__) @dataclass class SignificanceJob: """One unit of work for the background worker. ``host_bot_id`` is the memory's owner — used both for the auto-pin soft cap query and as the eventual scope for the soft-cap eviction. """ memory_id: int narrative_text: str prior_dialogue: list[dict] host_bot_id: str class BackgroundWorker: """asyncio.Queue-backed single-worker task. Started on app startup; ``stop()`` enqueues a sentinel and awaits the task so any in-flight job has a chance to finish. Pending jobs after the sentinel are dropped on shutdown — Phase 1 simplification. """ def __init__( self, settings: Settings, llm_client_factory: Callable[[], LLMClient], *, enabled: bool = True, ) -> None: self._settings = settings self._llm_client_factory = llm_client_factory self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue() self._task: asyncio.Task | None = None # T32: nightly-backup tick loop runs alongside the job loop. The # event is set by stop() to wake the loop early so shutdown is # snappy even mid-tick. self._tick_task: asyncio.Task | None = None self._tick_stop: asyncio.Event = asyncio.Event() self.enabled = enabled async def start(self) -> None: if self._task is not None: return self._task = asyncio.create_task(self._run()) self._tick_task = asyncio.create_task(self._tick_loop()) async def stop(self) -> None: # Stop the tick loop first — it has no in-flight work to drain, # so signalling early lets it exit while the job loop is still # finishing its sentinel handoff. self._tick_stop.set() if self._tick_task is not None: await self._tick_task self._tick_task = None if self._task is None: return await self._queue.put(None) # sentinel await self._task self._task = None def enqueue(self, job: SignificanceJob) -> None: if not self.enabled: return self._queue.put_nowait(job) async def _run(self) -> None: while True: job = await self._queue.get() if job is None: return try: await self._process(job) except Exception as exc: # noqa: BLE001 — worker must not die log.exception("significance job failed: %s", exc) async def _tick_loop(self) -> None: """Periodic-operations loop (T32 nightly backup). Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and asks :func:`should_take_backup` whether a backup is due. The scheduling decision lives in the backup module so we don't duplicate the "is it 03:00?" logic here. Failures are caught and logged so a flaky disk doesn't kill the loop — the next tick will retry. Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that :meth:`stop` can interrupt a sleeping tick instead of having to wait the full interval. """ while not self._tick_stop.is_set(): try: if should_take_backup(self._settings.data_dir): take_backup( db_path=self._settings.db_path, data_dir=self._settings.data_dir, ) prune_backups(self._settings.data_dir, keep=14) log.info("nightly backup taken") except Exception as exc: # noqa: BLE001 — never break the loop log.exception("backup tick failed: %s", exc) try: await asyncio.wait_for( self._tick_stop.wait(), timeout=BACKUP_TICK_INTERVAL_SECONDS, ) except asyncio.TimeoutError: # Normal path: timed out waiting for stop, run another tick. pass async def _process(self, job: SignificanceJob) -> None: client = self._llm_client_factory() score = await compute_significance( client, model=self._settings.classifier_model, narrative_text=job.narrative_text, prior_dialogue=job.prior_dialogue, ) with open_db(self._settings.db_path) as conn: append_and_apply( conn, kind="memory_significance_set", payload={ "memory_id": job.memory_id, "significance": score, }, ) if score >= 3: _auto_pin_with_cap( conn, owner_id=job.host_bot_id, memory_id=job.memory_id, ) # T31: piggy-back the periodic snapshot check on the background # worker so we don't need a separate timer task. The classifier # pass already runs out-of-band, so snapshot I/O on the same # worker is a natural fit. Each snapshot opens its own # connection so we don't conflate the snapshot's read-only view # with the significance-write transaction above. Failures are # caught and logged: a flaky disk shouldn't take down the # significance pipeline. try: with open_db(self._settings.db_path) as conn: if should_take_periodic_snapshot( conn, self._settings.data_dir ): snapshot_path = take_snapshot( conn, data_dir=self._settings.data_dir, kind="periodic", ) prune_periodic_snapshots( self._settings.data_dir, keep=5 ) log.info( "periodic snapshot taken: %s", snapshot_path ) except Exception as exc: # noqa: BLE001 — never break the worker log.exception("periodic snapshot failed: %s", exc) def _auto_pin_with_cap( conn, *, owner_id: str, memory_id: int, cap: int = 8, ) -> None: """Auto-pin ``memory_id`` and evict the oldest auto-pin if over ``cap``. Per §8.5: pivotal turns are auto-pinned, with a soft cap of 8 pins per bot. When the cap is exceeded the oldest auto-pin is unpinned (manual pins are never auto-evicted — we filter on ``auto_pinned = 1``). """ append_and_apply( conn, kind="memory_pin_changed", payload={ "memory_id": memory_id, "pinned": 1, "auto_pinned": 1, }, ) cur = conn.execute( "SELECT COUNT(*) FROM memories WHERE owner_id = ? AND pinned = 1", (owner_id,), ) count = cur.fetchone()[0] if count <= cap: return cur = conn.execute( "SELECT id FROM memories " "WHERE owner_id = ? AND pinned = 1 AND auto_pinned = 1 AND id != ? " "ORDER BY created_at ASC, id ASC LIMIT 1", (owner_id, memory_id), ) row = cur.fetchone() if row is None: return append_and_apply( conn, kind="memory_pin_changed", payload={ "memory_id": row[0], "pinned": 0, "auto_pinned": 0, }, )