chat/chat/services/background.py

"""Async background worker for post-turn jobs (T22).

The turn flow records a ``memory_written`` event synchronously on the
request path so the timeline updates immediately. Significance scoring is
a separate classifier round-trip that we don't want to block on, so the
turn handler enqueues a :class:`SignificanceJob` here and the worker
drains the queue out-of-band.

A single :class:`BackgroundWorker` is started/stopped via FastAPI lifespan
in :mod:`chat.app`. The worker owns its own ``asyncio.Queue`` and runs
one job task that pulls jobs off the queue, calls
:func:`chat.services.significance.compute_significance`, and writes
``memory_significance_set`` (and on a score of 3 or higher,
``memory_pin_changed``) events. Each job opens its own DB connection —
workers and request handlers don't share connections. After each job the
same task also checks whether a periodic snapshot is due (T31), and a
second task runs the nightly-backup tick loop (T32).

Failures inside ``_process`` are logged and swallowed: a flaky classifier
shouldn't take down the worker. Tests can disable enqueue() by setting
``BackgroundWorker.enabled = False`` (e.g. in the existing turn-flow
fixture, which doesn't have a usable LLM key for the lifespan-managed
factory).
"""

from __future__ import annotations

import asyncio
import logging
from dataclasses import dataclass
from typing import Callable

from chat.config import Settings
from chat.db.connection import open_db
from chat.eventlog.log import append_and_apply
from chat.llm.client import LLMClient
from chat.services.backup import (
    prune_backups,
    should_take_backup,
    take_backup,
)
from chat.services.significance import compute_significance
from chat.services.snapshot import (
    prune_periodic_snapshots,
    should_take_periodic_snapshot,
    take_snapshot,
)

# T32: how long, in seconds, the nightly-backup tick loop waits between
# checks. 60s gives a single backup window per target hour with plenty of
# slack: should_take_backup's 23h freshness guard prevents back-to-back
# runs.
BACKUP_TICK_INTERVAL_SECONDS = 60.0

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


@dataclass
class SignificanceJob:
    """Everything the background worker needs to score one memory.

    Instances are handed to :meth:`BackgroundWorker.enqueue` and consumed
    by the worker's job loop.
    """

    # Id of the memory being scored; written into the
    # ``memory_significance_set`` event payload and used as the auto-pin
    # target.
    memory_id: int
    # Text passed to the significance classifier as ``narrative_text``.
    narrative_text: str
    # Dialogue context passed to the classifier as ``prior_dialogue``.
    # NOTE(review): the per-entry dict schema is not visible here — confirm
    # against compute_significance.
    prior_dialogue: list[dict]
    # The memory's owner — used as the scope for both the auto-pin soft-cap
    # count and the soft-cap eviction.
    host_bot_id: str


class BackgroundWorker:
    """asyncio.Queue-backed job worker plus a periodic tick task.

    :meth:`start` spawns two tasks: a job loop that drains the queue one
    :class:`SignificanceJob` at a time, and a tick loop that checks
    whether a nightly backup is due. :meth:`stop` signals the tick loop,
    then enqueues a sentinel and awaits the job task so any in-flight job
    has a chance to finish. Jobs enqueued behind the sentinel are not
    processed during shutdown — Phase 1 simplification.

    The worker can be started again after :meth:`stop`.
    """

    def __init__(
        self,
        settings: Settings,
        llm_client_factory: Callable[[], LLMClient],
        *,
        enabled: bool = True,
    ) -> None:
        """Store configuration; no tasks are created until :meth:`start`.

        Args:
            settings: Application settings; this class reads ``db_path``,
                ``data_dir`` and ``classifier_model`` from it.
            llm_client_factory: Zero-argument callable invoked once per
                job to obtain the classifier client.
            enabled: When ``False``, :meth:`enqueue` silently drops jobs.
        """
        self._settings = settings
        self._llm_client_factory = llm_client_factory
        self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue()
        self._task: asyncio.Task | None = None
        # T32: nightly-backup tick loop runs alongside the job loop. The
        # event is set by stop() to wake the loop early so shutdown is
        # snappy even mid-tick.
        self._tick_task: asyncio.Task | None = None
        self._tick_stop: asyncio.Event = asyncio.Event()
        self.enabled = enabled

    async def start(self) -> None:
        """Spawn the job loop and the tick loop; no-op if already running."""
        if self._task is not None:
            return
        # stop() leaves the event set. Without clearing it, the tick loop
        # of a restarted worker would see the stale stop signal and exit
        # on its very first check, silently disabling nightly backups.
        self._tick_stop.clear()
        self._task = asyncio.create_task(self._run())
        self._tick_task = asyncio.create_task(self._tick_loop())

    async def stop(self) -> None:
        """Shut down both tasks, letting an in-flight job finish first."""
        # Stop the tick loop first — it has no in-flight work to drain,
        # so signalling early lets it exit while the job loop is still
        # finishing its sentinel handoff.
        self._tick_stop.set()
        if self._tick_task is not None:
            await self._tick_task
            self._tick_task = None
        if self._task is None:
            return
        await self._queue.put(None)  # sentinel
        await self._task
        self._task = None

    def enqueue(self, job: SignificanceJob) -> None:
        """Queue ``job`` for background scoring, unless the worker is disabled."""
        if not self.enabled:
            return
        self._queue.put_nowait(job)

    async def _run(self) -> None:
        """Job loop: process queued jobs in order until the sentinel arrives.

        Any exception raised while processing a job is logged and
        swallowed so one bad job cannot kill the worker.
        """
        while True:
            job = await self._queue.get()
            if job is None:
                # Sentinel from stop(): exit without touching later jobs.
                return
            try:
                await self._process(job)
            except Exception as exc:  # noqa: BLE001 — worker must not die
                log.exception("significance job failed: %s", exc)

    async def _tick_loop(self) -> None:
        """Periodic-operations loop (T32 nightly backup).

        Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and
        asks :func:`should_take_backup` whether a backup is due. The
        scheduling decision lives in the backup module so we don't
        duplicate the "is it 03:00?" logic here. Failures are caught
        and logged so a flaky disk doesn't kill the loop — the next
        tick will retry.

        Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that
        :meth:`stop` can interrupt a sleeping tick instead of having to
        wait the full interval.
        """
        while not self._tick_stop.is_set():
            try:
                # NOTE(review): these backup helpers are called without
                # ``await``, so they presumably run synchronously on the
                # event loop — confirm they are quick enough, or offload.
                if should_take_backup(self._settings.data_dir):
                    take_backup(
                        db_path=self._settings.db_path,
                        data_dir=self._settings.data_dir,
                    )
                    prune_backups(self._settings.data_dir, keep=14)
                    log.info("nightly backup taken")
            except Exception as exc:  # noqa: BLE001 — never break the loop
                log.exception("backup tick failed: %s", exc)
            try:
                await asyncio.wait_for(
                    self._tick_stop.wait(),
                    timeout=BACKUP_TICK_INTERVAL_SECONDS,
                )
            except asyncio.TimeoutError:
                # Normal path: timed out waiting for stop, run another tick.
                pass

    async def _process(self, job: SignificanceJob) -> None:
        """Score one job, persist the result, then run the snapshot check.

        Exceptions from scoring or from the significance/pin writes
        propagate to the caller (``_run`` logs them), in which case the
        snapshot check is skipped. Exceptions from the snapshot step are
        logged here and never propagate.
        """
        client = self._llm_client_factory()
        score = await compute_significance(
            client,
            model=self._settings.classifier_model,
            narrative_text=job.narrative_text,
            prior_dialogue=job.prior_dialogue,
        )
        with open_db(self._settings.db_path) as conn:
            append_and_apply(
                conn,
                kind="memory_significance_set",
                payload={
                    "memory_id": job.memory_id,
                    "significance": score,
                },
            )
            if score >= 3:
                # Pivotal memory: auto-pin it, evicting an older auto-pin
                # if the owner is over the soft cap.
                _auto_pin_with_cap(
                    conn,
                    owner_id=job.host_bot_id,
                    memory_id=job.memory_id,
                )

        # T31: piggy-back the periodic snapshot check on the background
        # worker so we don't need a separate timer task. The classifier
        # pass already runs out-of-band, so snapshot I/O on the same
        # worker is a natural fit. Each snapshot opens its own
        # connection so we don't conflate the snapshot's read-only view
        # with the significance-write transaction above. Failures are
        # caught and logged: a flaky disk shouldn't take down the
        # significance pipeline.
        try:
            with open_db(self._settings.db_path) as conn:
                if should_take_periodic_snapshot(
                    conn, self._settings.data_dir
                ):
                    snapshot_path = take_snapshot(
                        conn,
                        data_dir=self._settings.data_dir,
                        kind="periodic",
                    )
                    prune_periodic_snapshots(
                        self._settings.data_dir, keep=5
                    )
                    log.info(
                        "periodic snapshot taken: %s", snapshot_path
                    )
        except Exception as exc:  # noqa: BLE001 — never break the worker
            log.exception("periodic snapshot failed: %s", exc)


def _auto_pin_with_cap(
    conn,
    *,
    owner_id: str,
    memory_id: int,
    cap: int = 8,
) -> None:
    """Pin ``memory_id`` as an auto-pin, then enforce the per-owner soft cap.

    Per §8.5, pivotal turns are auto-pinned with a soft cap of 8 pins per
    bot. A ``memory_pin_changed`` event marking the memory pinned and
    auto-pinned is always written first. If ``owner_id`` then has more
    than ``cap`` pinned memories in total, the oldest *other* auto-pinned
    memory is unpinned with a second ``memory_pin_changed`` event. At most
    one memory is evicted per call, and manual pins are never evicted
    because the eviction query filters on ``auto_pinned = 1``.

    Args:
        conn: Open DB connection used for both the event writes and the
            ``memories`` queries.
        owner_id: Owner whose pins are counted and considered for eviction.
        memory_id: Memory to auto-pin; it is never chosen as the eviction
            victim.
        cap: Maximum number of pinned memories tolerated before an
            eviction is attempted.
    """
    append_and_apply(
        conn,
        kind="memory_pin_changed",
        payload={"memory_id": memory_id, "pinned": 1, "auto_pinned": 1},
    )

    # The count covers every pinned memory for the owner, manual or auto.
    pinned_total = conn.execute(
        "SELECT COUNT(*) FROM memories WHERE owner_id = ? AND pinned = 1",
        (owner_id,),
    ).fetchone()[0]

    if pinned_total > cap:
        # Oldest auto-pin other than the one we just created; ties on
        # created_at are broken by the lower id.
        oldest_auto = conn.execute(
            "SELECT id FROM memories "
            "WHERE owner_id = ? AND pinned = 1 AND auto_pinned = 1 AND id != ? "
            "ORDER BY created_at ASC, id ASC LIMIT 1",
            (owner_id, memory_id),
        ).fetchone()
        if oldest_auto is not None:
            append_and_apply(
                conn,
                kind="memory_pin_changed",
                payload={
                    "memory_id": oldest_auto[0],
                    "pinned": 0,
                    "auto_pinned": 0,
                },
            )