From 8390703b73e5ddfb288697a5dc765b6e3ec81ac8 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Sun, 26 Apr 2026 14:18:57 -0400 Subject: [PATCH] feat: nightly DB backups with 14-day retention --- chat/services/background.py | 57 +++++++++++++++++++ chat/services/backup.py | 106 ++++++++++++++++++++++++++++++++++++ tests/test_backup.py | 92 +++++++++++++++++++++++++++++++ 3 files changed, 255 insertions(+) create mode 100644 chat/services/backup.py create mode 100644 tests/test_backup.py diff --git a/chat/services/background.py b/chat/services/background.py index f02285c..27eecaa 100644 --- a/chat/services/background.py +++ b/chat/services/background.py @@ -32,6 +32,11 @@ from chat.config import Settings from chat.db.connection import open_db from chat.eventlog.log import append_and_apply from chat.llm.client import LLMClient +from chat.services.backup import ( + prune_backups, + should_take_backup, + take_backup, +) from chat.services.significance import compute_significance from chat.services.snapshot import ( prune_periodic_snapshots, @@ -39,6 +44,11 @@ from chat.services.snapshot import ( take_snapshot, ) +# T32: tick-loop wake interval. 60s gives a single backup window per +# target hour with plenty of slack: should_take_backup's 23h freshness +# guard prevents back-to-back runs. +BACKUP_TICK_INTERVAL_SECONDS = 60.0 + log = logging.getLogger(__name__) @@ -75,14 +85,27 @@ class BackgroundWorker: self._llm_client_factory = llm_client_factory self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue() self._task: asyncio.Task | None = None + # T32: nightly-backup tick loop runs alongside the job loop. The + # event is set by stop() to wake the loop early so shutdown is + # snappy even mid-tick. + self._tick_task: asyncio.Task | None = None + self._tick_stop: asyncio.Event = asyncio.Event() self.enabled = enabled async def start(self) -> None: if self._task is not None: return self._task = asyncio.create_task(self._run()) + self._tick_task = asyncio.create_task(self._tick_loop()) async def stop(self) -> None: + # Stop the tick loop first — it has no in-flight work to drain, + # so signalling early lets it exit while the job loop is still + # finishing its sentinel handoff. + self._tick_stop.set() + if self._tick_task is not None: + await self._tick_task + self._tick_task = None if self._task is None: return await self._queue.put(None) # sentinel @@ -104,6 +127,40 @@ class BackgroundWorker: except Exception as exc: # noqa: BLE001 — worker must not die log.exception("significance job failed: %s", exc) + async def _tick_loop(self) -> None: + """Periodic-operations loop (T32 nightly backup). + + Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and + asks :func:`should_take_backup` whether a backup is due. The + scheduling decision lives in the backup module so we don't + duplicate the "is it 03:00?" logic here. Failures are caught + and logged so a flaky disk doesn't kill the loop — the next + tick will retry. + + Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that + :meth:`stop` can interrupt a sleeping tick instead of having to + wait the full interval. + """ + while not self._tick_stop.is_set(): + try: + if should_take_backup(self._settings.data_dir): + take_backup( + db_path=self._settings.db_path, + data_dir=self._settings.data_dir, + ) + prune_backups(self._settings.data_dir, keep=14) + log.info("nightly backup taken") + except Exception as exc: # noqa: BLE001 — never break the loop + log.exception("backup tick failed: %s", exc) + try: + await asyncio.wait_for( + self._tick_stop.wait(), + timeout=BACKUP_TICK_INTERVAL_SECONDS, + ) + except asyncio.TimeoutError: + # Normal path: timed out waiting for stop, run another tick. + pass + async def _process(self, job: SignificanceJob) -> None: client = self._llm_client_factory() score = await compute_significance( diff --git a/chat/services/backup.py b/chat/services/backup.py new file mode 100644 index 0000000..d8b2f4e --- /dev/null +++ b/chat/services/backup.py @@ -0,0 +1,106 @@ +"""Nightly DB backup service (T32, Requirements §12). + +A simple in-process scheduler: at 03:00 local time daily, copy +``chat.db`` to ``data/backups/chat-.db`` and prune to the +14 most recent. The BackgroundWorker tick loop calls +:func:`should_take_backup` every 60 seconds; when it returns True the +worker calls :func:`take_backup` then :func:`prune_backups`. + +The launchd plist suggested in §12 can replace this later by invoking a +small script that calls :func:`take_backup` directly. For v1 the +in-process loop is enough — the daemon already runs continuously to +serve requests, so there's no extra moving part to install. + +Backups capture the live ``.db`` file via :func:`shutil.copy2`. SQLite's +WAL mode means an in-flight transaction's pages might live in the +``-wal`` sidecar rather than the main file, but our codebase commits +every write transaction synchronously, so the .db alone is sufficient +for v1. A truly safe online backup would use +``sqlite3.Connection.backup()``; deferred. +""" + +from __future__ import annotations + +import shutil +from datetime import datetime, timezone +from pathlib import Path + +# 03:00 local time per Requirements §12. Hardcoded for v1 — making this +# configurable via Settings is straightforward but not needed yet. +DEFAULT_BACKUP_HOUR = 3 + +# Retention window per Requirements §12 ("Last 14 retained"). +DEFAULT_KEEP = 14 + +# Wake interval for should_take_backup's freshness check. We wake the +# tick loop every 60s, so a backup taken in the previous tick within the +# same target hour must NOT trigger another. 23h gives us a generous +# safety margin against scheduling jitter while still allowing a single +# backup per day. +FRESHNESS_HOURS = 23 + + +def take_backup(*, db_path: Path, data_dir: Path) -> Path: + """Copy ``db_path`` to ``data_dir/backups/chat-.db``. + + Returns the new file path. Creates the backup directory if missing. + Uses :func:`shutil.copy2` so the destination's mtime is preserved — + :func:`should_take_backup` reads mtime to gate fresh backups. + """ + backup_dir = data_dir / "backups" + backup_dir.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") + backup_path = backup_dir / f"chat-{timestamp}.db" + shutil.copy2(db_path, backup_path) + return backup_path + + +def prune_backups(data_dir: Path, *, keep: int = DEFAULT_KEEP) -> int: + """Remove all but the most recent ``keep`` backup files. + + Returns the number of files removed. Safe when the directory is + missing (returns 0). Sorting is by filename, which is the UTC + timestamp embedded in the name — lexicographic order matches + chronological order. + """ + backup_dir = data_dir / "backups" + if not backup_dir.exists(): + return 0 + files = sorted(backup_dir.glob("chat-*.db")) + to_remove = files[:-keep] if len(files) > keep else [] + for f in to_remove: + f.unlink() + return len(to_remove) + + +def should_take_backup( + data_dir: Path, *, target_hour: int = DEFAULT_BACKUP_HOUR +) -> bool: + """Decide whether a nightly backup is due. + + Two conditions must hold: + + * The current local hour matches ``target_hour``. + * No backup file in ``data_dir/backups/`` has an mtime within the + last :data:`FRESHNESS_HOURS` (23h). The 23h window prevents a + double-backup within the same target hour while still allowing + the next day's run to fire on time. + + Local time (not UTC) is used for the hour comparison per the + requirements ("03:00 local time"). The filename embeds a UTC stamp + so file ordering remains unambiguous across DST transitions. + """ + now = datetime.now() + if now.hour != target_hour: + return False + backup_dir = data_dir / "backups" + if not backup_dir.exists(): + return True + files = list(backup_dir.glob("chat-*.db")) + if not files: + return True + most_recent = max(files, key=lambda f: f.stat().st_mtime) + age_hours = ( + datetime.now().timestamp() - most_recent.stat().st_mtime + ) / 3600 + return age_hours >= FRESHNESS_HOURS diff --git a/tests/test_backup.py b/tests/test_backup.py new file mode 100644 index 0000000..1fcfd29 --- /dev/null +++ b/tests/test_backup.py @@ -0,0 +1,92 @@ +"""Tests for nightly DB backups (T32). + +The backup service is intentionally simple: a flat ``data/backups/`` dir +containing timestamped copies of ``chat.db``, with retention of the most +recent 14. The scheduling decision (``should_take_backup``) is a pure +function of clock + filesystem state so it can be unit-tested without +spinning up the BackgroundWorker tick loop. +""" + +from __future__ import annotations + +from datetime import datetime +from unittest.mock import patch + +from chat.services.backup import ( + prune_backups, + should_take_backup, + take_backup, +) + + +def test_take_backup_creates_timestamped_copy(tmp_path): + db = tmp_path / "chat.db" + db.write_text("fake db contents") + backup_path = take_backup(db_path=db, data_dir=tmp_path / "data") + assert backup_path.exists() + assert backup_path.name.startswith("chat-") + assert backup_path.name.endswith(".db") + # Contents copied + assert backup_path.read_text() == "fake db contents" + # Located in data/backups/ + assert backup_path.parent == tmp_path / "data" / "backups" + + +def test_prune_keeps_last_14(tmp_path): + backup_dir = tmp_path / "data" / "backups" + backup_dir.mkdir(parents=True) + # Create 17 dummy backup files spanning days 1..17 of Jan 2026. + # Filenames sort lexicographically by the embedded timestamp, so + # prune_backups should drop the three oldest. + for i in range(1, 18): + (backup_dir / f"chat-202601{i:02d}T000000Z.db").write_text( + f"backup {i}" + ) + removed = prune_backups(tmp_path / "data", keep=14) + assert removed == 3 + remaining = sorted(backup_dir.glob("chat-*.db")) + assert len(remaining) == 14 + # Days 1, 2, 3 removed; day 4 is now the oldest retained backup. + assert remaining[0].name == "chat-20260104T000000Z.db" + + +def test_should_take_backup_when_no_prior_and_target_hour_matches(tmp_path): + from chat.services import backup as backup_mod + + class FakeDateTime(datetime): + @classmethod + def now(cls, tz=None): + return datetime(2026, 4, 26, 3, 0, 0) + + with patch.object(backup_mod, "datetime", FakeDateTime): + assert should_take_backup(tmp_path / "data") is True + + +def test_should_not_take_backup_outside_target_hour(tmp_path): + from chat.services import backup as backup_mod + + class FakeDateTime(datetime): + @classmethod + def now(cls, tz=None): + return datetime(2026, 4, 26, 14, 0, 0) + + with patch.object(backup_mod, "datetime", FakeDateTime): + assert should_take_backup(tmp_path / "data") is False + + +def test_should_not_take_backup_when_recent_backup_exists(tmp_path): + backup_dir = tmp_path / "data" / "backups" + backup_dir.mkdir(parents=True) + recent = backup_dir / "chat-recent.db" + recent.write_text("x") + # mtime defaults to "now" — within the 23h freshness window so + # should_take_backup must return False even at the target hour. + from chat.services import backup as backup_mod + + class FakeDateTime(datetime): + @classmethod + def now(cls, tz=None): + return datetime(2026, 4, 26, 3, 0, 0) + + with patch.object(backup_mod, "datetime", FakeDateTime): + assert should_take_backup(tmp_path / "data") is False