feat: nightly DB backups with 14-day retention

This commit is contained in:
Joseph Doherty
2026-04-26 14:18:57 -04:00
parent b9644fad31
commit 8390703b73
3 changed files with 255 additions and 0 deletions
+57
View File
@@ -32,6 +32,11 @@ from chat.config import Settings
from chat.db.connection import open_db
from chat.eventlog.log import append_and_apply
from chat.llm.client import LLMClient
from chat.services.backup import (
prune_backups,
should_take_backup,
take_backup,
)
from chat.services.significance import compute_significance
from chat.services.snapshot import (
prune_periodic_snapshots,
@@ -39,6 +44,11 @@ from chat.services.snapshot import (
take_snapshot,
)
# T32: tick-loop wake interval. 60s gives a single backup window per
# target hour with plenty of slack: should_take_backup's 23h freshness
# guard prevents back-to-back runs.
BACKUP_TICK_INTERVAL_SECONDS = 60.0
log = logging.getLogger(__name__)
@@ -75,14 +85,27 @@ class BackgroundWorker:
self._llm_client_factory = llm_client_factory
self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue()
self._task: asyncio.Task | None = None
# T32: nightly-backup tick loop runs alongside the job loop. The
# event is set by stop() to wake the loop early so shutdown is
# snappy even mid-tick.
self._tick_task: asyncio.Task | None = None
self._tick_stop: asyncio.Event = asyncio.Event()
self.enabled = enabled
async def start(self) -> None:
if self._task is not None:
return
self._task = asyncio.create_task(self._run())
self._tick_task = asyncio.create_task(self._tick_loop())
async def stop(self) -> None:
# Stop the tick loop first — it has no in-flight work to drain,
# so signalling early lets it exit while the job loop is still
# finishing its sentinel handoff.
self._tick_stop.set()
if self._tick_task is not None:
await self._tick_task
self._tick_task = None
if self._task is None:
return
await self._queue.put(None) # sentinel
@@ -104,6 +127,40 @@ class BackgroundWorker:
except Exception as exc: # noqa: BLE001 — worker must not die
log.exception("significance job failed: %s", exc)
async def _tick_loop(self) -> None:
"""Periodic-operations loop (T32 nightly backup).
Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and
asks :func:`should_take_backup` whether a backup is due. The
scheduling decision lives in the backup module so we don't
duplicate the "is it 03:00?" logic here. Failures are caught
and logged so a flaky disk doesn't kill the loop — the next
tick will retry.
Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that
:meth:`stop` can interrupt a sleeping tick instead of having to
wait the full interval.
"""
while not self._tick_stop.is_set():
try:
if should_take_backup(self._settings.data_dir):
take_backup(
db_path=self._settings.db_path,
data_dir=self._settings.data_dir,
)
prune_backups(self._settings.data_dir, keep=14)
log.info("nightly backup taken")
except Exception as exc: # noqa: BLE001 — never break the loop
log.exception("backup tick failed: %s", exc)
try:
await asyncio.wait_for(
self._tick_stop.wait(),
timeout=BACKUP_TICK_INTERVAL_SECONDS,
)
except asyncio.TimeoutError:
# Normal path: timed out waiting for stop, run another tick.
pass
async def _process(self, job: SignificanceJob) -> None:
client = self._llm_client_factory()
score = await compute_significance(
+106
View File
@@ -0,0 +1,106 @@
"""Nightly DB backup service (T32, Requirements §12).
A simple in-process scheduler: at 03:00 local time daily, copy
``chat.db`` to ``data/backups/chat-<utc-timestamp>.db`` and prune to the
14 most recent. The BackgroundWorker tick loop calls
:func:`should_take_backup` every 60 seconds; when it returns True the
worker calls :func:`take_backup` then :func:`prune_backups`.
The launchd plist suggested in §12 can replace this later by invoking a
small script that calls :func:`take_backup` directly. For v1 the
in-process loop is enough — the daemon already runs continuously to
serve requests, so there's no extra moving part to install.
Backups capture the live ``.db`` file via :func:`shutil.copy2`. SQLite's
WAL mode means an in-flight transaction's pages might live in the
``-wal`` sidecar rather than the main file, but our codebase commits
every write transaction synchronously, so the .db alone is sufficient
for v1. A truly safe online backup would use
``sqlite3.Connection.backup()``; deferred.
"""
from __future__ import annotations
import shutil
from datetime import datetime, timezone
from pathlib import Path
# 03:00 local time per Requirements §12. Hardcoded for v1 — making this
# configurable via Settings is straightforward but not needed yet.
DEFAULT_BACKUP_HOUR = 3
# Retention window per Requirements §12 ("Last 14 retained").
DEFAULT_KEEP = 14
# Wake interval for should_take_backup's freshness check. We wake the
# tick loop every 60s, so a backup taken in the previous tick within the
# same target hour must NOT trigger another. 23h gives us a generous
# safety margin against scheduling jitter while still allowing a single
# backup per day.
FRESHNESS_HOURS = 23
def take_backup(*, db_path: Path, data_dir: Path) -> Path:
"""Copy ``db_path`` to ``data_dir/backups/chat-<utc-timestamp>.db``.
Returns the new file path. Creates the backup directory if missing.
Uses :func:`shutil.copy2` so the destination's mtime is preserved —
:func:`should_take_backup` reads mtime to gate fresh backups.
"""
backup_dir = data_dir / "backups"
backup_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
backup_path = backup_dir / f"chat-{timestamp}.db"
shutil.copy2(db_path, backup_path)
return backup_path
def prune_backups(data_dir: Path, *, keep: int = DEFAULT_KEEP) -> int:
"""Remove all but the most recent ``keep`` backup files.
Returns the number of files removed. Safe when the directory is
missing (returns 0). Sorting is by filename, which is the UTC
timestamp embedded in the name — lexicographic order matches
chronological order.
"""
backup_dir = data_dir / "backups"
if not backup_dir.exists():
return 0
files = sorted(backup_dir.glob("chat-*.db"))
to_remove = files[:-keep] if len(files) > keep else []
for f in to_remove:
f.unlink()
return len(to_remove)
def should_take_backup(
data_dir: Path, *, target_hour: int = DEFAULT_BACKUP_HOUR
) -> bool:
"""Decide whether a nightly backup is due.
Two conditions must hold:
* The current local hour matches ``target_hour``.
* No backup file in ``data_dir/backups/`` has an mtime within the
last :data:`FRESHNESS_HOURS` (23h). The 23h window prevents a
double-backup within the same target hour while still allowing
the next day's run to fire on time.
Local time (not UTC) is used for the hour comparison per the
requirements ("03:00 local time"). The filename embeds a UTC stamp
so file ordering remains unambiguous across DST transitions.
"""
now = datetime.now()
if now.hour != target_hour:
return False
backup_dir = data_dir / "backups"
if not backup_dir.exists():
return True
files = list(backup_dir.glob("chat-*.db"))
if not files:
return True
most_recent = max(files, key=lambda f: f.stat().st_mtime)
age_hours = (
datetime.now().timestamp() - most_recent.stat().st_mtime
) / 3600
return age_hours >= FRESHNESS_HOURS
+92
View File
@@ -0,0 +1,92 @@
"""Tests for nightly DB backups (T32).
The backup service is intentionally simple: a flat ``data/backups/`` dir
containing timestamped copies of ``chat.db``, with retention of the most
recent 14. The scheduling decision (``should_take_backup``) is a pure
function of clock + filesystem state so it can be unit-tested without
spinning up the BackgroundWorker tick loop.
"""
from __future__ import annotations
from datetime import datetime
from unittest.mock import patch
from chat.services.backup import (
prune_backups,
should_take_backup,
take_backup,
)
def test_take_backup_creates_timestamped_copy(tmp_path):
db = tmp_path / "chat.db"
db.write_text("fake db contents")
backup_path = take_backup(db_path=db, data_dir=tmp_path / "data")
assert backup_path.exists()
assert backup_path.name.startswith("chat-")
assert backup_path.name.endswith(".db")
# Contents copied
assert backup_path.read_text() == "fake db contents"
# Located in data/backups/
assert backup_path.parent == tmp_path / "data" / "backups"
def test_prune_keeps_last_14(tmp_path):
backup_dir = tmp_path / "data" / "backups"
backup_dir.mkdir(parents=True)
# Create 17 dummy backup files spanning days 1..17 of Jan 2026.
# Filenames sort lexicographically by the embedded timestamp, so
# prune_backups should drop the three oldest.
for i in range(1, 18):
(backup_dir / f"chat-202601{i:02d}T000000Z.db").write_text(
f"backup {i}"
)
removed = prune_backups(tmp_path / "data", keep=14)
assert removed == 3
remaining = sorted(backup_dir.glob("chat-*.db"))
assert len(remaining) == 14
# Days 1, 2, 3 removed; day 4 is now the oldest retained backup.
assert remaining[0].name == "chat-20260104T000000Z.db"
def test_should_take_backup_when_no_prior_and_target_hour_matches(tmp_path):
from chat.services import backup as backup_mod
class FakeDateTime(datetime):
@classmethod
def now(cls, tz=None):
return datetime(2026, 4, 26, 3, 0, 0)
with patch.object(backup_mod, "datetime", FakeDateTime):
assert should_take_backup(tmp_path / "data") is True
def test_should_not_take_backup_outside_target_hour(tmp_path):
from chat.services import backup as backup_mod
class FakeDateTime(datetime):
@classmethod
def now(cls, tz=None):
return datetime(2026, 4, 26, 14, 0, 0)
with patch.object(backup_mod, "datetime", FakeDateTime):
assert should_take_backup(tmp_path / "data") is False
def test_should_not_take_backup_when_recent_backup_exists(tmp_path):
backup_dir = tmp_path / "data" / "backups"
backup_dir.mkdir(parents=True)
recent = backup_dir / "chat-recent.db"
recent.write_text("x")
# mtime defaults to "now" — within the 23h freshness window so
# should_take_backup must return False even at the target hour.
from chat.services import backup as backup_mod
class FakeDateTime(datetime):
@classmethod
def now(cls, tz=None):
return datetime(2026, 4, 26, 3, 0, 0)
with patch.object(backup_mod, "datetime", FakeDateTime):
assert should_take_backup(tmp_path / "data") is False