feat: nightly DB backups with 14-day retention
This commit is contained in:
@@ -32,6 +32,11 @@ from chat.config import Settings
|
||||
from chat.db.connection import open_db
|
||||
from chat.eventlog.log import append_and_apply
|
||||
from chat.llm.client import LLMClient
|
||||
from chat.services.backup import (
|
||||
prune_backups,
|
||||
should_take_backup,
|
||||
take_backup,
|
||||
)
|
||||
from chat.services.significance import compute_significance
|
||||
from chat.services.snapshot import (
|
||||
prune_periodic_snapshots,
|
||||
@@ -39,6 +44,11 @@ from chat.services.snapshot import (
|
||||
take_snapshot,
|
||||
)
|
||||
|
||||
# T32: tick-loop wake interval. 60s gives a single backup window per
|
||||
# target hour with plenty of slack: should_take_backup's 23h freshness
|
||||
# guard prevents back-to-back runs.
|
||||
BACKUP_TICK_INTERVAL_SECONDS = 60.0
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -75,14 +85,27 @@ class BackgroundWorker:
|
||||
self._llm_client_factory = llm_client_factory
|
||||
self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue()
|
||||
self._task: asyncio.Task | None = None
|
||||
# T32: nightly-backup tick loop runs alongside the job loop. The
|
||||
# event is set by stop() to wake the loop early so shutdown is
|
||||
# snappy even mid-tick.
|
||||
self._tick_task: asyncio.Task | None = None
|
||||
self._tick_stop: asyncio.Event = asyncio.Event()
|
||||
self.enabled = enabled
|
||||
|
||||
async def start(self) -> None:
|
||||
if self._task is not None:
|
||||
return
|
||||
self._task = asyncio.create_task(self._run())
|
||||
self._tick_task = asyncio.create_task(self._tick_loop())
|
||||
|
||||
async def stop(self) -> None:
|
||||
# Stop the tick loop first — it has no in-flight work to drain,
|
||||
# so signalling early lets it exit while the job loop is still
|
||||
# finishing its sentinel handoff.
|
||||
self._tick_stop.set()
|
||||
if self._tick_task is not None:
|
||||
await self._tick_task
|
||||
self._tick_task = None
|
||||
if self._task is None:
|
||||
return
|
||||
await self._queue.put(None) # sentinel
|
||||
@@ -104,6 +127,40 @@ class BackgroundWorker:
|
||||
except Exception as exc: # noqa: BLE001 — worker must not die
|
||||
log.exception("significance job failed: %s", exc)
|
||||
|
||||
async def _tick_loop(self) -> None:
|
||||
"""Periodic-operations loop (T32 nightly backup).
|
||||
|
||||
Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and
|
||||
asks :func:`should_take_backup` whether a backup is due. The
|
||||
scheduling decision lives in the backup module so we don't
|
||||
duplicate the "is it 03:00?" logic here. Failures are caught
|
||||
and logged so a flaky disk doesn't kill the loop — the next
|
||||
tick will retry.
|
||||
|
||||
Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that
|
||||
:meth:`stop` can interrupt a sleeping tick instead of having to
|
||||
wait the full interval.
|
||||
"""
|
||||
while not self._tick_stop.is_set():
|
||||
try:
|
||||
if should_take_backup(self._settings.data_dir):
|
||||
take_backup(
|
||||
db_path=self._settings.db_path,
|
||||
data_dir=self._settings.data_dir,
|
||||
)
|
||||
prune_backups(self._settings.data_dir, keep=14)
|
||||
log.info("nightly backup taken")
|
||||
except Exception as exc: # noqa: BLE001 — never break the loop
|
||||
log.exception("backup tick failed: %s", exc)
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
self._tick_stop.wait(),
|
||||
timeout=BACKUP_TICK_INTERVAL_SECONDS,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
# Normal path: timed out waiting for stop, run another tick.
|
||||
pass
|
||||
|
||||
async def _process(self, job: SignificanceJob) -> None:
|
||||
client = self._llm_client_factory()
|
||||
score = await compute_significance(
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
"""Nightly DB backup service (T32, Requirements §12).
|
||||
|
||||
A simple in-process scheduler: at 03:00 local time daily, copy
|
||||
``chat.db`` to ``data/backups/chat-<utc-timestamp>.db`` and prune to the
|
||||
14 most recent. The BackgroundWorker tick loop calls
|
||||
:func:`should_take_backup` every 60 seconds; when it returns True the
|
||||
worker calls :func:`take_backup` then :func:`prune_backups`.
|
||||
|
||||
The launchd plist suggested in §12 can replace this later by invoking a
|
||||
small script that calls :func:`take_backup` directly. For v1 the
|
||||
in-process loop is enough — the daemon already runs continuously to
|
||||
serve requests, so there's no extra moving part to install.
|
||||
|
||||
Backups capture the live ``.db`` file via :func:`shutil.copy2`. SQLite's
|
||||
WAL mode means an in-flight transaction's pages might live in the
|
||||
``-wal`` sidecar rather than the main file, but our codebase commits
|
||||
every write transaction synchronously, so the .db alone is sufficient
|
||||
for v1. A truly safe online backup would use
|
||||
``sqlite3.Connection.backup()``; deferred.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# 03:00 local time per Requirements §12. Hardcoded for v1 — making this
|
||||
# configurable via Settings is straightforward but not needed yet.
|
||||
DEFAULT_BACKUP_HOUR = 3
|
||||
|
||||
# Retention window per Requirements §12 ("Last 14 retained").
|
||||
DEFAULT_KEEP = 14
|
||||
|
||||
# Wake interval for should_take_backup's freshness check. We wake the
|
||||
# tick loop every 60s, so a backup taken in the previous tick within the
|
||||
# same target hour must NOT trigger another. 23h gives us a generous
|
||||
# safety margin against scheduling jitter while still allowing a single
|
||||
# backup per day.
|
||||
FRESHNESS_HOURS = 23
|
||||
|
||||
|
||||
def take_backup(*, db_path: Path, data_dir: Path) -> Path:
|
||||
"""Copy ``db_path`` to ``data_dir/backups/chat-<utc-timestamp>.db``.
|
||||
|
||||
Returns the new file path. Creates the backup directory if missing.
|
||||
Uses :func:`shutil.copy2` so the destination's mtime is preserved —
|
||||
:func:`should_take_backup` reads mtime to gate fresh backups.
|
||||
"""
|
||||
backup_dir = data_dir / "backups"
|
||||
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||
backup_path = backup_dir / f"chat-{timestamp}.db"
|
||||
shutil.copy2(db_path, backup_path)
|
||||
return backup_path
|
||||
|
||||
|
||||
def prune_backups(data_dir: Path, *, keep: int = DEFAULT_KEEP) -> int:
|
||||
"""Remove all but the most recent ``keep`` backup files.
|
||||
|
||||
Returns the number of files removed. Safe when the directory is
|
||||
missing (returns 0). Sorting is by filename, which is the UTC
|
||||
timestamp embedded in the name — lexicographic order matches
|
||||
chronological order.
|
||||
"""
|
||||
backup_dir = data_dir / "backups"
|
||||
if not backup_dir.exists():
|
||||
return 0
|
||||
files = sorted(backup_dir.glob("chat-*.db"))
|
||||
to_remove = files[:-keep] if len(files) > keep else []
|
||||
for f in to_remove:
|
||||
f.unlink()
|
||||
return len(to_remove)
|
||||
|
||||
|
||||
def should_take_backup(
|
||||
data_dir: Path, *, target_hour: int = DEFAULT_BACKUP_HOUR
|
||||
) -> bool:
|
||||
"""Decide whether a nightly backup is due.
|
||||
|
||||
Two conditions must hold:
|
||||
|
||||
* The current local hour matches ``target_hour``.
|
||||
* No backup file in ``data_dir/backups/`` has an mtime within the
|
||||
last :data:`FRESHNESS_HOURS` (23h). The 23h window prevents a
|
||||
double-backup within the same target hour while still allowing
|
||||
the next day's run to fire on time.
|
||||
|
||||
Local time (not UTC) is used for the hour comparison per the
|
||||
requirements ("03:00 local time"). The filename embeds a UTC stamp
|
||||
so file ordering remains unambiguous across DST transitions.
|
||||
"""
|
||||
now = datetime.now()
|
||||
if now.hour != target_hour:
|
||||
return False
|
||||
backup_dir = data_dir / "backups"
|
||||
if not backup_dir.exists():
|
||||
return True
|
||||
files = list(backup_dir.glob("chat-*.db"))
|
||||
if not files:
|
||||
return True
|
||||
most_recent = max(files, key=lambda f: f.stat().st_mtime)
|
||||
age_hours = (
|
||||
datetime.now().timestamp() - most_recent.stat().st_mtime
|
||||
) / 3600
|
||||
return age_hours >= FRESHNESS_HOURS
|
||||
@@ -0,0 +1,92 @@
|
||||
"""Tests for nightly DB backups (T32).
|
||||
|
||||
The backup service is intentionally simple: a flat ``data/backups/`` dir
|
||||
containing timestamped copies of ``chat.db``, with retention of the most
|
||||
recent 14. The scheduling decision (``should_take_backup``) is a pure
|
||||
function of clock + filesystem state so it can be unit-tested without
|
||||
spinning up the BackgroundWorker tick loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch
|
||||
|
||||
from chat.services.backup import (
|
||||
prune_backups,
|
||||
should_take_backup,
|
||||
take_backup,
|
||||
)
|
||||
|
||||
|
||||
def test_take_backup_creates_timestamped_copy(tmp_path):
|
||||
db = tmp_path / "chat.db"
|
||||
db.write_text("fake db contents")
|
||||
backup_path = take_backup(db_path=db, data_dir=tmp_path / "data")
|
||||
assert backup_path.exists()
|
||||
assert backup_path.name.startswith("chat-")
|
||||
assert backup_path.name.endswith(".db")
|
||||
# Contents copied
|
||||
assert backup_path.read_text() == "fake db contents"
|
||||
# Located in data/backups/
|
||||
assert backup_path.parent == tmp_path / "data" / "backups"
|
||||
|
||||
|
||||
def test_prune_keeps_last_14(tmp_path):
|
||||
backup_dir = tmp_path / "data" / "backups"
|
||||
backup_dir.mkdir(parents=True)
|
||||
# Create 17 dummy backup files spanning days 1..17 of Jan 2026.
|
||||
# Filenames sort lexicographically by the embedded timestamp, so
|
||||
# prune_backups should drop the three oldest.
|
||||
for i in range(1, 18):
|
||||
(backup_dir / f"chat-202601{i:02d}T000000Z.db").write_text(
|
||||
f"backup {i}"
|
||||
)
|
||||
removed = prune_backups(tmp_path / "data", keep=14)
|
||||
assert removed == 3
|
||||
remaining = sorted(backup_dir.glob("chat-*.db"))
|
||||
assert len(remaining) == 14
|
||||
# Days 1, 2, 3 removed; day 4 is now the oldest retained backup.
|
||||
assert remaining[0].name == "chat-20260104T000000Z.db"
|
||||
|
||||
|
||||
def test_should_take_backup_when_no_prior_and_target_hour_matches(tmp_path):
|
||||
from chat.services import backup as backup_mod
|
||||
|
||||
class FakeDateTime(datetime):
|
||||
@classmethod
|
||||
def now(cls, tz=None):
|
||||
return datetime(2026, 4, 26, 3, 0, 0)
|
||||
|
||||
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||
assert should_take_backup(tmp_path / "data") is True
|
||||
|
||||
|
||||
def test_should_not_take_backup_outside_target_hour(tmp_path):
|
||||
from chat.services import backup as backup_mod
|
||||
|
||||
class FakeDateTime(datetime):
|
||||
@classmethod
|
||||
def now(cls, tz=None):
|
||||
return datetime(2026, 4, 26, 14, 0, 0)
|
||||
|
||||
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||
assert should_take_backup(tmp_path / "data") is False
|
||||
|
||||
|
||||
def test_should_not_take_backup_when_recent_backup_exists(tmp_path):
|
||||
backup_dir = tmp_path / "data" / "backups"
|
||||
backup_dir.mkdir(parents=True)
|
||||
recent = backup_dir / "chat-recent.db"
|
||||
recent.write_text("x")
|
||||
# mtime defaults to "now" — within the 23h freshness window so
|
||||
# should_take_backup must return False even at the target hour.
|
||||
from chat.services import backup as backup_mod
|
||||
|
||||
class FakeDateTime(datetime):
|
||||
@classmethod
|
||||
def now(cls, tz=None):
|
||||
return datetime(2026, 4, 26, 3, 0, 0)
|
||||
|
||||
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||
assert should_take_backup(tmp_path / "data") is False
|
||||
Reference in New Issue
Block a user