feat: nightly DB backups with 14-day retention
This commit is contained in:
@@ -32,6 +32,11 @@ from chat.config import Settings
|
|||||||
from chat.db.connection import open_db
|
from chat.db.connection import open_db
|
||||||
from chat.eventlog.log import append_and_apply
|
from chat.eventlog.log import append_and_apply
|
||||||
from chat.llm.client import LLMClient
|
from chat.llm.client import LLMClient
|
||||||
|
from chat.services.backup import (
|
||||||
|
prune_backups,
|
||||||
|
should_take_backup,
|
||||||
|
take_backup,
|
||||||
|
)
|
||||||
from chat.services.significance import compute_significance
|
from chat.services.significance import compute_significance
|
||||||
from chat.services.snapshot import (
|
from chat.services.snapshot import (
|
||||||
prune_periodic_snapshots,
|
prune_periodic_snapshots,
|
||||||
@@ -39,6 +44,11 @@ from chat.services.snapshot import (
|
|||||||
take_snapshot,
|
take_snapshot,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# T32: tick-loop wake interval. 60s gives a single backup window per
|
||||||
|
# target hour with plenty of slack: should_take_backup's 23h freshness
|
||||||
|
# guard prevents back-to-back runs.
|
||||||
|
BACKUP_TICK_INTERVAL_SECONDS = 60.0
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@@ -75,14 +85,27 @@ class BackgroundWorker:
|
|||||||
self._llm_client_factory = llm_client_factory
|
self._llm_client_factory = llm_client_factory
|
||||||
self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue()
|
self._queue: asyncio.Queue[SignificanceJob | None] = asyncio.Queue()
|
||||||
self._task: asyncio.Task | None = None
|
self._task: asyncio.Task | None = None
|
||||||
|
# T32: nightly-backup tick loop runs alongside the job loop. The
|
||||||
|
# event is set by stop() to wake the loop early so shutdown is
|
||||||
|
# snappy even mid-tick.
|
||||||
|
self._tick_task: asyncio.Task | None = None
|
||||||
|
self._tick_stop: asyncio.Event = asyncio.Event()
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
if self._task is not None:
|
if self._task is not None:
|
||||||
return
|
return
|
||||||
self._task = asyncio.create_task(self._run())
|
self._task = asyncio.create_task(self._run())
|
||||||
|
self._tick_task = asyncio.create_task(self._tick_loop())
|
||||||
|
|
||||||
async def stop(self) -> None:
|
async def stop(self) -> None:
|
||||||
|
# Stop the tick loop first — it has no in-flight work to drain,
|
||||||
|
# so signalling early lets it exit while the job loop is still
|
||||||
|
# finishing its sentinel handoff.
|
||||||
|
self._tick_stop.set()
|
||||||
|
if self._tick_task is not None:
|
||||||
|
await self._tick_task
|
||||||
|
self._tick_task = None
|
||||||
if self._task is None:
|
if self._task is None:
|
||||||
return
|
return
|
||||||
await self._queue.put(None) # sentinel
|
await self._queue.put(None) # sentinel
|
||||||
@@ -104,6 +127,40 @@ class BackgroundWorker:
|
|||||||
except Exception as exc: # noqa: BLE001 — worker must not die
|
except Exception as exc: # noqa: BLE001 — worker must not die
|
||||||
log.exception("significance job failed: %s", exc)
|
log.exception("significance job failed: %s", exc)
|
||||||
|
|
||||||
|
async def _tick_loop(self) -> None:
|
||||||
|
"""Periodic-operations loop (T32 nightly backup).
|
||||||
|
|
||||||
|
Wakes every :data:`BACKUP_TICK_INTERVAL_SECONDS` seconds and
|
||||||
|
asks :func:`should_take_backup` whether a backup is due. The
|
||||||
|
scheduling decision lives in the backup module so we don't
|
||||||
|
duplicate the "is it 03:00?" logic here. Failures are caught
|
||||||
|
and logged so a flaky disk doesn't kill the loop — the next
|
||||||
|
tick will retry.
|
||||||
|
|
||||||
|
Wait uses :func:`asyncio.wait_for` on ``_tick_stop`` so that
|
||||||
|
:meth:`stop` can interrupt a sleeping tick instead of having to
|
||||||
|
wait the full interval.
|
||||||
|
"""
|
||||||
|
while not self._tick_stop.is_set():
|
||||||
|
try:
|
||||||
|
if should_take_backup(self._settings.data_dir):
|
||||||
|
take_backup(
|
||||||
|
db_path=self._settings.db_path,
|
||||||
|
data_dir=self._settings.data_dir,
|
||||||
|
)
|
||||||
|
prune_backups(self._settings.data_dir, keep=14)
|
||||||
|
log.info("nightly backup taken")
|
||||||
|
except Exception as exc: # noqa: BLE001 — never break the loop
|
||||||
|
log.exception("backup tick failed: %s", exc)
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
self._tick_stop.wait(),
|
||||||
|
timeout=BACKUP_TICK_INTERVAL_SECONDS,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
# Normal path: timed out waiting for stop, run another tick.
|
||||||
|
pass
|
||||||
|
|
||||||
async def _process(self, job: SignificanceJob) -> None:
|
async def _process(self, job: SignificanceJob) -> None:
|
||||||
client = self._llm_client_factory()
|
client = self._llm_client_factory()
|
||||||
score = await compute_significance(
|
score = await compute_significance(
|
||||||
|
|||||||
@@ -0,0 +1,106 @@
|
|||||||
|
"""Nightly DB backup service (T32, Requirements §12).
|
||||||
|
|
||||||
|
A simple in-process scheduler: at 03:00 local time daily, copy
|
||||||
|
``chat.db`` to ``data/backups/chat-<utc-timestamp>.db`` and prune to the
|
||||||
|
14 most recent. The BackgroundWorker tick loop calls
|
||||||
|
:func:`should_take_backup` every 60 seconds; when it returns True the
|
||||||
|
worker calls :func:`take_backup` then :func:`prune_backups`.
|
||||||
|
|
||||||
|
The launchd plist suggested in §12 can replace this later by invoking a
|
||||||
|
small script that calls :func:`take_backup` directly. For v1 the
|
||||||
|
in-process loop is enough — the daemon already runs continuously to
|
||||||
|
serve requests, so there's no extra moving part to install.
|
||||||
|
|
||||||
|
Backups capture the live ``.db`` file via :func:`shutil.copy2`. SQLite's
|
||||||
|
WAL mode means an in-flight transaction's pages might live in the
|
||||||
|
``-wal`` sidecar rather than the main file, but our codebase commits
|
||||||
|
every write transaction synchronously, so the .db alone is sufficient
|
||||||
|
for v1. A truly safe online backup would use
|
||||||
|
``sqlite3.Connection.backup()``; deferred.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# 03:00 local time per Requirements §12. Hardcoded for v1 — making this
|
||||||
|
# configurable via Settings is straightforward but not needed yet.
|
||||||
|
DEFAULT_BACKUP_HOUR = 3
|
||||||
|
|
||||||
|
# Retention window per Requirements §12 ("Last 14 retained").
|
||||||
|
DEFAULT_KEEP = 14
|
||||||
|
|
||||||
|
# Wake interval for should_take_backup's freshness check. We wake the
|
||||||
|
# tick loop every 60s, so a backup taken in the previous tick within the
|
||||||
|
# same target hour must NOT trigger another. 23h gives us a generous
|
||||||
|
# safety margin against scheduling jitter while still allowing a single
|
||||||
|
# backup per day.
|
||||||
|
FRESHNESS_HOURS = 23
|
||||||
|
|
||||||
|
|
||||||
|
def take_backup(*, db_path: Path, data_dir: Path) -> Path:
|
||||||
|
"""Copy ``db_path`` to ``data_dir/backups/chat-<utc-timestamp>.db``.
|
||||||
|
|
||||||
|
Returns the new file path. Creates the backup directory if missing.
|
||||||
|
Uses :func:`shutil.copy2` so the destination's mtime is preserved —
|
||||||
|
:func:`should_take_backup` reads mtime to gate fresh backups.
|
||||||
|
"""
|
||||||
|
backup_dir = data_dir / "backups"
|
||||||
|
backup_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
||||||
|
backup_path = backup_dir / f"chat-{timestamp}.db"
|
||||||
|
shutil.copy2(db_path, backup_path)
|
||||||
|
return backup_path
|
||||||
|
|
||||||
|
|
||||||
|
def prune_backups(data_dir: Path, *, keep: int = DEFAULT_KEEP) -> int:
|
||||||
|
"""Remove all but the most recent ``keep`` backup files.
|
||||||
|
|
||||||
|
Returns the number of files removed. Safe when the directory is
|
||||||
|
missing (returns 0). Sorting is by filename, which is the UTC
|
||||||
|
timestamp embedded in the name — lexicographic order matches
|
||||||
|
chronological order.
|
||||||
|
"""
|
||||||
|
backup_dir = data_dir / "backups"
|
||||||
|
if not backup_dir.exists():
|
||||||
|
return 0
|
||||||
|
files = sorted(backup_dir.glob("chat-*.db"))
|
||||||
|
to_remove = files[:-keep] if len(files) > keep else []
|
||||||
|
for f in to_remove:
|
||||||
|
f.unlink()
|
||||||
|
return len(to_remove)
|
||||||
|
|
||||||
|
|
||||||
|
def should_take_backup(
|
||||||
|
data_dir: Path, *, target_hour: int = DEFAULT_BACKUP_HOUR
|
||||||
|
) -> bool:
|
||||||
|
"""Decide whether a nightly backup is due.
|
||||||
|
|
||||||
|
Two conditions must hold:
|
||||||
|
|
||||||
|
* The current local hour matches ``target_hour``.
|
||||||
|
* No backup file in ``data_dir/backups/`` has an mtime within the
|
||||||
|
last :data:`FRESHNESS_HOURS` (23h). The 23h window prevents a
|
||||||
|
double-backup within the same target hour while still allowing
|
||||||
|
the next day's run to fire on time.
|
||||||
|
|
||||||
|
Local time (not UTC) is used for the hour comparison per the
|
||||||
|
requirements ("03:00 local time"). The filename embeds a UTC stamp
|
||||||
|
so file ordering remains unambiguous across DST transitions.
|
||||||
|
"""
|
||||||
|
now = datetime.now()
|
||||||
|
if now.hour != target_hour:
|
||||||
|
return False
|
||||||
|
backup_dir = data_dir / "backups"
|
||||||
|
if not backup_dir.exists():
|
||||||
|
return True
|
||||||
|
files = list(backup_dir.glob("chat-*.db"))
|
||||||
|
if not files:
|
||||||
|
return True
|
||||||
|
most_recent = max(files, key=lambda f: f.stat().st_mtime)
|
||||||
|
age_hours = (
|
||||||
|
datetime.now().timestamp() - most_recent.stat().st_mtime
|
||||||
|
) / 3600
|
||||||
|
return age_hours >= FRESHNESS_HOURS
|
||||||
@@ -0,0 +1,92 @@
|
|||||||
|
"""Tests for nightly DB backups (T32).
|
||||||
|
|
||||||
|
The backup service is intentionally simple: a flat ``data/backups/`` dir
|
||||||
|
containing timestamped copies of ``chat.db``, with retention of the most
|
||||||
|
recent 14. The scheduling decision (``should_take_backup``) is a pure
|
||||||
|
function of clock + filesystem state so it can be unit-tested without
|
||||||
|
spinning up the BackgroundWorker tick loop.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from chat.services.backup import (
|
||||||
|
prune_backups,
|
||||||
|
should_take_backup,
|
||||||
|
take_backup,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_take_backup_creates_timestamped_copy(tmp_path):
|
||||||
|
db = tmp_path / "chat.db"
|
||||||
|
db.write_text("fake db contents")
|
||||||
|
backup_path = take_backup(db_path=db, data_dir=tmp_path / "data")
|
||||||
|
assert backup_path.exists()
|
||||||
|
assert backup_path.name.startswith("chat-")
|
||||||
|
assert backup_path.name.endswith(".db")
|
||||||
|
# Contents copied
|
||||||
|
assert backup_path.read_text() == "fake db contents"
|
||||||
|
# Located in data/backups/
|
||||||
|
assert backup_path.parent == tmp_path / "data" / "backups"
|
||||||
|
|
||||||
|
|
||||||
|
def test_prune_keeps_last_14(tmp_path):
|
||||||
|
backup_dir = tmp_path / "data" / "backups"
|
||||||
|
backup_dir.mkdir(parents=True)
|
||||||
|
# Create 17 dummy backup files spanning days 1..17 of Jan 2026.
|
||||||
|
# Filenames sort lexicographically by the embedded timestamp, so
|
||||||
|
# prune_backups should drop the three oldest.
|
||||||
|
for i in range(1, 18):
|
||||||
|
(backup_dir / f"chat-202601{i:02d}T000000Z.db").write_text(
|
||||||
|
f"backup {i}"
|
||||||
|
)
|
||||||
|
removed = prune_backups(tmp_path / "data", keep=14)
|
||||||
|
assert removed == 3
|
||||||
|
remaining = sorted(backup_dir.glob("chat-*.db"))
|
||||||
|
assert len(remaining) == 14
|
||||||
|
# Days 1, 2, 3 removed; day 4 is now the oldest retained backup.
|
||||||
|
assert remaining[0].name == "chat-20260104T000000Z.db"
|
||||||
|
|
||||||
|
|
||||||
|
def test_should_take_backup_when_no_prior_and_target_hour_matches(tmp_path):
|
||||||
|
from chat.services import backup as backup_mod
|
||||||
|
|
||||||
|
class FakeDateTime(datetime):
|
||||||
|
@classmethod
|
||||||
|
def now(cls, tz=None):
|
||||||
|
return datetime(2026, 4, 26, 3, 0, 0)
|
||||||
|
|
||||||
|
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||||
|
assert should_take_backup(tmp_path / "data") is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_should_not_take_backup_outside_target_hour(tmp_path):
|
||||||
|
from chat.services import backup as backup_mod
|
||||||
|
|
||||||
|
class FakeDateTime(datetime):
|
||||||
|
@classmethod
|
||||||
|
def now(cls, tz=None):
|
||||||
|
return datetime(2026, 4, 26, 14, 0, 0)
|
||||||
|
|
||||||
|
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||||
|
assert should_take_backup(tmp_path / "data") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_should_not_take_backup_when_recent_backup_exists(tmp_path):
|
||||||
|
backup_dir = tmp_path / "data" / "backups"
|
||||||
|
backup_dir.mkdir(parents=True)
|
||||||
|
recent = backup_dir / "chat-recent.db"
|
||||||
|
recent.write_text("x")
|
||||||
|
# mtime defaults to "now" — within the 23h freshness window so
|
||||||
|
# should_take_backup must return False even at the target hour.
|
||||||
|
from chat.services import backup as backup_mod
|
||||||
|
|
||||||
|
class FakeDateTime(datetime):
|
||||||
|
@classmethod
|
||||||
|
def now(cls, tz=None):
|
||||||
|
return datetime(2026, 4, 26, 3, 0, 0)
|
||||||
|
|
||||||
|
with patch.object(backup_mod, "datetime", FakeDateTime):
|
||||||
|
assert should_take_backup(tmp_path / "data") is False
|
||||||
Reference in New Issue
Block a user