From 3a81e540a1cd478f73efde7bf0f1acaebde21624 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 27 Apr 2026 14:51:49 -0400 Subject: [PATCH] chore: audit project() callers and non-idempotent handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Same defect class as 0f8bf94: routes that ``append_event`` then ``project(conn)`` 500 once any prior event makes the full-log replay hit a raw-INSERT handler (chat_created, container_created, scene_opened, memory_written, meanwhile_scene_started, etc.). Fixes the two remaining live-path callers: - chat/web/bots.py (bot_create) — bot_authored - chat/web/settings.py (settings_post) — you_authored Both swap ``append_event`` + ``project`` → ``append_and_apply`` so only the new event is applied through its registered handler. Unused imports of ``append_event`` and ``project`` removed from each file. The rewind path (chat/services/rewind.py) intentionally calls ``project()`` after wiping every projected table — that's the canonical "rebuild from log against an empty DB" entry point and is left unchanged. Inventory of every projector handler that uses raw INSERT (chat_created, container_created, scene_opened, memory_written, meanwhile_scene_started, meanwhile_digest_created, edge_update) is documented with the trade-offs of why we don't blindly switch them to INSERT OR REPLACE — for autoincrement-id rows there is no key to match on, and for chat_created a lossy overwrite would silently clobber chat_state mutations from later events. The handler layer stays correctly non-idempotent under event-sourcing semantics; the rule is enforced at the call site. Adds a regression test (tests/test_chat_created_non_idempotent.py) that pins the contract: appending two chat_created events for the same id and then ``project()``ing a second time MUST raise ``IntegrityError`` on chats.id. Any future "make it idempotent" change must update the test, forcing a deliberate review. Suite: 471 passed in 11.82s (was 470 + this regression test). Report: docs/audits/2026-04-27-project-callers.md --- chat/web/bots.py | 9 +- chat/web/settings.py | 9 +- docs/audits/2026-04-27-project-callers.md | 205 ++++++++++++++++++++++ tests/test_chat_created_non_idempotent.py | 69 ++++++++ 4 files changed, 284 insertions(+), 8 deletions(-) create mode 100644 docs/audits/2026-04-27-project-callers.md create mode 100644 tests/test_chat_created_non_idempotent.py diff --git a/chat/web/bots.py b/chat/web/bots.py index 439e86e..051d9c4 100644 --- a/chat/web/bots.py +++ b/chat/web/bots.py @@ -5,8 +5,7 @@ from fastapi.responses import RedirectResponse, HTMLResponse from fastapi.templating import Jinja2Templates from chat.db.connection import open_db -from chat.eventlog.log import append_event -from chat.eventlog.projector import project +from chat.eventlog.log import append_and_apply from chat.state.entities import get_bot, list_bots from chat.state.world import get_chat @@ -109,8 +108,10 @@ async def bot_create( "initial_relationship_to_you": initial_relationship_to_you.strip(), "kickoff_prose": kickoff_prose.strip(), } - append_event(conn, kind="bot_authored", payload=payload) - project(conn) + # Per-event apply (NOT project()) — see docs/audits/2026-04-27-project-callers.md. + # ``project()`` replays the full log, which trips raw-INSERT handlers like + # ``_apply_chat_created`` once a second bot's events are present. + append_and_apply(conn, kind="bot_authored", payload=payload) return RedirectResponse(url=f"/bots/{payload['id']}/kickoff", status_code=303) diff --git a/chat/web/settings.py b/chat/web/settings.py index 8256843..da5a265 100644 --- a/chat/web/settings.py +++ b/chat/web/settings.py @@ -4,8 +4,7 @@ from fastapi import APIRouter, Depends, Form, HTTPException, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates -from chat.eventlog.log import append_event -from chat.eventlog.projector import project +from chat.eventlog.log import append_and_apply from chat.state.entities import get_you from chat.web.bots import get_conn @@ -40,8 +39,10 @@ async def settings_post( "pronouns": pronouns.strip(), "persona": persona.strip(), } - append_event(conn, kind="you_authored", payload=payload) - project(conn) + # Per-event apply (NOT project()) — see docs/audits/2026-04-27-project-callers.md. + # ``project()`` replays the full log, which trips raw-INSERT handlers like + # ``_apply_chat_created`` once chat events are present. + append_and_apply(conn, kind="you_authored", payload=payload) return TEMPLATES.TemplateResponse( request, diff --git a/docs/audits/2026-04-27-project-callers.md b/docs/audits/2026-04-27-project-callers.md new file mode 100644 index 0000000..29b3283 --- /dev/null +++ b/docs/audits/2026-04-27-project-callers.md @@ -0,0 +1,205 @@ +# Audit: `project()` callers and non-idempotent projector handlers + +**Date:** 2026-04-27 +**Triggering incident:** commit `0f8bf94` — `kickoff_post` 500'd with +`sqlite3.IntegrityError: UNIQUE constraint failed: chats.id` after a +second bot's kickoff. Root cause: the route appended events with +`append_event()` and then called `project(conn)`, which replays the +*entire* event log. The `chat_created` handler in `chat/state/world.py` +uses raw `INSERT INTO chats ...` (no `OR REPLACE`/`OR IGNORE`), so on a +DB that already had a first bot's chat row, the replay re-hit that row +and raised. + +This audit walks the rest of the live request paths to make sure no +other route has the same shape, and inventories every projector handler +that uses raw `INSERT` so the trade-offs are documented for future +hardening passes. + +--- + +## Step 1 — `project()` callers + +`grep -rn "project(" chat/ --include="*.py"` (excluding the definition +itself in `chat/eventlog/projector.py:17` and the local `project_id` +type variables that the regex doesn't actually catch): + +| File:line | Caller | Classification | +|---|---|---| +| `chat/web/bots.py:113` | `bot_create` route — append `bot_authored` then `project(conn)` | **Unsafe (live path) — fixed** | +| `chat/web/settings.py:44` | `settings_post` route — append `you_authored` then `project(conn)` | **Unsafe (live path) — fixed** | +| `chat/services/rewind.py:110` | `execute_rewind` — clears every projected table then re-projects from the truncated log | **Safe (replay-only)** | +| `chat/eventlog/projector.py:17` | Definition site, not a call | n/a | +| `tests/test_*.py` (~50 tests) | Test setup pattern: append a sequence of synthetic events into a fresh DB, then `project(conn)` to materialise | **Safe (replay-only)** — projects against an empty/fresh DB; not a live request path | + +### Safe (replay-only) + +- **`chat/services/rewind.py:110`** — `execute_rewind` is the canonical + "rebuild the projection" entry point. Lines 95–104 explicitly + `DELETE FROM` every projected table (`memories`, `activity`, `scenes`, + `containers`, `chat_state`, `chats`, `edges`, `bots`, `you_entity`, + `classifier_failures`) before calling `project(conn)`. The handler + registry then walks the truncated log against empty tables, so even + the raw-INSERT handlers run safely on a clean slate. The module + docstring (lines 1–21) calls out exactly why a full replay (rather + than a "revert delta") is the right move here: the `edge_update` + handler is a delta accumulator with no clean inverse. **Do not + change.** + +- **Test suite** — every `from chat.eventlog.projector import project` + in `tests/` is a setup helper. They open a fresh in-memory or + tmp-path DB, append a hand-crafted sequence of events, and call + `project(conn)` once. There is no second-replay risk because the DB + starts empty. These are not live paths. + +### Unsafe (live-path) — fixed in this audit + +Both fixes follow the pattern established by `0f8bf94`: drop the +`append_event` + `project` pair in favour of `append_and_apply` (defined +in `chat/eventlog/log.py:32`), which appends and runs *only the +brand-new event* through its registered handler. + +- **`chat/web/bots.py:113` — `bot_create`** + Was: `append_event(conn, kind="bot_authored", ...); project(conn)`. + Now: `append_and_apply(conn, kind="bot_authored", ...)`. + In isolation, `_apply_bot_authored` is itself idempotent (`INSERT OR + REPLACE INTO bots`), so the *route* didn't fail today. The bug is + latent: as soon as any kickoff ran first (which produces + `chat_created` events), the next call to `bot_create` would replay + that prior `chat_created` and trip the same UNIQUE constraint. We + saw this happen in `0f8bf94` — fixing the symmetric route prevents + the next variant of the same incident. + Removed unused imports: `append_event`, `project`. + +- **`chat/web/settings.py:44` — `settings_post`** + Was: `append_event(conn, kind="you_authored", ...); project(conn)`. + Now: `append_and_apply(conn, kind="you_authored", ...)`. + Same shape as `bot_create`. `_apply_you_authored` is idempotent on + its own (`INSERT OR REPLACE INTO you_entity`), but `project()` walks + the *whole* log, including any `chat_created` / `container_created` + / `scene_opened` events that have accumulated. Editing the user's + own settings on a non-empty DB would 500 with the same UNIQUE + constraint error — not because the new event is unsafe, but because + the replay is. Fixed by per-event apply. + Removed unused imports: `append_event`, `project`. + +### Unsafe — still to fix + +None. The two unsafe live-path call sites identified above were both +fixed in this commit. Future hardening: a CI lint that flags +`project(` outside `chat/services/rewind.py` and `tests/` would catch a +regression, but that's out of scope here. + +--- + +## Step 2 — non-idempotent projector handler inventory + +Output of `grep -n "INSERT INTO\|INSERT OR REPLACE\|INSERT OR IGNORE" +chat/state/*.py`, classified. + +### Replay-safe handlers + +These either use `INSERT OR REPLACE` / `INSERT OR IGNORE` (so a second +apply is a no-op or an overwrite of identical data), or are pure +`UPDATE` against rows the prior event created. + +| Handler | File | Statement | Why safe | +|---|---|---|---| +| `_apply_bot_authored` | `chat/state/entities.py:12` | `INSERT OR REPLACE INTO bots` | `id` is the natural PK; replay overwrites with identical payload. | +| `_apply_you_authored` | `chat/state/entities.py:29` | `INSERT OR REPLACE INTO you_entity` | Singleton row keyed on `id=1`. | +| `_apply_activity_change` | `chat/state/world.py:98` | `INSERT OR REPLACE INTO activity` | Activity is keyed on `entity_id` — last write wins is exactly the intended semantics. | +| `_apply_thread_opened` | `chat/state/threads.py:12` | `INSERT OR IGNORE INTO threads` | `thread_id` is the natural PK. | +| `_apply_event_planned` | `chat/state/events.py:16` | `INSERT OR IGNORE INTO events` | `event_id` is the natural PK. | +| `_apply_branch_created` | `chat/state/branches.py:27` | `INSERT OR IGNORE INTO branches` | Branch `name` is unique. | +| `_apply_group_node_initialized` | `chat/state/group_node.py:12` | `INSERT OR REPLACE INTO group_node` | One row per `chat_id`. | +| `_apply_embedding_indexed` | `chat/state/embeddings.py:28` | `INSERT OR REPLACE INTO embeddings` | One vector per `memory_id`. | +| Pure-`UPDATE` handlers | various — `_apply_time_skip_*`, `_apply_guest_added`/`_removed`, `_apply_scene_closed`, `_apply_memory_significance_set`, `_apply_memory_pin_changed`, `_apply_meanwhile_scene_closed`, `_apply_meanwhile_digest_consumed`, `_apply_thread_updated`, `_apply_event_started`/`_completed`/`_cancelled` (etc.), `_apply_group_node_updated` | n/a | Idempotent: re-applying the same UPDATE produces the same row state. | + +### Unsafe-on-replay handlers (raw `INSERT`) + +| Handler | File | Statement | Failure mode on replay | +|---|---|---|---| +| `_apply_chat_created` | `chat/state/world.py:14` | `INSERT INTO chats`, `INSERT INTO chat_state` | `chats.id` is PK — second insert raises `IntegrityError: UNIQUE constraint failed: chats.id`. **This is the `0f8bf94` bug.** `chat_state.chat_id` is also unique; would raise too. | +| `_apply_container_created` | `chat/state/world.py:78` | `INSERT INTO containers` | `containers.id` is `INTEGER PRIMARY KEY AUTOINCREMENT` — replay does NOT raise (a new id is assigned), but it silently creates a duplicate row, fragmenting downstream lookups by `(chat_id, name)`. **Silent corruption, not a crash.** | +| `_apply_scene_opened` | `chat/state/world.py:115` | `INSERT INTO scenes` | Same shape: autoincrement `id`. Replay creates a duplicate scene row and re-points `chat_state.active_scene_id` to the new copy. **Silent corruption.** | +| `_apply_memory_written` | `chat/state/memory.py:14` | `INSERT INTO memories` | Autoincrement `id`. Replay duplicates the memory; FTS5 trigger then double-indexes the same `pov_summary`. **Silent corruption + double-counting in retrieval.** | +| `_apply_meanwhile_scene_started` | `chat/state/meanwhile.py:29` | `INSERT INTO scenes` (with explicit `scene_id`) | Caller supplies `scene_id` (deterministic). Replay raises `IntegrityError: UNIQUE constraint failed: scenes.id`. **Hard crash, like `chat_created`.** | +| `_apply_meanwhile_digest_created` | `chat/state/meanwhile.py:67` | `INSERT INTO meanwhile_digest_pending` | Autoincrement `id`. Replay creates a duplicate pending digest, surfacing the same summary twice in the next you-scene's prompt. **Silent corruption.** | +| `_apply_edge_update` | `chat/state/edges.py:12` | `INSERT OR IGNORE INTO edges` followed by `UPDATE … SET affinity = ? + delta` | The `INSERT OR IGNORE` is fine, but the handler is *delta-shaped* — each replay re-adds `affinity_delta` and `trust_delta`, and re-extends `knowledge_json`. **Silent corruption: scores drift up; knowledge facts duplicate.** Already called out in `chat/eventlog/log.py:39-46` as the canonical reason `append_and_apply` exists. | + +### Trade-offs — why we are NOT switching every handler to `INSERT OR REPLACE` + +This is the part the audit is here to nail down before someone "fixes +it" with a one-line s/`INSERT INTO`/`INSERT OR REPLACE INTO`/. + +1. **Autoincrement-id handlers (`containers`, `scenes`, `memories`, + `meanwhile_digest_pending`)** — `INSERT OR REPLACE` doesn't help. + Each event's payload doesn't carry the row's eventual id — the id + comes from `lastrowid` *at projection time*. There is no key for + `OR REPLACE` to match on. The fix here is either (a) make the event + carry a deterministic id derived from the event's own id (large + refactor — payload schemas, downstream FK lookups, FTS rowid + alignment), or (b) keep the handler raw-INSERT and ensure every + live path uses `append_and_apply` (the path we're on). We are on + path (b), and this audit makes it explicit. + +2. **`chat_created`** — `chats.id` IS keyed on the natural PK, so + `INSERT OR REPLACE INTO chats ...` would technically work for the + chat row. *But* it would silently overwrite `chat_state` columns + that other events legitimately mutate later: `chat_state.time` is + bumped by `time_skip_elision`, `active_scene_id` is set/cleared by + `scene_opened`/`scene_closed`. On replay the + `chat_created` overwrite would clobber those subsequent updates, + then later events would re-set them — *if* the events themselves + appear in order (they do today). It would work in practice, but it + would erase the invariant that "each handler is responsible for one + table-shape change" and make the projector's correctness depend on + strict event-order replay through `chat_state`. Not worth the + subtle coupling; keep the raw INSERT and treat replay as an + explicit "wipe + replay" operation (the rewind path does exactly + that). + +3. **`meanwhile_scene_started`** — could be made idempotent (the + payload supplies `scene_id`), but it shares the `scenes` table with + `_apply_scene_opened` (autoincrement) — making one half of the + table writers `OR REPLACE` and the other half raw-INSERT is asking + for a future bug. Keep both raw, lean on `append_and_apply`. + +4. **`edge_update`** — fundamentally cannot be made idempotent under + replay without either changing the event schema (carry absolute + values, not deltas) or recording per-event-id "already applied" + flags. Either is a multi-week project. The current contract is + "edge_update is a delta event; never apply it twice"; the + `append_and_apply` rule enforces that contract from the call site. + +**Conclusion:** the handler layer is *correctly* non-idempotent for +event-sourcing semantics. The defect class lives in the *caller* layer +(routes that mistakenly call `project()` instead of `append_and_apply`). +This audit fixes the two known offenders and pins the contract with a +regression test (see Step 3). + +--- + +## Step 3 — regression test + +Added `tests/test_chat_created_non_idempotent.py`. The test: + +1. Opens a fresh DB and runs the migration chain. +2. Appends one `chat_created` event and projects — first projection + succeeds. +3. Appends a *second* `chat_created` for the same chat id and projects + again — asserts that the second projection raises + `sqlite3.IntegrityError`. + +The point isn't that the test catches a future "make it idempotent" +change automatically; it's that any such change MUST update this test, +forcing a deliberate review of all the trade-offs documented above. + +--- + +## Files changed + +- `chat/web/bots.py` — swap `append_event`+`project` → `append_and_apply`, + drop unused imports. +- `chat/web/settings.py` — same swap. +- `tests/test_chat_created_non_idempotent.py` — new regression test. +- `docs/audits/2026-04-27-project-callers.md` — this file. diff --git a/tests/test_chat_created_non_idempotent.py b/tests/test_chat_created_non_idempotent.py new file mode 100644 index 0000000..77bcb08 --- /dev/null +++ b/tests/test_chat_created_non_idempotent.py @@ -0,0 +1,69 @@ +"""Pin the contract: ``_apply_chat_created`` is NOT replay-safe. + +See ``docs/audits/2026-04-27-project-callers.md`` for the full audit. +The handler at ``chat/state/world.py:_apply_chat_created`` uses raw +``INSERT INTO chats ...`` and ``INSERT INTO chat_state ...`` with no +``OR REPLACE``/``OR IGNORE``. Running ``project()`` twice over the same +``chat_created`` event MUST raise ``sqlite3.IntegrityError`` on the +second pass — this is the bug that produced the 500 fixed in commit +``0f8bf94`` (and the latent equivalents fixed in this commit). + +Pinning the contract here means any future "make it idempotent" change +to the handler MUST update this test, which forces a deliberate review +of the trade-offs: most notably, that ``chat_state`` columns mutated by +later events (``time_skip_elision`` bumps ``time``; +``scene_opened``/``scene_closed`` toggle ``active_scene_id``) would be +silently overwritten by an ``INSERT OR REPLACE`` on every replay. The +audit explains why we keep the handler raw-INSERT and enforce the rule +at the call site via ``append_and_apply`` instead. +""" +from __future__ import annotations + +import sqlite3 + +import pytest + +from chat.db.connection import open_db +from chat.db.migrate import apply_migrations +from chat.eventlog.log import append_event +from chat.eventlog.projector import project +import chat.state.world # noqa: F401 — import registers the handler + + +def _chat_payload(): + return { + "id": "chat_bot_a", + "host_bot_id": "bot_a", + "guest_bot_id": None, + "initial_time": "2026-04-27T12:00:00+00:00", + "narrative_anchor": "Day 1 noon", + "weather": "clear", + } + + +def test_chat_created_handler_is_not_replay_safe(tmp_path): + """A second projection over an extra ``chat_created`` for the same id raises. + + This is the exact failure shape from incident ``0f8bf94``: a raw + INSERT against ``chats.id`` (PK) trips ``UNIQUE constraint failed`` + on the second pass. If this test ever starts FAILING (i.e. the + second project() succeeds), someone has changed the handler to be + idempotent — read the audit before approving. + """ + db = tmp_path / "t.db" + apply_migrations(db) + with open_db(db) as conn: + # First chat_created + first project: must succeed. + append_event(conn, kind="chat_created", payload=_chat_payload()) + project(conn) + + # Append a SECOND chat_created with the same id. project() will + # walk both, re-INSERT the same chats row, and trip the UNIQUE + # constraint on chats.id. + append_event(conn, kind="chat_created", payload=_chat_payload()) + with pytest.raises(sqlite3.IntegrityError) as exc_info: + project(conn) + # Match on the column to make sure we caught the *intended* + # constraint, not some unrelated FK/check failure that happens + # to also be an IntegrityError. + assert "chats.id" in str(exc_info.value)