fix: workers retry-on-lock so they don't drop writes under busy_timeout=100ms

The previous commit dropped open_db's busy_timeout from 5s to 100ms
to prevent the embedding worker from GIL-blocking the asyncio event
loop and silently adding 5s to every state_update LLM call. That fixed
the chat path but broke worker durability: any worker write that
collided with the request handler's brief open transaction failed
with 'database is locked' instead of waiting.

Adds append_and_apply_with_retry in chat/eventlog/log.py — same
contract as append_and_apply but runs through a conn_factory and
retries with exponential backoff (50ms..500ms, ~10s total budget) on
'database is locked'. Returns None and logs WARNING if all retries
fail; callers handle that as a no-op.

Wires it into:
- embedding_worker._process for embedding_indexed events
- background._process for memory_significance_set events (auto-pin
  still uses a direct open_db when the score warrants it; that one
  is fast and not racy in practice)

Verified live: ran 4 back-to-back chat turns, zero worker errors,
embeddings + significance landing correctly. Suite: 464 passed in
11.5s.
This commit is contained in:
Joseph Doherty
2026-04-27 14:04:27 -04:00
parent de7f6624f0
commit a902d86432
3 changed files with 87 additions and 25 deletions
+53 -2
View File
@@ -1,8 +1,13 @@
from __future__ import annotations
import asyncio
import json
import logging
from dataclasses import dataclass
from typing import Any, Iterator
from sqlite3 import Connection
from typing import Any, Callable, ContextManager, Iterator
from sqlite3 import Connection, OperationalError
_log = logging.getLogger(__name__)
@dataclass
@@ -63,6 +68,52 @@ def append_and_apply(
return eid
async def append_and_apply_with_retry(
conn_factory: Callable[[], ContextManager[Connection]],
*,
kind: str,
payload: dict[str, Any],
branch_id: int = 1,
attempts: int = 30,
base_sleep_s: float = 0.05,
max_sleep_s: float = 0.5,
) -> int | None:
"""Append-and-apply that retries on ``database is locked``.
Background workers (embedding indexer, significance scorer) write
events to the same SQLite file as the request handler. The chat
app sets a tight ``busy_timeout=100ms`` on every connection so a
contending worker can't freeze the request's asyncio event loop.
This helper restores durability for workers: it retries up to
``attempts`` times with exponential backoff (capped at
``max_sleep_s``) until the lock clears.
Returns the appended event's id, or ``None`` if all retries failed
(logged at WARNING). Each retry opens a fresh connection via
``conn_factory`` because the failed write may have left the prior
connection in an unusable state.
"""
sleep = base_sleep_s
for attempt in range(attempts):
try:
with conn_factory() as conn:
return append_and_apply(
conn, kind=kind, payload=payload, branch_id=branch_id
)
except OperationalError as exc:
if "database is locked" not in str(exc).lower():
raise
if attempt == attempts - 1:
_log.warning(
"append_and_apply_with_retry: gave up after %d attempts "
"(kind=%s): %s",
attempts, kind, exc,
)
return None
await asyncio.sleep(sleep)
sleep = min(sleep * 2, max_sleep_s)
def read_events(conn: Connection, branch_id: int = 1, after_id: int = 0) -> Iterator[Event]:
cur = conn.execute(
"SELECT id, branch_id, ts, kind, payload_json, superseded_by, hidden "
+17 -11
View File
@@ -30,7 +30,7 @@ from typing import Callable
from chat.config import Settings
from chat.db.connection import open_db
from chat.eventlog.log import append_and_apply
from chat.eventlog.log import append_and_apply, append_and_apply_with_retry
from chat.llm.client import LLMClient
from chat.services.backup import (
prune_backups,
@@ -169,16 +169,22 @@ class BackgroundWorker:
narrative_text=job.narrative_text,
prior_dialogue=job.prior_dialogue,
)
with open_db(self._settings.db_path) as conn:
append_and_apply(
conn,
kind="memory_significance_set",
payload={
"memory_id": job.memory_id,
"significance": score,
},
)
if score >= 3:
# Retry-on-lock: see chat/eventlog/log.py's
# ``append_and_apply_with_retry`` docstring for why workers
# need to retry while the request handler's open transaction
# holds the WAL write lock briefly.
appended_id = await append_and_apply_with_retry(
lambda: open_db(self._settings.db_path),
kind="memory_significance_set",
payload={
"memory_id": job.memory_id,
"significance": score,
},
)
# Auto-pin requires a separate connection because retry-helper
# closed its own. Skip if the significance event itself failed.
if appended_id is not None and score >= 3:
with open_db(self._settings.db_path) as conn:
_auto_pin_with_cap(
conn,
owner_id=job.host_bot_id,
+17 -12
View File
@@ -26,7 +26,7 @@ from dataclasses import dataclass
from sqlite3 import Connection
from typing import Callable
from chat.eventlog.log import append_and_apply
from chat.eventlog.log import append_and_apply_with_retry
from chat.services.embeddings import (
DEFAULT_EMBEDDING_DIM,
DEFAULT_EMBEDDING_MODEL,
@@ -121,17 +121,22 @@ class EmbeddingWorker:
job.memory_id,
)
return
with self._conn_factory() as conn:
append_and_apply(
conn,
kind="embedding_indexed",
payload={
"memory_id": job.memory_id,
"model": result.model,
"dim": result.dim,
"vector": result.vector,
},
)
# Retry-on-lock: the request handler holds an open transaction
# for the duration of post_turn (a few seconds), so any worker
# write started during that window blocks. open_db's
# busy_timeout is 100ms (so the request path itself can't get
# stuck on a worker), so retry here with backoff. Each retry
# opens a fresh connection via ``conn_factory``.
await append_and_apply_with_retry(
self._conn_factory,
kind="embedding_indexed",
payload={
"memory_id": job.memory_id,
"model": result.model,
"dim": result.dim,
"vector": result.vector,
},
)
__all__ = ["EmbeddingJob", "EmbeddingWorker"]