fix: workers retry-on-lock so they don't drop writes under busy_timeout=100ms
The previous commit dropped open_db's busy_timeout from 5s to 100ms to prevent the embedding worker from GIL-blocking the asyncio event loop and silently adding 5s to every state_update LLM call. That fixed the chat path but broke worker durability: any worker write that collided with the request handler's brief open transaction failed with 'database is locked' instead of waiting. Adds append_and_apply_with_retry in chat/eventlog/log.py — same contract as append_and_apply but runs through a conn_factory and retries with exponential backoff (50ms..500ms, ~10s total budget) on 'database is locked'. Returns None and logs WARNING if all retries fail; callers handle that as a no-op. Wires it into: - embedding_worker._process for embedding_indexed events - background._process for memory_significance_set events (auto-pin still uses a direct open_db when the score warrants it; that one is fast and not racy in practice) Verified live: ran 4 back-to-back chat turns, zero worker errors, embeddings + significance landing correctly. Suite: 464 passed in 11.5s.
This commit is contained in:
+53
-2
@@ -1,8 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Iterator
|
from typing import Any, Callable, ContextManager, Iterator
|
||||||
from sqlite3 import Connection
|
from sqlite3 import Connection, OperationalError
|
||||||
|
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -63,6 +68,52 @@ def append_and_apply(
|
|||||||
return eid
|
return eid
|
||||||
|
|
||||||
|
|
||||||
|
async def append_and_apply_with_retry(
|
||||||
|
conn_factory: Callable[[], ContextManager[Connection]],
|
||||||
|
*,
|
||||||
|
kind: str,
|
||||||
|
payload: dict[str, Any],
|
||||||
|
branch_id: int = 1,
|
||||||
|
attempts: int = 30,
|
||||||
|
base_sleep_s: float = 0.05,
|
||||||
|
max_sleep_s: float = 0.5,
|
||||||
|
) -> int | None:
|
||||||
|
"""Append-and-apply that retries on ``database is locked``.
|
||||||
|
|
||||||
|
Background workers (embedding indexer, significance scorer) write
|
||||||
|
events to the same SQLite file as the request handler. The chat
|
||||||
|
app sets a tight ``busy_timeout=100ms`` on every connection so a
|
||||||
|
contending worker can't freeze the request's asyncio event loop.
|
||||||
|
This helper restores durability for workers: it retries up to
|
||||||
|
``attempts`` times with exponential backoff (capped at
|
||||||
|
``max_sleep_s``) until the lock clears.
|
||||||
|
|
||||||
|
Returns the appended event's id, or ``None`` if all retries failed
|
||||||
|
(logged at WARNING). Each retry opens a fresh connection via
|
||||||
|
``conn_factory`` because the failed write may have left the prior
|
||||||
|
connection in an unusable state.
|
||||||
|
"""
|
||||||
|
sleep = base_sleep_s
|
||||||
|
for attempt in range(attempts):
|
||||||
|
try:
|
||||||
|
with conn_factory() as conn:
|
||||||
|
return append_and_apply(
|
||||||
|
conn, kind=kind, payload=payload, branch_id=branch_id
|
||||||
|
)
|
||||||
|
except OperationalError as exc:
|
||||||
|
if "database is locked" not in str(exc).lower():
|
||||||
|
raise
|
||||||
|
if attempt == attempts - 1:
|
||||||
|
_log.warning(
|
||||||
|
"append_and_apply_with_retry: gave up after %d attempts "
|
||||||
|
"(kind=%s): %s",
|
||||||
|
attempts, kind, exc,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
await asyncio.sleep(sleep)
|
||||||
|
sleep = min(sleep * 2, max_sleep_s)
|
||||||
|
|
||||||
|
|
||||||
def read_events(conn: Connection, branch_id: int = 1, after_id: int = 0) -> Iterator[Event]:
|
def read_events(conn: Connection, branch_id: int = 1, after_id: int = 0) -> Iterator[Event]:
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"SELECT id, branch_id, ts, kind, payload_json, superseded_by, hidden "
|
"SELECT id, branch_id, ts, kind, payload_json, superseded_by, hidden "
|
||||||
|
|||||||
+17
-11
@@ -30,7 +30,7 @@ from typing import Callable
|
|||||||
|
|
||||||
from chat.config import Settings
|
from chat.config import Settings
|
||||||
from chat.db.connection import open_db
|
from chat.db.connection import open_db
|
||||||
from chat.eventlog.log import append_and_apply
|
from chat.eventlog.log import append_and_apply, append_and_apply_with_retry
|
||||||
from chat.llm.client import LLMClient
|
from chat.llm.client import LLMClient
|
||||||
from chat.services.backup import (
|
from chat.services.backup import (
|
||||||
prune_backups,
|
prune_backups,
|
||||||
@@ -169,16 +169,22 @@ class BackgroundWorker:
|
|||||||
narrative_text=job.narrative_text,
|
narrative_text=job.narrative_text,
|
||||||
prior_dialogue=job.prior_dialogue,
|
prior_dialogue=job.prior_dialogue,
|
||||||
)
|
)
|
||||||
with open_db(self._settings.db_path) as conn:
|
# Retry-on-lock: see chat/eventlog/log.py's
|
||||||
append_and_apply(
|
# ``append_and_apply_with_retry`` docstring for why workers
|
||||||
conn,
|
# need to retry while the request handler's open transaction
|
||||||
kind="memory_significance_set",
|
# holds the WAL write lock briefly.
|
||||||
payload={
|
appended_id = await append_and_apply_with_retry(
|
||||||
"memory_id": job.memory_id,
|
lambda: open_db(self._settings.db_path),
|
||||||
"significance": score,
|
kind="memory_significance_set",
|
||||||
},
|
payload={
|
||||||
)
|
"memory_id": job.memory_id,
|
||||||
if score >= 3:
|
"significance": score,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
# Auto-pin requires a separate connection because retry-helper
|
||||||
|
# closed its own. Skip if the significance event itself failed.
|
||||||
|
if appended_id is not None and score >= 3:
|
||||||
|
with open_db(self._settings.db_path) as conn:
|
||||||
_auto_pin_with_cap(
|
_auto_pin_with_cap(
|
||||||
conn,
|
conn,
|
||||||
owner_id=job.host_bot_id,
|
owner_id=job.host_bot_id,
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ from dataclasses import dataclass
|
|||||||
from sqlite3 import Connection
|
from sqlite3 import Connection
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
from chat.eventlog.log import append_and_apply
|
from chat.eventlog.log import append_and_apply_with_retry
|
||||||
from chat.services.embeddings import (
|
from chat.services.embeddings import (
|
||||||
DEFAULT_EMBEDDING_DIM,
|
DEFAULT_EMBEDDING_DIM,
|
||||||
DEFAULT_EMBEDDING_MODEL,
|
DEFAULT_EMBEDDING_MODEL,
|
||||||
@@ -121,17 +121,22 @@ class EmbeddingWorker:
|
|||||||
job.memory_id,
|
job.memory_id,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
with self._conn_factory() as conn:
|
# Retry-on-lock: the request handler holds an open transaction
|
||||||
append_and_apply(
|
# for the duration of post_turn (a few seconds), so any worker
|
||||||
conn,
|
# write started during that window blocks. open_db's
|
||||||
kind="embedding_indexed",
|
# busy_timeout is 100ms (so the request path itself can't get
|
||||||
payload={
|
# stuck on a worker), so retry here with backoff. Each retry
|
||||||
"memory_id": job.memory_id,
|
# opens a fresh connection via ``conn_factory``.
|
||||||
"model": result.model,
|
await append_and_apply_with_retry(
|
||||||
"dim": result.dim,
|
self._conn_factory,
|
||||||
"vector": result.vector,
|
kind="embedding_indexed",
|
||||||
},
|
payload={
|
||||||
)
|
"memory_id": job.memory_id,
|
||||||
|
"model": result.model,
|
||||||
|
"dim": result.dim,
|
||||||
|
"vector": result.vector,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["EmbeddingJob", "EmbeddingWorker"]
|
__all__ = ["EmbeddingJob", "EmbeddingWorker"]
|
||||||
|
|||||||
Reference in New Issue
Block a user