Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 2d1900bc8f | |||
| 50ab0c8229 | |||
| 49be3cf4b9 | |||
| a11255a5e6 | |||
| 3a81e540a1 | |||
| 0f8bf94d29 | |||
| 6d57fe88b4 | |||
| f775eb7e92 | |||
| e535a0181e | |||
| f7eec707a9 | |||
| 3b83786b8b | |||
| a902d86432 | |||
| de7f6624f0 | |||
| d656ee8805 | |||
| fe9c497038 | |||
| b3d78c1603 | |||
| a03f664407 |
@@ -5,6 +5,7 @@ data/
|
|||||||
|
|
||||||
# Python
|
# Python
|
||||||
.venv/
|
.venv/
|
||||||
|
.mlx-venv/
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
|||||||
+27
-4
@@ -72,17 +72,40 @@ async def lifespan(app: FastAPI):
|
|||||||
# (free / lower paid tiers cap at 2). Shared across all
|
# (free / lower paid tiers cap at 2). Shared across all
|
||||||
# FeatherlessClient instances in the process.
|
# FeatherlessClient instances in the process.
|
||||||
from chat.llm.featherless import FeatherlessClient
|
from chat.llm.featherless import FeatherlessClient
|
||||||
|
from chat.llm.local_mlx import LocalMLXClient
|
||||||
|
from chat.llm.router import RoutedLLMClient
|
||||||
|
|
||||||
FeatherlessClient.configure_concurrency(settings.featherless_max_concurrent)
|
FeatherlessClient.configure_concurrency(settings.featherless_max_concurrent)
|
||||||
|
LocalMLXClient.configure_concurrency(settings.local_mlx_max_concurrent)
|
||||||
|
|
||||||
# Background worker for the async significance pass (T22). Each job
|
# Background workers (significance scoring, embedding indexer)
|
||||||
# constructs a fresh FeatherlessClient via the factory; tests can
|
# construct a fresh client per job via the factory. Workers route
|
||||||
# disable enqueue by toggling ``app.state.background_worker.enabled``.
|
# through the same RoutedLLMClient as request-time traffic so the
|
||||||
|
# narrative model still goes to Featherless and the classifier +
|
||||||
|
# embeddings hit the local MLX server.
|
||||||
def _factory():
|
def _factory():
|
||||||
return FeatherlessClient(
|
narrative = FeatherlessClient(
|
||||||
api_key=settings.featherless_api_key,
|
api_key=settings.featherless_api_key,
|
||||||
base_url=settings.featherless_base_url,
|
base_url=settings.featherless_base_url,
|
||||||
)
|
)
|
||||||
|
classifier = None
|
||||||
|
if settings.classifier_provider_order:
|
||||||
|
classifier = FeatherlessClient(
|
||||||
|
api_key=settings.featherless_api_key,
|
||||||
|
base_url=settings.featherless_base_url,
|
||||||
|
default_extra_body={
|
||||||
|
"provider": {
|
||||||
|
"order": list(settings.classifier_provider_order)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
local = LocalMLXClient(base_url=settings.local_mlx_base_url)
|
||||||
|
return RoutedLLMClient(
|
||||||
|
narrative=narrative,
|
||||||
|
classifier=classifier,
|
||||||
|
local=local,
|
||||||
|
narrative_model=settings.narrative_model,
|
||||||
|
)
|
||||||
|
|
||||||
worker = BackgroundWorker(settings, llm_client_factory=_factory)
|
worker = BackgroundWorker(settings, llm_client_factory=_factory)
|
||||||
await worker.start()
|
await worker.start()
|
||||||
|
|||||||
+34
-12
@@ -23,13 +23,22 @@ class Settings(BaseModel):
|
|||||||
retrieval_k: int = 4
|
retrieval_k: int = 4
|
||||||
narrative_budget_hard: int = 8000
|
narrative_budget_hard: int = 8000
|
||||||
narrative_budget_soft: int = 6000
|
narrative_budget_soft: int = 6000
|
||||||
# Cap on each generated bot response. ~400 tokens ≈ 1–2 short paragraphs.
|
# Cap on each generated bot response. The asterisk-action format
|
||||||
# Bump if you want longer scenes; drop to 200 for terse banter.
|
# (see ``_closing_instruction`` in chat/services/prompt.py) targets
|
||||||
narrative_max_tokens: int = 400
|
# 2-3 short interleaved action+dialogue beats. Verbose roleplay
|
||||||
|
# narrators (Cydonia, Magnum) ignore the prompt's cap and keep
|
||||||
|
# going; ``trim_to_max_beats`` in chat/services/prompt.py handles
|
||||||
|
# the actual cap by trimming at a beat boundary post-stream. This
|
||||||
|
# max_tokens setting just gives the third beat enough room to
|
||||||
|
# complete naturally before max_tokens cuts mid-action: 160 fits
|
||||||
|
# 3 substantive beats with margin. Bump to 250 for longer scenes;
|
||||||
|
# drop to 80 for terse banter.
|
||||||
|
narrative_max_tokens: int = 160
|
||||||
# Sampling temperature for narrative generation. 0.7 = grounded /
|
# Sampling temperature for narrative generation. 0.7 = grounded /
|
||||||
# consistent; 0.85 = creative-but-in-character (default); 1.0 = wide
|
# instruction-compliant (current — Cydonia is verbose-by-default and
|
||||||
# variety, can drift; >1.0 = often off-the-rails.
|
# tighter temperature helps it respect the 2-3-beat cap);
|
||||||
narrative_temperature: float = 0.85
|
# 0.85 = creative; 1.0 = wide variety; >1.0 = often off-the-rails.
|
||||||
|
narrative_temperature: float = 0.7
|
||||||
classifier_budget_hard: int = 4000
|
classifier_budget_hard: int = 4000
|
||||||
classifier_timeout_s: float = 30.0
|
classifier_timeout_s: float = 30.0
|
||||||
# Featherless free tier and lower paid tiers cap concurrent connections.
|
# Featherless free tier and lower paid tiers cap concurrent connections.
|
||||||
@@ -39,13 +48,26 @@ class Settings(BaseModel):
|
|||||||
data_dir: Path = REPO_ROOT / "data"
|
data_dir: Path = REPO_ROOT / "data"
|
||||||
bind_host: str = "127.0.0.1"
|
bind_host: str = "127.0.0.1"
|
||||||
bind_port: int = 8000
|
bind_port: int = 8000
|
||||||
|
# Local MLX server (e.g. ``mlx-omni-server``) — serves any model
|
||||||
|
# whose id starts with one of ``local_prefixes`` (default
|
||||||
|
# ``"mlx-community/"``). The :class:`RoutedLLMClient` inspects the
|
||||||
|
# ``model`` kwarg at call time: local-prefix -> local, else -> remote.
|
||||||
|
# ``embed()`` always routes local.
|
||||||
|
local_mlx_base_url: str = "http://127.0.0.1:10240/v1"
|
||||||
|
local_mlx_max_concurrent: int = 1
|
||||||
|
# Optional OpenRouter-style provider pinning for the classifier
|
||||||
|
# client. Maps to the ``provider`` field on chat.completions.create
|
||||||
|
# via ``extra_body``; the FeatherlessClient (which is just an
|
||||||
|
# AsyncOpenAI wrapper) merges it into every call. Useful for forcing
|
||||||
|
# Llama-3.1-8B classifier traffic onto Cerebras (~423 tok/s, 10x
|
||||||
|
# the default Nebius). Empty list = no pin (provider is
|
||||||
|
# OpenRouter's choice).
|
||||||
|
classifier_provider_order: list[str] = Field(default_factory=list)
|
||||||
# T112 (Phase 4.5): embedding model identifier. Default is the
|
# T112 (Phase 4.5): embedding model identifier. Default is the
|
||||||
# deterministic local pseudo (semantically meaningless but keeps the
|
# deterministic local pseudo so fresh installs / tests don't need
|
||||||
# vector pipeline structurally valid). Swap to a real model name
|
# any external infra. Override via config.toml to a real model id
|
||||||
# (e.g. "bge-small-en-v1.5") once the LLMClient implementation
|
# (e.g. ``"mlx-community/bge-small-en-v1.5-bf16"``) once a local
|
||||||
# supports embed() — currently FeatherlessClient does NOT, so a
|
# MLX server is running.
|
||||||
# non-default value will trigger the zero-vector fallback path
|
|
||||||
# plus a T107 warning until a different provider is wired in.
|
|
||||||
embedding_model: str = "pseudo-sha256-384"
|
embedding_model: str = "pseudo-sha256-384"
|
||||||
|
|
||||||
def load_settings() -> Settings:
|
def load_settings() -> Settings:
|
||||||
|
|||||||
+14
-1
@@ -7,7 +7,20 @@ from pathlib import Path
|
|||||||
@contextmanager
|
@contextmanager
|
||||||
def open_db(path: Path, *, check_same_thread: bool = True):
|
def open_db(path: Path, *, check_same_thread: bool = True):
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
conn = sqlite3.connect(path, check_same_thread=check_same_thread)
|
# ``timeout`` here sets SQLite's busy_timeout, in seconds: how long
|
||||||
|
# ``conn.execute`` blocks when another connection holds the WAL
|
||||||
|
# write lock. The Python default is 5.0, which is fatal for the
|
||||||
|
# async chat app: ``conn.execute``'s busy-wait does NOT release the
|
||||||
|
# GIL, so a contending background worker (e.g. the embedding worker
|
||||||
|
# writing ``embedding_indexed`` while the request handler holds an
|
||||||
|
# open transaction) freezes the whole asyncio event loop for up to
|
||||||
|
# 5 seconds — silently turning every concurrent LLM call into a 5s
|
||||||
|
# wall-clock hit. 0.1s lets contending writers fail fast; callers
|
||||||
|
# that need durability should retry, and the embedding worker
|
||||||
|
# already logs failures so a missed embedding can be backfilled.
|
||||||
|
conn = sqlite3.connect(
|
||||||
|
path, check_same_thread=check_same_thread, timeout=0.1
|
||||||
|
)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
conn.execute("PRAGMA foreign_keys=ON")
|
conn.execute("PRAGMA foreign_keys=ON")
|
||||||
try:
|
try:
|
||||||
|
|||||||
+53
-2
@@ -1,8 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, Iterator
|
from typing import Any, Callable, ContextManager, Iterator
|
||||||
from sqlite3 import Connection
|
from sqlite3 import Connection, OperationalError
|
||||||
|
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -63,6 +68,52 @@ def append_and_apply(
|
|||||||
return eid
|
return eid
|
||||||
|
|
||||||
|
|
||||||
|
async def append_and_apply_with_retry(
|
||||||
|
conn_factory: Callable[[], ContextManager[Connection]],
|
||||||
|
*,
|
||||||
|
kind: str,
|
||||||
|
payload: dict[str, Any],
|
||||||
|
branch_id: int = 1,
|
||||||
|
attempts: int = 30,
|
||||||
|
base_sleep_s: float = 0.05,
|
||||||
|
max_sleep_s: float = 0.5,
|
||||||
|
) -> int | None:
|
||||||
|
"""Append-and-apply that retries on ``database is locked``.
|
||||||
|
|
||||||
|
Background workers (embedding indexer, significance scorer) write
|
||||||
|
events to the same SQLite file as the request handler. The chat
|
||||||
|
app sets a tight ``busy_timeout=100ms`` on every connection so a
|
||||||
|
contending worker can't freeze the request's asyncio event loop.
|
||||||
|
This helper restores durability for workers: it retries up to
|
||||||
|
``attempts`` times with exponential backoff (capped at
|
||||||
|
``max_sleep_s``) until the lock clears.
|
||||||
|
|
||||||
|
Returns the appended event's id, or ``None`` if all retries failed
|
||||||
|
(logged at WARNING). Each retry opens a fresh connection via
|
||||||
|
``conn_factory`` because the failed write may have left the prior
|
||||||
|
connection in an unusable state.
|
||||||
|
"""
|
||||||
|
sleep = base_sleep_s
|
||||||
|
for attempt in range(attempts):
|
||||||
|
try:
|
||||||
|
with conn_factory() as conn:
|
||||||
|
return append_and_apply(
|
||||||
|
conn, kind=kind, payload=payload, branch_id=branch_id
|
||||||
|
)
|
||||||
|
except OperationalError as exc:
|
||||||
|
if "database is locked" not in str(exc).lower():
|
||||||
|
raise
|
||||||
|
if attempt == attempts - 1:
|
||||||
|
_log.warning(
|
||||||
|
"append_and_apply_with_retry: gave up after %d attempts "
|
||||||
|
"(kind=%s): %s",
|
||||||
|
attempts, kind, exc,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
await asyncio.sleep(sleep)
|
||||||
|
sleep = min(sleep * 2, max_sleep_s)
|
||||||
|
|
||||||
|
|
||||||
def read_events(conn: Connection, branch_id: int = 1, after_id: int = 0) -> Iterator[Event]:
|
def read_events(conn: Connection, branch_id: int = 1, after_id: int = 0) -> Iterator[Event]:
|
||||||
cur = conn.execute(
|
cur = conn.execute(
|
||||||
"SELECT id, branch_id, ts, kind, payload_json, superseded_by, hidden "
|
"SELECT id, branch_id, ts, kind, payload_json, superseded_by, hidden "
|
||||||
|
|||||||
+30
-2
@@ -1,11 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
from typing import TypeVar
|
from typing import TypeVar
|
||||||
from pydantic import BaseModel, ValidationError
|
from pydantic import BaseModel, ValidationError
|
||||||
from .client import LLMClient, Message
|
from .client import LLMClient, Message
|
||||||
|
|
||||||
T = TypeVar("T", bound=BaseModel)
|
T = TypeVar("T", bound=BaseModel)
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
|
REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
|
||||||
|
|
||||||
@@ -31,6 +33,7 @@ async def classify(
|
|||||||
schema: type[T],
|
schema: type[T],
|
||||||
default: T | None = None,
|
default: T | None = None,
|
||||||
timeout_s: float = 10.0,
|
timeout_s: float = 10.0,
|
||||||
|
max_tokens: int = 512,
|
||||||
) -> T:
|
) -> T:
|
||||||
schema_json = json.dumps(schema.model_json_schema(), indent=2)
|
schema_json = json.dumps(schema.model_json_schema(), indent=2)
|
||||||
schema_block = (
|
schema_block = (
|
||||||
@@ -41,22 +44,47 @@ async def classify(
|
|||||||
Message(role="system", content=system + schema_block),
|
Message(role="system", content=system + schema_block),
|
||||||
Message(role="user", content=user),
|
Message(role="user", content=user),
|
||||||
]
|
]
|
||||||
|
# Cap output length so a misbehaving model (e.g. one that ignores
|
||||||
|
# ``response_format=json_object`` and generates prose) can't burn
|
||||||
|
# several seconds on tokens we'll never use. Classifier responses
|
||||||
|
# are small JSON objects — 512 tokens is generous; usual completions
|
||||||
|
# are 50-150.
|
||||||
|
last_text = None
|
||||||
|
last_error: BaseException | None = None
|
||||||
for attempt in range(3):
|
for attempt in range(3):
|
||||||
try:
|
try:
|
||||||
text = await asyncio.wait_for(
|
text = await asyncio.wait_for(
|
||||||
client.generate(msgs, model=model, response_format={"type": "json_object"}),
|
client.generate(
|
||||||
|
msgs,
|
||||||
|
model=model,
|
||||||
|
response_format={"type": "json_object"},
|
||||||
|
max_tokens=max_tokens,
|
||||||
|
),
|
||||||
timeout=timeout_s,
|
timeout=timeout_s,
|
||||||
)
|
)
|
||||||
|
last_text = text
|
||||||
cleaned = _strip_json_fences(text)
|
cleaned = _strip_json_fences(text)
|
||||||
if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
|
if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
|
||||||
raise ValueError("refusal-shaped response")
|
raise ValueError("refusal-shaped response")
|
||||||
return schema.model_validate_json(cleaned)
|
return schema.model_validate_json(cleaned)
|
||||||
except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError):
|
except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError) as exc:
|
||||||
|
last_error = exc
|
||||||
msgs[0] = Message(
|
msgs[0] = Message(
|
||||||
role="system",
|
role="system",
|
||||||
content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
|
content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
# Log when we're falling back so flapping classifiers are
|
||||||
|
# diagnosable without taking down the request.
|
||||||
|
snippet = (last_text or "")[:200].replace("\n", " ")
|
||||||
|
_log.warning(
|
||||||
|
"classify(%s) exhausted 3 attempts; last_error=%s last_text=%r; "
|
||||||
|
"falling back to %s",
|
||||||
|
schema.__name__,
|
||||||
|
type(last_error).__name__ if last_error else "?",
|
||||||
|
snippet,
|
||||||
|
"default" if default is not None else "RuntimeError (no default)",
|
||||||
|
)
|
||||||
if default is None:
|
if default is None:
|
||||||
raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
|
raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
|
||||||
return default
|
return default
|
||||||
|
|||||||
+61
-12
@@ -29,19 +29,60 @@ class FeatherlessClient:
|
|||||||
cls._semaphore = asyncio.Semaphore(2)
|
cls._semaphore = asyncio.Semaphore(2)
|
||||||
return cls._semaphore
|
return cls._semaphore
|
||||||
|
|
||||||
def __init__(self, api_key: str, base_url: str = "https://api.featherless.ai/v1"):
|
def __init__(
|
||||||
|
self,
|
||||||
|
api_key: str,
|
||||||
|
base_url: str = "https://api.featherless.ai/v1",
|
||||||
|
*,
|
||||||
|
default_extra_body: dict | None = None,
|
||||||
|
):
|
||||||
self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
||||||
|
# ``default_extra_body`` is merged into every chat.completions.create
|
||||||
|
# call's ``extra_body``. Useful with OpenRouter to pin specific
|
||||||
|
# upstream providers (e.g. ``{"provider": {"order": ["Cerebras"]}}``
|
||||||
|
# for 10x throughput on Llama-3.1-8B). Featherless ignores the
|
||||||
|
# field, so it's safe to leave set even when ``base_url`` points
|
||||||
|
# back at Featherless.
|
||||||
|
self._default_extra_body = default_extra_body or {}
|
||||||
|
|
||||||
|
def _merge_extra_body(self, params: dict) -> dict:
|
||||||
|
if not self._default_extra_body:
|
||||||
|
return params
|
||||||
|
eb = dict(self._default_extra_body)
|
||||||
|
eb.update(params.pop("extra_body", {}) or {})
|
||||||
|
params["extra_body"] = eb
|
||||||
|
return params
|
||||||
|
|
||||||
async def generate(self, messages: Sequence[Message], *, model: str, **params) -> str:
|
async def generate(self, messages: Sequence[Message], *, model: str, **params) -> str:
|
||||||
|
params = self._merge_extra_body(dict(params))
|
||||||
async with self._sem():
|
async with self._sem():
|
||||||
resp = await self._client.chat.completions.create(
|
resp = await self._client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
messages=[{"role": m.role, "content": m.content} for m in messages],
|
messages=[{"role": m.role, "content": m.content} for m in messages],
|
||||||
**params,
|
**params,
|
||||||
)
|
)
|
||||||
|
# Diagnostic: stash provider+usage on a side-channel for the
|
||||||
|
# router timing log to pick up. OpenRouter sticks a 'provider'
|
||||||
|
# field on the response (not part of the OAI spec, but the
|
||||||
|
# SDK passes it through on its model dict).
|
||||||
|
try: # pragma: no cover — diagnostic only
|
||||||
|
import os as _os
|
||||||
|
if _os.environ.get("CHAT_LLM_TIMING") == "1":
|
||||||
|
prov = getattr(resp, "provider", None)
|
||||||
|
usage = getattr(resp, "usage", None)
|
||||||
|
ct = getattr(usage, "completion_tokens", "?") if usage else "?"
|
||||||
|
pt = getattr(usage, "prompt_tokens", "?") if usage else "?"
|
||||||
|
import logging as _logging
|
||||||
|
_logging.getLogger("chat.llm.router").info(
|
||||||
|
" ↪ provider=%s prompt_toks=%s completion_toks=%s",
|
||||||
|
prov, pt, ct,
|
||||||
|
)
|
||||||
|
except Exception: # pragma: no cover
|
||||||
|
pass
|
||||||
return resp.choices[0].message.content or ""
|
return resp.choices[0].message.content or ""
|
||||||
|
|
||||||
async def stream(self, messages: Sequence[Message], *, model: str, **params) -> AsyncIterator[str]:
|
async def stream(self, messages: Sequence[Message], *, model: str, **params) -> AsyncIterator[str]:
|
||||||
|
params = self._merge_extra_body(dict(params))
|
||||||
async with self._sem():
|
async with self._sem():
|
||||||
stream = await self._client.chat.completions.create(
|
stream = await self._client.chat.completions.create(
|
||||||
model=model,
|
model=model,
|
||||||
@@ -55,24 +96,32 @@ class FeatherlessClient:
|
|||||||
yield delta
|
yield delta
|
||||||
|
|
||||||
async def embed(self, text: str, *, model: str) -> list[float]:
|
async def embed(self, text: str, *, model: str) -> list[float]:
|
||||||
"""Embeddings via Featherless — currently unsupported.
|
"""Embeddings via Featherless — unsupported in practice.
|
||||||
|
|
||||||
T112 (Phase 4.5) extends the LLMClient Protocol with ``embed()``
|
T112 (Phase 4.5) extends the LLMClient Protocol with ``embed()``
|
||||||
for a future real-embedding swap. Featherless's OpenAI-compatible
|
for a future real-embedding swap. Featherless's OpenAI-compatible
|
||||||
surface does NOT expose ``/v1/embeddings`` at the time of writing,
|
surface routes ``/v1/embeddings`` (no 404), but every request
|
||||||
so this implementation raises ``NotImplementedError`` rather than
|
returns HTTP 500 ``{"error": {"type": "completions_error", ...}}``
|
||||||
attempting a request that would 404. The
|
— including standard names like ``text-embedding-3-small`` and
|
||||||
|
``BAAI/bge-small-en-v1.5``. ``/v1/models`` confirms it: the
|
||||||
|
catalog has no embedding-class entries, only chat/completion
|
||||||
|
classes (``llama3-*``, ``gemma3-*``, ``glm5-*``, etc.).
|
||||||
|
|
||||||
|
Rather than ship a request that always 500s, this implementation
|
||||||
|
raises ``NotImplementedError``. The
|
||||||
:func:`chat.services.embeddings.generate_embedding` wrapper
|
:func:`chat.services.embeddings.generate_embedding` wrapper
|
||||||
catches this and degrades to the existing zero-vector fallback
|
catches it and degrades to the existing zero-vector fallback
|
||||||
(with the T107 warning), so misconfigured callers fail loudly in
|
(with the T107 warning), so misconfigured callers fail loudly in
|
||||||
logs but the request path keeps working.
|
logs but the request path keeps working.
|
||||||
|
|
||||||
If Featherless ships embeddings, swap the body for an
|
For real embeddings, configure a different provider (OpenAI
|
||||||
``self._client.embeddings.create(model=..., input=...)`` call
|
direct, Cohere, Voyage, Together, self-hosted Ollama /
|
||||||
guarded by ``self._sem()`` (mirrors ``generate``/``stream``).
|
sentence-transformers). The Mock + routing seam from T112 keeps
|
||||||
|
the swap to a one-class change in ``chat/llm/``.
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Featherless does not expose /v1/embeddings; "
|
"Featherless /v1/embeddings always returns 500 "
|
||||||
"configure a different embedding provider or stick with "
|
'("completions_error") and the model catalog has no '
|
||||||
"the default pseudo-sha256-384 model."
|
"embedding class; configure a different embedding provider "
|
||||||
|
"or stick with the default pseudo-sha256-384 model."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -0,0 +1,95 @@
|
|||||||
|
"""Local MLX OpenAI-compatible client.
|
||||||
|
|
||||||
|
Talks to a locally-running MLX server (e.g., ``mlx-omni-server``) over
|
||||||
|
the same OpenAI surface that :class:`chat.llm.featherless.FeatherlessClient`
|
||||||
|
uses, via :class:`openai.AsyncOpenAI`. The underlying server runs MLX
|
||||||
|
models on Apple Silicon (M-series) for chat completions AND embeddings.
|
||||||
|
|
||||||
|
Use cases (Phase 4.5+):
|
||||||
|
- Classifier traffic moved off Featherless to local MLX (cost + latency).
|
||||||
|
- Embeddings via ``client.embed`` actually work — Featherless's
|
||||||
|
``/v1/embeddings`` always returns 500.
|
||||||
|
|
||||||
|
Constructor takes a ``base_url`` (e.g., ``"http://127.0.0.1:10240/v1"``)
|
||||||
|
and an optional ``api_key`` (most local MLX servers don't authenticate;
|
||||||
|
the OpenAI SDK requires *some* string, so we default to a placeholder).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
from typing import AsyncIterator, Sequence
|
||||||
|
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
|
||||||
|
from .client import Message
|
||||||
|
|
||||||
|
|
||||||
|
class LocalMLXClient:
|
||||||
|
"""OpenAI-compatible client for a local MLX server.
|
||||||
|
|
||||||
|
The server is single-process by default (``mlx-omni-server`` loads
|
||||||
|
one model at a time and swaps on demand). The class-level semaphore
|
||||||
|
serializes concurrent requests so we never queue more than
|
||||||
|
``max_concurrent`` at a time — defaults to 1, since MLX inference
|
||||||
|
on a single M-series device is sequential anyway.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_semaphore: asyncio.Semaphore | None = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def configure_concurrency(cls, max_concurrent: int) -> None:
|
||||||
|
cls._semaphore = asyncio.Semaphore(max(1, int(max_concurrent)))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _sem(cls) -> asyncio.Semaphore:
|
||||||
|
if cls._semaphore is None:
|
||||||
|
cls._semaphore = asyncio.Semaphore(1)
|
||||||
|
return cls._semaphore
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_url: str = "http://127.0.0.1:10240/v1",
|
||||||
|
api_key: str = "not-needed",
|
||||||
|
):
|
||||||
|
self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
|
||||||
|
|
||||||
|
async def generate(
|
||||||
|
self, messages: Sequence[Message], *, model: str, **params
|
||||||
|
) -> str:
|
||||||
|
async with self._sem():
|
||||||
|
resp = await self._client.chat.completions.create(
|
||||||
|
model=model,
|
||||||
|
messages=[{"role": m.role, "content": m.content} for m in messages],
|
||||||
|
**params,
|
||||||
|
)
|
||||||
|
return resp.choices[0].message.content or ""
|
||||||
|
|
||||||
|
async def stream(
|
||||||
|
self, messages: Sequence[Message], *, model: str, **params
|
||||||
|
) -> AsyncIterator[str]:
|
||||||
|
async with self._sem():
|
||||||
|
stream = await self._client.chat.completions.create(
|
||||||
|
model=model,
|
||||||
|
messages=[{"role": m.role, "content": m.content} for m in messages],
|
||||||
|
stream=True,
|
||||||
|
**params,
|
||||||
|
)
|
||||||
|
async for chunk in stream:
|
||||||
|
delta = chunk.choices[0].delta.content or ""
|
||||||
|
if delta:
|
||||||
|
yield delta
|
||||||
|
|
||||||
|
async def embed(self, text: str, *, model: str) -> list[float]:
|
||||||
|
"""Return an embedding vector for ``text`` using the named model.
|
||||||
|
|
||||||
|
Targets ``/v1/embeddings`` on the local MLX server; the server
|
||||||
|
loads the model on first request and caches it. The embedding
|
||||||
|
model is independent of the chat model loaded for ``generate``
|
||||||
|
/ ``stream`` (the server can serve both).
|
||||||
|
"""
|
||||||
|
async with self._sem():
|
||||||
|
resp = await self._client.embeddings.create(
|
||||||
|
model=model,
|
||||||
|
input=text,
|
||||||
|
)
|
||||||
|
return list(resp.data[0].embedding)
|
||||||
@@ -0,0 +1,149 @@
|
|||||||
|
"""Routed LLM client — splits traffic across multiple backends by model.
|
||||||
|
|
||||||
|
Phase 4.5+ deployment: the 24B narrative model stays on Featherless,
|
||||||
|
the 8B classifier model moves to local MLX, and embeddings run on a
|
||||||
|
local BGE/MLX model. One :class:`LLMClient` interface, two underlying
|
||||||
|
backends, dispatched by the ``model`` argument at every call site.
|
||||||
|
|
||||||
|
Routing rule: requests whose ``model`` argument matches the configured
|
||||||
|
``narrative_model`` go to the narrative backend; everything else
|
||||||
|
(classifier, embeddings, future locally-hosted models) goes to the
|
||||||
|
local backend.
|
||||||
|
|
||||||
|
Set the env var ``CHAT_LLM_TIMING=1`` to log per-call timing at INFO
|
||||||
|
level. Useful for finding the slow link in a turn.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import AsyncIterator, Sequence
|
||||||
|
|
||||||
|
from .client import LLMClient, Message
|
||||||
|
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
_TIMING = os.environ.get("CHAT_LLM_TIMING") == "1"
|
||||||
|
if _TIMING and not _log.handlers:
|
||||||
|
# Wire a stderr handler when timing is enabled so the per-call
|
||||||
|
# logs show up under uvicorn (which doesn't configure non-uvicorn
|
||||||
|
# loggers by default).
|
||||||
|
_h = logging.StreamHandler()
|
||||||
|
_h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
|
||||||
|
_log.addHandler(_h)
|
||||||
|
_log.setLevel(logging.INFO)
|
||||||
|
_log.propagate = False
|
||||||
|
|
||||||
|
|
||||||
|
class RoutedLLMClient:
|
||||||
|
"""Delegates to one of two underlying clients based on ``model``.
|
||||||
|
|
||||||
|
Routing rule: any model id starting with one of ``local_prefixes``
|
||||||
|
goes to the local backend (e.g. ``"mlx-community/"`` for models
|
||||||
|
served by ``mlx-omni-server``). Everything else — narrative model,
|
||||||
|
remote classifiers, anything on a hosted provider — routes to the
|
||||||
|
remote backend.
|
||||||
|
|
||||||
|
``embed`` always routes locally (the remote provider doesn't
|
||||||
|
expose a working ``/v1/embeddings``; see
|
||||||
|
:class:`chat.llm.featherless.FeatherlessClient.embed`).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
narrative: LLMClient,
|
||||||
|
local: LLMClient,
|
||||||
|
narrative_model: str,
|
||||||
|
classifier: LLMClient | None = None,
|
||||||
|
local_prefixes: tuple[str, ...] = ("mlx-community/",),
|
||||||
|
) -> None:
|
||||||
|
# ``classifier`` is an optional separate backend for the
|
||||||
|
# classifier model. Useful when classifier and narrative both
|
||||||
|
# live on a remote OpenRouter-style provider but need different
|
||||||
|
# provider-pinning (e.g. Cerebras for the 8B classifier,
|
||||||
|
# default Friendli/etc. for the narrative). When ``classifier``
|
||||||
|
# is None, classifier traffic falls through to ``narrative``
|
||||||
|
# (the remote client) so old wiring keeps working.
|
||||||
|
self._narrative = narrative
|
||||||
|
self._classifier = classifier
|
||||||
|
self._local = local
|
||||||
|
self._narrative_model = narrative_model
|
||||||
|
self._local_prefixes = local_prefixes
|
||||||
|
|
||||||
|
def _pick(self, model: str) -> LLMClient:
|
||||||
|
if any(model.startswith(p) for p in self._local_prefixes):
|
||||||
|
return self._local
|
||||||
|
if model == self._narrative_model:
|
||||||
|
return self._narrative
|
||||||
|
# Anything else (most importantly, the classifier model) goes
|
||||||
|
# to the classifier client when configured, otherwise to the
|
||||||
|
# narrative remote client.
|
||||||
|
return self._classifier or self._narrative
|
||||||
|
|
||||||
|
async def generate(
|
||||||
|
self, messages: Sequence[Message], *, model: str, **params
|
||||||
|
) -> str:
|
||||||
|
client = self._pick(model)
|
||||||
|
backend = (
|
||||||
|
"narrative" if client is self._narrative else
|
||||||
|
"classifier" if client is self._classifier else
|
||||||
|
"local"
|
||||||
|
)
|
||||||
|
if not _TIMING:
|
||||||
|
return await client.generate(messages, model=model, **params)
|
||||||
|
in_chars = sum(len(m.content) for m in messages)
|
||||||
|
_log.info("LLM generate START [%s] %s in_chars=%d", backend, model, in_chars)
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
try:
|
||||||
|
return await client.generate(messages, model=model, **params)
|
||||||
|
finally:
|
||||||
|
_log.info(
|
||||||
|
"LLM generate END [%s] %s in_chars=%d %.2fs",
|
||||||
|
backend, model, in_chars, time.perf_counter() - t0,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def stream(
|
||||||
|
self, messages: Sequence[Message], *, model: str, **params
|
||||||
|
) -> AsyncIterator[str]:
|
||||||
|
client = self._pick(model)
|
||||||
|
backend = (
|
||||||
|
"narrative" if client is self._narrative else
|
||||||
|
"classifier" if client is self._classifier else
|
||||||
|
"local"
|
||||||
|
)
|
||||||
|
if not _TIMING:
|
||||||
|
async for chunk in client.stream(messages, model=model, **params):
|
||||||
|
yield chunk
|
||||||
|
return
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
ttft = None
|
||||||
|
chars_out = 0
|
||||||
|
try:
|
||||||
|
async for chunk in client.stream(messages, model=model, **params):
|
||||||
|
if ttft is None:
|
||||||
|
ttft = time.perf_counter() - t0
|
||||||
|
chars_out += len(chunk)
|
||||||
|
yield chunk
|
||||||
|
finally:
|
||||||
|
dt = time.perf_counter() - t0
|
||||||
|
in_chars = sum(len(m.content) for m in messages)
|
||||||
|
_log.info(
|
||||||
|
"LLM stream [%s] %s in_chars=%d out_chars=%d ttft=%.2fs total=%.2fs",
|
||||||
|
backend, model, in_chars, chars_out, ttft or 0.0, dt,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def embed(self, text: str, *, model: str) -> list[float]:
|
||||||
|
# Embeddings always run on the local backend — the remote
|
||||||
|
# provider doesn't expose a working ``/v1/embeddings`` endpoint.
|
||||||
|
if not _TIMING:
|
||||||
|
return await self._local.embed(text, model=model)
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
try:
|
||||||
|
return await self._local.embed(text, model=model)
|
||||||
|
finally:
|
||||||
|
_log.info(
|
||||||
|
"LLM embed [local] %s in_chars=%d %.2fs",
|
||||||
|
model, len(text), time.perf_counter() - t0,
|
||||||
|
)
|
||||||
+17
-11
@@ -30,7 +30,7 @@ from typing import Callable
|
|||||||
|
|
||||||
from chat.config import Settings
|
from chat.config import Settings
|
||||||
from chat.db.connection import open_db
|
from chat.db.connection import open_db
|
||||||
from chat.eventlog.log import append_and_apply
|
from chat.eventlog.log import append_and_apply, append_and_apply_with_retry
|
||||||
from chat.llm.client import LLMClient
|
from chat.llm.client import LLMClient
|
||||||
from chat.services.backup import (
|
from chat.services.backup import (
|
||||||
prune_backups,
|
prune_backups,
|
||||||
@@ -169,16 +169,22 @@ class BackgroundWorker:
|
|||||||
narrative_text=job.narrative_text,
|
narrative_text=job.narrative_text,
|
||||||
prior_dialogue=job.prior_dialogue,
|
prior_dialogue=job.prior_dialogue,
|
||||||
)
|
)
|
||||||
with open_db(self._settings.db_path) as conn:
|
# Retry-on-lock: see chat/eventlog/log.py's
|
||||||
append_and_apply(
|
# ``append_and_apply_with_retry`` docstring for why workers
|
||||||
conn,
|
# need to retry while the request handler's open transaction
|
||||||
kind="memory_significance_set",
|
# holds the WAL write lock briefly.
|
||||||
payload={
|
appended_id = await append_and_apply_with_retry(
|
||||||
"memory_id": job.memory_id,
|
lambda: open_db(self._settings.db_path),
|
||||||
"significance": score,
|
kind="memory_significance_set",
|
||||||
},
|
payload={
|
||||||
)
|
"memory_id": job.memory_id,
|
||||||
if score >= 3:
|
"significance": score,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
# Auto-pin requires a separate connection because retry-helper
|
||||||
|
# closed its own. Skip if the significance event itself failed.
|
||||||
|
if appended_id is not None and score >= 3:
|
||||||
|
with open_db(self._settings.db_path) as conn:
|
||||||
_auto_pin_with_cap(
|
_auto_pin_with_cap(
|
||||||
conn,
|
conn,
|
||||||
owner_id=job.host_bot_id,
|
owner_id=job.host_bot_id,
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ from dataclasses import dataclass
|
|||||||
from sqlite3 import Connection
|
from sqlite3 import Connection
|
||||||
from typing import Callable
|
from typing import Callable
|
||||||
|
|
||||||
from chat.eventlog.log import append_and_apply
|
from chat.eventlog.log import append_and_apply_with_retry
|
||||||
from chat.services.embeddings import (
|
from chat.services.embeddings import (
|
||||||
DEFAULT_EMBEDDING_DIM,
|
DEFAULT_EMBEDDING_DIM,
|
||||||
DEFAULT_EMBEDDING_MODEL,
|
DEFAULT_EMBEDDING_MODEL,
|
||||||
@@ -121,17 +121,22 @@ class EmbeddingWorker:
|
|||||||
job.memory_id,
|
job.memory_id,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
with self._conn_factory() as conn:
|
# Retry-on-lock: the request handler holds an open transaction
|
||||||
append_and_apply(
|
# for the duration of post_turn (a few seconds), so any worker
|
||||||
conn,
|
# write started during that window blocks. open_db's
|
||||||
kind="embedding_indexed",
|
# busy_timeout is 100ms (so the request path itself can't get
|
||||||
payload={
|
# stuck on a worker), so retry here with backoff. Each retry
|
||||||
"memory_id": job.memory_id,
|
# opens a fresh connection via ``conn_factory``.
|
||||||
"model": result.model,
|
await append_and_apply_with_retry(
|
||||||
"dim": result.dim,
|
self._conn_factory,
|
||||||
"vector": result.vector,
|
kind="embedding_indexed",
|
||||||
},
|
payload={
|
||||||
)
|
"memory_id": job.memory_id,
|
||||||
|
"model": result.model,
|
||||||
|
"dim": result.dim,
|
||||||
|
"vector": result.vector,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["EmbeddingJob", "EmbeddingWorker"]
|
__all__ = ["EmbeddingJob", "EmbeddingWorker"]
|
||||||
|
|||||||
@@ -4,13 +4,15 @@ Wraps single-pair compute_state_update to run state updates for ALL
|
|||||||
directed pairs of present entities. With 3 present entities (you, host,
|
directed pairs of present entities. With 3 present entities (you, host,
|
||||||
guest) that's 6 directed pairs. With 2 present (you, host) it's 2 pairs.
|
guest) that's 6 directed pairs. With 2 present (you, host) it's 2 pairs.
|
||||||
|
|
||||||
Calls run sequentially to respect Featherless's 2-connection cap (the
|
Pairs run concurrently via :func:`asyncio.gather`; the underlying
|
||||||
client-level semaphore would serialize them anyway, but doing it here
|
client should impose its own concurrency cap if the upstream provider
|
||||||
keeps the failure surface clean — a hung pair doesn't queue behind
|
needs it (e.g., Featherless's 2-conn semaphore). Returning order is
|
||||||
itself).
|
preserved (natural iteration over ``present_ids x present_ids``,
|
||||||
|
src != tgt) so downstream event-append order stays deterministic.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
import asyncio
|
||||||
|
|
||||||
from chat.llm.client import LLMClient
|
from chat.llm.client import LLMClient
|
||||||
from chat.services.state_update import StateUpdate, compute_state_update
|
from chat.services.state_update import StateUpdate, compute_state_update
|
||||||
@@ -28,35 +30,44 @@ async def compute_state_updates_for_present(
|
|||||||
timeout_s: float = 30.0,
|
timeout_s: float = 30.0,
|
||||||
) -> list[tuple[str, str, StateUpdate]]:
|
) -> list[tuple[str, str, StateUpdate]]:
|
||||||
"""Run compute_state_update for every directed pair (src != tgt) over
|
"""Run compute_state_update for every directed pair (src != tgt) over
|
||||||
``present_ids``. Returns list of ``(source_id, target_id, update)``
|
``present_ids``, concurrently. Returns list of
|
||||||
tuples in the natural iteration order over ``present_ids x present_ids``.
|
``(source_id, target_id, update)`` tuples in the natural iteration
|
||||||
|
order over ``present_ids x present_ids`` — concurrent dispatch does
|
||||||
|
not change the returned order.
|
||||||
|
|
||||||
A single failing pair falls back to the schema-default StateUpdate
|
A single failing pair falls back to the schema-default StateUpdate
|
||||||
(zero deltas, empty facts) inside ``compute_state_update``; the batch
|
(zero deltas, empty facts) inside ``compute_state_update``; sibling
|
||||||
keeps going.
|
pairs continue independently because each call is wrapped in its
|
||||||
|
own try/except inside ``compute_state_update``.
|
||||||
"""
|
"""
|
||||||
out: list[tuple[str, str, StateUpdate]] = []
|
pair_keys: list[tuple[str, str]] = [
|
||||||
for src in present_ids:
|
(src, tgt)
|
||||||
for tgt in present_ids:
|
for src in present_ids
|
||||||
if src == tgt:
|
for tgt in present_ids
|
||||||
continue
|
if src != tgt
|
||||||
edge = prior_edges.get((src, tgt), {})
|
]
|
||||||
update = await compute_state_update(
|
if not pair_keys:
|
||||||
client,
|
return []
|
||||||
model=classifier_model,
|
|
||||||
source_id=src,
|
async def _one(src: str, tgt: str) -> StateUpdate:
|
||||||
target_id=tgt,
|
edge = prior_edges.get((src, tgt), {})
|
||||||
source_name=present_names.get(src, src),
|
return await compute_state_update(
|
||||||
source_persona=personas.get(src, "") or "",
|
client,
|
||||||
target_name=present_names.get(tgt, tgt),
|
model=classifier_model,
|
||||||
prior_affinity=int(edge.get("affinity", 50)),
|
source_id=src,
|
||||||
prior_trust=int(edge.get("trust", 50)),
|
target_id=tgt,
|
||||||
prior_summary=edge.get("summary", "") or "",
|
source_name=present_names.get(src, src),
|
||||||
recent_dialogue=recent_dialogue,
|
source_persona=personas.get(src, "") or "",
|
||||||
timeout_s=timeout_s,
|
target_name=present_names.get(tgt, tgt),
|
||||||
)
|
prior_affinity=int(edge.get("affinity", 50)),
|
||||||
out.append((src, tgt, update))
|
prior_trust=int(edge.get("trust", 50)),
|
||||||
return out
|
prior_summary=edge.get("summary", "") or "",
|
||||||
|
recent_dialogue=recent_dialogue,
|
||||||
|
timeout_s=timeout_s,
|
||||||
|
)
|
||||||
|
|
||||||
|
updates = await asyncio.gather(*(_one(src, tgt) for src, tgt in pair_keys))
|
||||||
|
return [(src, tgt, upd) for (src, tgt), upd in zip(pair_keys, updates)]
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["compute_state_updates_for_present"]
|
__all__ = ["compute_state_updates_for_present"]
|
||||||
|
|||||||
+51
-6
@@ -325,14 +325,59 @@ def _build_open_threads_block(threads: list[dict]) -> str | None:
|
|||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def trim_to_max_beats(text: str, max_beats: int = 3) -> str:
|
||||||
|
"""Truncate ``text`` to at most ``max_beats`` asterisk-action beats.
|
||||||
|
|
||||||
|
A "beat" is one ``*action*`` markdown-italic block plus the dialogue
|
||||||
|
that follows it; counting ``*`` characters works as a deterministic
|
||||||
|
boundary detector since each complete beat contributes exactly two
|
||||||
|
asterisks (open + close). The (2*max_beats + 1)th asterisk is the
|
||||||
|
opening of an over-the-cap beat; we trim immediately before it and
|
||||||
|
strip trailing whitespace.
|
||||||
|
|
||||||
|
Belt-and-suspenders for verbose roleplay-tuned narrators (Cydonia,
|
||||||
|
Magnum, etc.) that reliably ignore "HARD CAP: 2-3 beats" prompt
|
||||||
|
instructions and keep going. A physical max_tokens cap helps but
|
||||||
|
truncates mid-word; this trims at a beat boundary instead.
|
||||||
|
|
||||||
|
Idempotent and safe on outputs with fewer beats than the cap (just
|
||||||
|
returns the text unchanged after a single pass).
|
||||||
|
"""
|
||||||
|
if max_beats <= 0:
|
||||||
|
return ""
|
||||||
|
target = max_beats * 2
|
||||||
|
count = 0
|
||||||
|
for i, ch in enumerate(text):
|
||||||
|
if ch == "*":
|
||||||
|
count += 1
|
||||||
|
if count > target:
|
||||||
|
return text[:i].rstrip()
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
def _closing_instruction(speaker_name: str, addressee_name: str) -> str:
|
def _closing_instruction(speaker_name: str, addressee_name: str) -> str:
|
||||||
return (
|
return (
|
||||||
f"Continue the scene as {speaker_name}, in their voice, responding "
|
f"Continue as {speaker_name}. Format strictly:\n"
|
||||||
"naturally. Use *asterisks* for actions and quotes for dialogue. "
|
f"- Wrap actions and gestures in *asterisks*, third person "
|
||||||
f"Stay in character. Do not narrate {addressee_name}'s actions or "
|
f"({speaker_name}/she/he/they) — never first person, never inner "
|
||||||
"thoughts. "
|
"thoughts inside asterisks.\n"
|
||||||
"Keep your response to a single beat — one or two short paragraphs "
|
"- Speak dialogue as plain text between action beats, no quote "
|
||||||
"at most. Don't monologue; leave room for the other person to react."
|
"marks. Keep speech fragmented, not paragraphs.\n"
|
||||||
|
"- HARD CAP: 2-3 beats per response. A beat is one *asterisk "
|
||||||
|
"action* paired with a short dialogue fragment. After the "
|
||||||
|
"third beat, STOP — do not add a fourth, do not summarize, do "
|
||||||
|
f"not narrate {addressee_name}'s reaction. Long responses break "
|
||||||
|
"the scene's rhythm.\n"
|
||||||
|
"- Each beat is one concrete gesture or sensory image. No "
|
||||||
|
"explanation, no inner monologue, no stage-direction adverbs.\n"
|
||||||
|
"- Trailing ellipses (...) are fine for emotional weight.\n"
|
||||||
|
"EXAMPLE (3 beats, stops cleanly):\n"
|
||||||
|
"*She turns with soapy hands to cup your face* That's how I know "
|
||||||
|
"it's real... *She kisses you softly* You love me when I'm messy... "
|
||||||
|
"*She smiles tearfully* ...and every moment in between.\n"
|
||||||
|
f"Show only what {addressee_name} could externally observe of "
|
||||||
|
f"{speaker_name}; never narrate {addressee_name}'s actions, "
|
||||||
|
"thoughts, or speech. One response — leave room to react."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -107,13 +107,23 @@ async def parse_turn(
|
|||||||
without an LLM call (the classifier would error on empty input
|
without an LLM call (the classifier would error on empty input
|
||||||
anyway, and the result is unambiguous).
|
anyway, and the result is unambiguous).
|
||||||
|
|
||||||
Raises ``RuntimeError`` if the classifier fails twice — no default
|
Falls back to a single dialogue-shaped segment containing the
|
||||||
is supplied, since the caller (T19's turn flow) is responsible for
|
whole prose if the classifier flaps after retries — the turn flow
|
||||||
surfacing the error to the user.
|
can keep moving (the narrative will still fire on the prose) at
|
||||||
|
the cost of finer-grained segment classification. The original
|
||||||
|
code raised ``RuntimeError`` here, which 500'd the whole request
|
||||||
|
and was particularly painful in multi-bot scenes where every
|
||||||
|
user turn paid the classifier round-trip.
|
||||||
"""
|
"""
|
||||||
if not prose.strip():
|
if not prose.strip():
|
||||||
return ParsedTurn(segments=[])
|
return ParsedTurn(segments=[])
|
||||||
|
|
||||||
|
fallback = ParsedTurn(
|
||||||
|
segments=[TurnSegment(kind="dialogue", text=prose)],
|
||||||
|
intent="narrative",
|
||||||
|
landing_state_hint="",
|
||||||
|
)
|
||||||
|
|
||||||
user_prompt = f"INPUT:\n{prose}"
|
user_prompt = f"INPUT:\n{prose}"
|
||||||
return await classify(
|
return await classify(
|
||||||
client,
|
client,
|
||||||
@@ -121,5 +131,6 @@ async def parse_turn(
|
|||||||
system=_SYSTEM_PROMPT,
|
system=_SYSTEM_PROMPT,
|
||||||
user=user_prompt,
|
user=user_prompt,
|
||||||
schema=ParsedTurn,
|
schema=ParsedTurn,
|
||||||
|
default=fallback,
|
||||||
timeout_s=timeout_s,
|
timeout_s=timeout_s,
|
||||||
)
|
)
|
||||||
|
|||||||
+291
-7
@@ -5,7 +5,12 @@ body {
|
|||||||
color: #1c1c1c;
|
color: #1c1c1c;
|
||||||
background: #fafafa;
|
background: #fafafa;
|
||||||
display: flex;
|
display: flex;
|
||||||
min-height: 100vh;
|
/* Locked to viewport (was ``min-height: 100vh``) so flex children
|
||||||
|
like the chat ``.timeline`` get a bounded height and can use
|
||||||
|
``overflow-y: auto`` to scroll independently. The other pages
|
||||||
|
have ``.content`` with ``overflow: auto`` so their own
|
||||||
|
overflow still scrolls inside the right pane. */
|
||||||
|
height: 100vh;
|
||||||
}
|
}
|
||||||
.rail {
|
.rail {
|
||||||
width: 200px;
|
width: 200px;
|
||||||
@@ -101,12 +106,291 @@ code { font-family: ui-monospace, "SF Mono", Menlo, monospace; }
|
|||||||
}
|
}
|
||||||
.turn-input { display: flex; flex-direction: column; gap: 8px; padding-top: 12px; border-top: 1px solid #e5e5e5; }
|
.turn-input { display: flex; flex-direction: column; gap: 8px; padding-top: 12px; border-top: 1px solid #e5e5e5; }
|
||||||
.turn-input textarea { padding: 8px; font: inherit; border: 1px solid #ccc; border-radius: 3px; resize: vertical; }
|
.turn-input textarea { padding: 8px; font: inherit; border: 1px solid #ccc; border-radius: 3px; resize: vertical; }
|
||||||
.drawer { position: fixed; top: 0; right: 0; width: 360px; height: 100vh; background: #fff; border-left: 1px solid #e5e5e5; padding: 16px; overflow-y: auto; z-index: 10; }
|
/* ===========================================================
|
||||||
.drawer[hidden] { display: none; }
|
Drawer — director's notebook overlay
|
||||||
.drawer-content { display: flex; flex-direction: column; gap: 16px; }
|
===========================================================
|
||||||
.drawer-header { display: flex; align-items: center; justify-content: space-between; padding-bottom: 8px; border-bottom: 1px solid #e5e5e5; }
|
Editorial popup design: a warm-paper panel floats over an inky
|
||||||
.drawer-close { border: none; background: transparent; color: #1c1c1c; font-size: 24px; padding: 0 4px; cursor: pointer; }
|
blurred backdrop. Single accent serif (Newsreader) at the title,
|
||||||
.drawer-section h3 { margin: 0 0 8px; font-size: 14px; text-transform: uppercase; letter-spacing: 0.5px; color: #666; }
|
single muted-amber accent for primary interactives, generous
|
||||||
|
spacing, controlled motion.
|
||||||
|
|
||||||
|
Design tokens (scoped to the drawer so the rest of the app stays
|
||||||
|
on its existing palette).
|
||||||
|
*/
|
||||||
|
.drawer-modal {
|
||||||
|
--paper: #f6f1e8; /* warm off-white panel */
|
||||||
|
--paper-edge: #e7dfce;
|
||||||
|
--ink: #1a1d29; /* deep ink-blue */
|
||||||
|
--ink-soft: #38405a;
|
||||||
|
--ink-faint: #6c7390;
|
||||||
|
--accent: #b97e30; /* muted amber */
|
||||||
|
--accent-soft: #efd9b1;
|
||||||
|
--rule: rgba(26, 29, 41, 0.10);
|
||||||
|
--shadow-near: 0 1px 2px rgba(26, 29, 41, 0.08);
|
||||||
|
--shadow-far: 0 32px 64px -24px rgba(26, 29, 41, 0.45),
|
||||||
|
0 12px 24px -12px rgba(26, 29, 41, 0.25);
|
||||||
|
--serif: "Newsreader", "Iowan Old Style", Georgia, serif;
|
||||||
|
--duration: 180ms;
|
||||||
|
--ease: cubic-bezier(0.22, 0.61, 0.36, 1);
|
||||||
|
|
||||||
|
position: fixed;
|
||||||
|
inset: 0;
|
||||||
|
z-index: 100;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
padding: clamp(16px, 4vw, 48px);
|
||||||
|
/* Open/close transitions live here so the backdrop and panel
|
||||||
|
can fade together; .is-open promotes both to their visible
|
||||||
|
end-states. */
|
||||||
|
opacity: 0;
|
||||||
|
transition: opacity var(--duration) var(--ease);
|
||||||
|
}
|
||||||
|
.drawer-modal[hidden] { display: none; }
|
||||||
|
.drawer-modal.is-open { opacity: 1; }
|
||||||
|
|
||||||
|
.drawer-modal-backdrop {
|
||||||
|
position: absolute;
|
||||||
|
inset: 0;
|
||||||
|
background:
|
||||||
|
radial-gradient(circle at 30% 25%, rgba(26, 29, 41, 0.55), rgba(26, 29, 41, 0.85) 75%);
|
||||||
|
backdrop-filter: blur(6px) saturate(1.05);
|
||||||
|
-webkit-backdrop-filter: blur(6px) saturate(1.05);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The chat behind the modal stops scrolling and loses focus
|
||||||
|
entirely. body class set by the JS; resets on close. */
|
||||||
|
body.drawer-modal-open { overflow: hidden; }
|
||||||
|
|
||||||
|
.drawer-panel {
|
||||||
|
position: relative;
|
||||||
|
width: 100%;
|
||||||
|
max-width: 720px;
|
||||||
|
max-height: min(82vh, 760px);
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
background: var(--paper);
|
||||||
|
border-radius: 6px;
|
||||||
|
box-shadow: var(--shadow-far);
|
||||||
|
/* Subtle warm-paper texture: a single soft inner highlight at the
|
||||||
|
top edge plus a faint vignette toward the bottom. Cheap, no
|
||||||
|
external image. */
|
||||||
|
background-image:
|
||||||
|
linear-gradient(180deg,
|
||||||
|
rgba(255, 255, 255, 0.50) 0%,
|
||||||
|
rgba(255, 255, 255, 0.00) 18%,
|
||||||
|
rgba(0, 0, 0, 0.00) 80%,
|
||||||
|
rgba(120, 100, 70, 0.06) 100%);
|
||||||
|
/* A 1px ink rule at the very top, set INSIDE the radius so the
|
||||||
|
corners stay clean. ::before serves as a hairline accent. */
|
||||||
|
overflow: hidden;
|
||||||
|
/* Open/close: the backdrop fades; the panel additionally lifts
|
||||||
|
slightly and scales from 98% to 100%. Controlled, no bounce. */
|
||||||
|
transform: translateY(8px) scale(0.98);
|
||||||
|
transition:
|
||||||
|
transform var(--duration) var(--ease),
|
||||||
|
opacity var(--duration) var(--ease);
|
||||||
|
opacity: 0.98;
|
||||||
|
}
|
||||||
|
.drawer-modal.is-open .drawer-panel {
|
||||||
|
transform: translateY(0) scale(1);
|
||||||
|
opacity: 1;
|
||||||
|
}
|
||||||
|
.drawer-panel::before {
|
||||||
|
content: "";
|
||||||
|
position: absolute;
|
||||||
|
top: 0; left: 0; right: 0;
|
||||||
|
height: 2px;
|
||||||
|
background: linear-gradient(90deg,
|
||||||
|
transparent 0%, var(--accent) 14%, var(--accent) 86%, transparent 100%);
|
||||||
|
opacity: 0.85;
|
||||||
|
}
|
||||||
|
|
||||||
|
.drawer-panel-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: baseline;
|
||||||
|
justify-content: space-between;
|
||||||
|
gap: 16px;
|
||||||
|
padding: 22px 28px 14px;
|
||||||
|
border-bottom: 1px solid var(--rule);
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.drawer-panel-header h2 {
|
||||||
|
margin: 0;
|
||||||
|
font-family: var(--serif);
|
||||||
|
font-weight: 500;
|
||||||
|
font-size: clamp(22px, 2.4vw, 28px);
|
||||||
|
letter-spacing: -0.01em;
|
||||||
|
color: var(--ink);
|
||||||
|
/* Tiny editorial flourish: lowercase the title so it reads like
|
||||||
|
a column header in a printed broadside. */
|
||||||
|
text-transform: lowercase;
|
||||||
|
}
|
||||||
|
.drawer-panel-header h2::after {
|
||||||
|
content: "";
|
||||||
|
display: inline-block;
|
||||||
|
width: 6px;
|
||||||
|
height: 6px;
|
||||||
|
margin-left: 10px;
|
||||||
|
border-radius: 50%;
|
||||||
|
background: var(--accent);
|
||||||
|
vertical-align: middle;
|
||||||
|
transform: translateY(-2px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.drawer-panel-close {
|
||||||
|
appearance: none;
|
||||||
|
background: transparent;
|
||||||
|
border: none;
|
||||||
|
border-radius: 4px;
|
||||||
|
color: var(--ink-soft);
|
||||||
|
font-family: var(--serif);
|
||||||
|
font-size: 28px;
|
||||||
|
line-height: 1;
|
||||||
|
width: 36px;
|
||||||
|
height: 36px;
|
||||||
|
cursor: pointer;
|
||||||
|
transition:
|
||||||
|
background-color var(--duration) var(--ease),
|
||||||
|
color var(--duration) var(--ease),
|
||||||
|
transform var(--duration) var(--ease);
|
||||||
|
}
|
||||||
|
.drawer-panel-close:hover {
|
||||||
|
background: rgba(26, 29, 41, 0.06);
|
||||||
|
color: var(--ink);
|
||||||
|
transform: rotate(90deg);
|
||||||
|
}
|
||||||
|
.drawer-panel-close:focus-visible {
|
||||||
|
outline: 2px solid var(--accent);
|
||||||
|
outline-offset: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.drawer-panel-body {
|
||||||
|
flex: 1 1 auto;
|
||||||
|
min-height: 0;
|
||||||
|
overflow-y: auto;
|
||||||
|
padding: 18px 28px 28px;
|
||||||
|
/* Restrict typography inside the body to the existing app font
|
||||||
|
so the existing drawer markup (forms, lists, buttons rendered
|
||||||
|
by /chats/<id>/drawer) keeps its current density and read-flow.
|
||||||
|
We only re-color a few items so they sit on the warm paper. */
|
||||||
|
color: var(--ink);
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-panel-loading {
|
||||||
|
font-family: var(--serif);
|
||||||
|
font-style: italic;
|
||||||
|
color: var(--ink-faint);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Scoped overrides for the drawer-content the server renders into
|
||||||
|
.drawer-panel-body. Keeps the existing class names working but
|
||||||
|
re-tunes them for the warm-paper context. */
|
||||||
|
/* Tabs nav — sits at the top of .drawer-content and lets the user
|
||||||
|
pivot between Scene / Cast / Story / Turns groups. Underline-style
|
||||||
|
active indicator (a single muted-amber rule) keeps the editorial
|
||||||
|
feel — no pills, no boxes, no hover-fills. */
|
||||||
|
.drawer-panel-body .drawer-tabs {
|
||||||
|
display: flex;
|
||||||
|
gap: 6px;
|
||||||
|
margin: 0 -8px 14px; /* bleed the divider rule slightly past the body padding */
|
||||||
|
padding: 0 8px 0;
|
||||||
|
border-bottom: 1px solid var(--rule);
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab {
|
||||||
|
appearance: none;
|
||||||
|
background: transparent;
|
||||||
|
border: none;
|
||||||
|
padding: 10px 14px 12px;
|
||||||
|
margin-bottom: -1px; /* sit on top of the parent's border-bottom */
|
||||||
|
font-family: var(--serif);
|
||||||
|
font-size: 15px;
|
||||||
|
font-weight: 400;
|
||||||
|
letter-spacing: 0.02em;
|
||||||
|
color: var(--ink-faint);
|
||||||
|
border-bottom: 2px solid transparent;
|
||||||
|
cursor: pointer;
|
||||||
|
transition:
|
||||||
|
color var(--duration) var(--ease),
|
||||||
|
border-color var(--duration) var(--ease);
|
||||||
|
border-radius: 0; /* strip the global button radius */
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab:hover {
|
||||||
|
color: var(--ink);
|
||||||
|
background: transparent;
|
||||||
|
border-color: transparent;
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab.is-active {
|
||||||
|
color: var(--ink);
|
||||||
|
border-bottom-color: var(--accent);
|
||||||
|
background: transparent;
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab.is-active:hover {
|
||||||
|
background: transparent;
|
||||||
|
color: var(--ink);
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab:focus-visible {
|
||||||
|
outline: 2px solid var(--accent);
|
||||||
|
outline-offset: 2px;
|
||||||
|
border-radius: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Panes — only one visible at a time. Uses [hidden] so the JS can
|
||||||
|
toggle attribute-driven instead of class-driven. */
|
||||||
|
.drawer-panel-body .drawer-tab-pane[hidden] { display: none; }
|
||||||
|
|
||||||
|
/* Sections inside a pane: drop the section-level rules since the
|
||||||
|
tabs already segment the content. Keep the section h3 as a sub-
|
||||||
|
heading inside its pane — useful when a tab groups multiple
|
||||||
|
sections (e.g. Cast = Guest + Group + Edges). */
|
||||||
|
.drawer-panel-body .drawer-section {
|
||||||
|
padding: 14px 0 18px;
|
||||||
|
border-bottom: 1px solid var(--rule);
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-tab-pane > .drawer-section:first-child { padding-top: 6px; }
|
||||||
|
.drawer-panel-body .drawer-tab-pane > .drawer-section:last-child { border-bottom: none; padding-bottom: 4px; }
|
||||||
|
/* When a pane has only one section, suppress the redundant h3 since
|
||||||
|
the tab label is the same name. */
|
||||||
|
.drawer-panel-body .drawer-tab-pane:has(> .drawer-section:only-child) > .drawer-section > h3 {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
.drawer-panel-body .drawer-section h3 {
|
||||||
|
margin: 0 0 10px;
|
||||||
|
font-family: var(--serif);
|
||||||
|
font-weight: 500;
|
||||||
|
font-size: 12px;
|
||||||
|
letter-spacing: 0.16em;
|
||||||
|
text-transform: uppercase;
|
||||||
|
color: var(--accent);
|
||||||
|
}
|
||||||
|
.drawer-panel-body .activity-row,
|
||||||
|
.drawer-panel-body .edge-row { margin-bottom: 12px; }
|
||||||
|
.drawer-panel-body .activity-row strong,
|
||||||
|
.drawer-panel-body .edge-row strong { display: block; color: var(--ink); }
|
||||||
|
.drawer-panel-body .muted { color: var(--ink-faint); }
|
||||||
|
.drawer-panel-body button,
|
||||||
|
.drawer-panel-body .btn {
|
||||||
|
background: var(--ink);
|
||||||
|
border: 1px solid var(--ink);
|
||||||
|
color: var(--paper);
|
||||||
|
border-radius: 3px;
|
||||||
|
}
|
||||||
|
.drawer-panel-body button:hover,
|
||||||
|
.drawer-panel-body .btn:hover {
|
||||||
|
background: var(--accent);
|
||||||
|
border-color: var(--accent);
|
||||||
|
color: var(--ink);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Respect reduced-motion preference: no scale, no rotate, no
|
||||||
|
blur transition — just the opacity fade. */
|
||||||
|
@media (prefers-reduced-motion: reduce) {
|
||||||
|
.drawer-modal,
|
||||||
|
.drawer-panel,
|
||||||
|
.drawer-panel-close { transition-duration: 0ms; }
|
||||||
|
.drawer-panel { transform: none; }
|
||||||
|
.drawer-panel-close:hover { transform: none; }
|
||||||
|
}
|
||||||
.activity-row, .edge-row { margin-bottom: 12px; }
|
.activity-row, .edge-row { margin-bottom: 12px; }
|
||||||
.activity-row strong, .edge-row strong { display: block; }
|
.activity-row strong, .edge-row strong { display: block; }
|
||||||
.memory-list { list-style: none; padding: 0; margin: 0; }
|
.memory-list { list-style: none; padding: 0; margin: 0; }
|
||||||
|
|||||||
@@ -5,7 +5,18 @@
|
|||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
<title>{% block title %}chat{% endblock %}</title>
|
<title>{% block title %}chat{% endblock %}</title>
|
||||||
<link rel="stylesheet" href="/static/app.css">
|
<link rel="stylesheet" href="/static/app.css">
|
||||||
|
<!-- Newsreader: refined editorial serif for accent typography
|
||||||
|
(drawer modal title, etc.). Body stays system-ui for read-
|
||||||
|
flow legibility. Subset to the weight we use to keep the
|
||||||
|
payload tiny. -->
|
||||||
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
||||||
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||||
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Newsreader:opsz,wght@6..72,400;6..72,500&display=swap">
|
||||||
<script src="https://unpkg.com/htmx.org@1.9.12" defer></script>
|
<script src="https://unpkg.com/htmx.org@1.9.12" defer></script>
|
||||||
|
<!-- htmx 1.x bundles its SSE extension at /dist/ext/sse.js. The
|
||||||
|
standalone htmx-ext-sse@2.x package is for htmx 2.x and is
|
||||||
|
not compatible with the 1.x ext API. -->
|
||||||
|
<script src="https://unpkg.com/htmx.org@1.9.12/dist/ext/sse.js" defer></script>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
{% block body %}{% endblock %}
|
{% block body %}{% endblock %}
|
||||||
|
|||||||
+358
-20
@@ -7,7 +7,9 @@
|
|||||||
<header class="chat-header">
|
<header class="chat-header">
|
||||||
<h1>{{ host_bot.name }}</h1>
|
<h1>{{ host_bot.name }}</h1>
|
||||||
<div class="chat-meta muted">{{ chat.time }}</div>
|
<div class="chat-meta muted">{{ chat.time }}</div>
|
||||||
<button class="drawer-toggle" type="button" aria-controls="drawer" aria-expanded="false">Drawer</button>
|
<button class="drawer-toggle" type="button"
|
||||||
|
aria-controls="drawer-modal" aria-expanded="false"
|
||||||
|
aria-haspopup="dialog">Drawer</button>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
<section class="timeline" id="timeline"
|
<section class="timeline" id="timeline"
|
||||||
@@ -30,21 +32,251 @@
|
|||||||
<button type="submit">Send</button>
|
<button type="submit">Send</button>
|
||||||
</form>
|
</form>
|
||||||
|
|
||||||
<aside class="drawer" id="drawer" hidden
|
|
||||||
hx-get="/chats/{{ chat.id }}/drawer"
|
|
||||||
hx-trigger="revealed"
|
|
||||||
hx-swap="innerHTML">
|
|
||||||
<p class="muted">Loading drawer…</p>
|
|
||||||
</aside>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Drawer modal — director's notebook overlay.
|
||||||
|
Sits outside .chat-shell so its position:fixed backdrop covers the
|
||||||
|
whole viewport. The panel still pulls its inner HTML from
|
||||||
|
/chats/<id>/drawer via HTMX; trigger is a custom 'drawer-open'
|
||||||
|
event that the open/close script dispatches each time the modal
|
||||||
|
opens, so the content refreshes on every open. -->
|
||||||
|
<div class="drawer-modal" id="drawer-modal" hidden
|
||||||
|
role="dialog"
|
||||||
|
aria-modal="true"
|
||||||
|
aria-labelledby="drawer-modal-title">
|
||||||
|
<div class="drawer-modal-backdrop" data-drawer-close></div>
|
||||||
|
<article class="drawer-panel">
|
||||||
|
<header class="drawer-panel-header">
|
||||||
|
<h2 id="drawer-modal-title">Drawer</h2>
|
||||||
|
<button class="drawer-panel-close" type="button"
|
||||||
|
data-drawer-close
|
||||||
|
aria-label="Close drawer">×</button>
|
||||||
|
</header>
|
||||||
|
<div class="drawer-panel-body" id="drawer"
|
||||||
|
hx-get="/chats/{{ chat.id }}/drawer"
|
||||||
|
hx-trigger="drawer-open from:body"
|
||||||
|
hx-swap="innerHTML">
|
||||||
|
<p class="muted drawer-panel-loading">Loading…</p>
|
||||||
|
</div>
|
||||||
|
</article>
|
||||||
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
document.querySelector('.drawer-toggle')?.addEventListener('click', (e) => {
|
// Drawer modal — open/close, focus management, and post-swap
|
||||||
const drawer = document.getElementById('drawer');
|
// tab-grouping. The server's /chats/<id>/drawer response is left
|
||||||
const isHidden = drawer.hasAttribute('hidden');
|
// unchanged; this script post-processes the swapped HTML to:
|
||||||
if (isHidden) drawer.removeAttribute('hidden');
|
// 1. Pull the bot name from the legacy <header><h2> and use it as
|
||||||
else drawer.setAttribute('hidden', '');
|
// the modal title.
|
||||||
e.target.setAttribute('aria-expanded', String(isHidden));
|
// 2. Remove the legacy header (it has its own onclick="hidden"
|
||||||
});
|
// close that targets the OLD drawer semantics — broken now).
|
||||||
|
// 3. Walk .drawer-section blocks and group them into 4 tabs by
|
||||||
|
// their <h3> title:
|
||||||
|
// Scene : Scene, Activity
|
||||||
|
// Cast : Guest, Group, Edges
|
||||||
|
// Story : Events, Threads, Branches
|
||||||
|
// Turns : Recent turns, Significance review
|
||||||
|
// A tab nav is rendered above the sections; clicking switches
|
||||||
|
// which group is visible. Empty tabs (no matching sections) are
|
||||||
|
// hidden.
|
||||||
|
(function () {
|
||||||
|
const modal = document.getElementById('drawer-modal');
|
||||||
|
const toggle = document.querySelector('.drawer-toggle');
|
||||||
|
if (!modal || !toggle) return;
|
||||||
|
const titleEl = modal.querySelector('#drawer-modal-title');
|
||||||
|
const body = modal.querySelector('.drawer-panel-body');
|
||||||
|
const panel = modal.querySelector('.drawer-panel');
|
||||||
|
|
||||||
|
let lastFocus = null;
|
||||||
|
|
||||||
|
function open() {
|
||||||
|
if (!modal.hasAttribute('hidden')) return;
|
||||||
|
lastFocus = document.activeElement;
|
||||||
|
modal.removeAttribute('hidden');
|
||||||
|
// Force reflow so the .is-open class triggers the transition.
|
||||||
|
void modal.offsetWidth;
|
||||||
|
modal.classList.add('is-open');
|
||||||
|
toggle.setAttribute('aria-expanded', 'true');
|
||||||
|
document.body.classList.add('drawer-modal-open');
|
||||||
|
// Re-fetch drawer content via the panel's hx-trigger.
|
||||||
|
document.body.dispatchEvent(new CustomEvent('drawer-open'));
|
||||||
|
// Focus the close button so Escape / Enter both work
|
||||||
|
// immediately and screen readers announce the dialog.
|
||||||
|
requestAnimationFrame(() => {
|
||||||
|
const closeBtn = modal.querySelector('.drawer-panel-close');
|
||||||
|
if (closeBtn) closeBtn.focus();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function close() {
|
||||||
|
if (modal.hasAttribute('hidden')) return;
|
||||||
|
modal.classList.remove('is-open');
|
||||||
|
toggle.setAttribute('aria-expanded', 'false');
|
||||||
|
document.body.classList.remove('drawer-modal-open');
|
||||||
|
// Wait for the fade-out before fully hiding so the transition
|
||||||
|
// can play. Match the CSS duration.
|
||||||
|
setTimeout(() => {
|
||||||
|
modal.setAttribute('hidden', '');
|
||||||
|
if (lastFocus && typeof lastFocus.focus === 'function') {
|
||||||
|
lastFocus.focus();
|
||||||
|
}
|
||||||
|
}, 180);
|
||||||
|
}
|
||||||
|
|
||||||
|
toggle.addEventListener('click', open);
|
||||||
|
|
||||||
|
// Bind close DIRECTLY to every element flagged data-drawer-close.
|
||||||
|
// Event delegation through .stopPropagation() previously swallowed
|
||||||
|
// the close button's click (it sits inside .drawer-panel, which
|
||||||
|
// stops propagation to keep backdrop clicks from leaking through
|
||||||
|
// the panel itself). Direct binding sidesteps that and keeps the
|
||||||
|
// panel-stops-propagation rule for everything else.
|
||||||
|
function bindCloseTargets(root) {
|
||||||
|
root.querySelectorAll('[data-drawer-close]').forEach((el) => {
|
||||||
|
// Idempotent: only bind once per element.
|
||||||
|
if (el.dataset.drawerCloseBound === '1') return;
|
||||||
|
el.dataset.drawerCloseBound = '1';
|
||||||
|
el.addEventListener('click', (e) => {
|
||||||
|
e.preventDefault();
|
||||||
|
e.stopPropagation();
|
||||||
|
close();
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
bindCloseTargets(modal);
|
||||||
|
// Clicks inside the panel that AREN'T close targets must not
|
||||||
|
// reach the backdrop click handler. (We don't have one currently
|
||||||
|
// — backdrop close is via data-drawer-close on the backdrop div —
|
||||||
|
// but stopPropagation here is defensive against future handlers.)
|
||||||
|
panel.addEventListener('click', (e) => e.stopPropagation());
|
||||||
|
|
||||||
|
// Escape closes only when the modal is open.
|
||||||
|
document.addEventListener('keydown', (e) => {
|
||||||
|
if (e.key === 'Escape' && !modal.hasAttribute('hidden')) {
|
||||||
|
e.preventDefault();
|
||||||
|
close();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// ---- Tabs: group server-rendered .drawer-section blocks ----
|
||||||
|
|
||||||
|
const TAB_GROUPS = [
|
||||||
|
{ id: 'scene', label: 'Scene', sections: ['Scene', 'Activity'] },
|
||||||
|
{ id: 'cast', label: 'Cast', sections: ['Guest', 'Group', 'Edges'] },
|
||||||
|
{ id: 'story', label: 'Story', sections: ['Events', 'Threads', 'Branches'] },
|
||||||
|
{ id: 'turns', label: 'Turns', sections: ['Recent turns', 'Significance review'] },
|
||||||
|
];
|
||||||
|
|
||||||
|
function tabIdForSection(h3Text) {
|
||||||
|
const t = (h3Text || '').trim();
|
||||||
|
for (const g of TAB_GROUPS) {
|
||||||
|
if (g.sections.includes(t)) return g.id;
|
||||||
|
}
|
||||||
|
return 'scene'; // unknown sections fall into the first tab
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildTabs() {
|
||||||
|
// Clean up the legacy server-rendered header inside the body
|
||||||
|
// (duplicate close + duplicate title).
|
||||||
|
const legacyHeader = body.querySelector(':scope > .drawer-content > .drawer-header');
|
||||||
|
if (legacyHeader) {
|
||||||
|
// Promote the bot name to the modal title before discarding.
|
||||||
|
const h2 = legacyHeader.querySelector('h2');
|
||||||
|
if (h2 && h2.textContent.trim()) {
|
||||||
|
titleEl.textContent = h2.textContent.trim();
|
||||||
|
}
|
||||||
|
legacyHeader.remove();
|
||||||
|
}
|
||||||
|
|
||||||
|
// The drawer-content wrapper holds all the sections. Group them.
|
||||||
|
const content = body.querySelector('.drawer-content');
|
||||||
|
if (!content) return;
|
||||||
|
|
||||||
|
const sections = Array.from(content.querySelectorAll(':scope > .drawer-section'));
|
||||||
|
if (sections.length === 0) return;
|
||||||
|
|
||||||
|
// Bucket sections by tab id.
|
||||||
|
const buckets = new Map(TAB_GROUPS.map((g) => [g.id, []]));
|
||||||
|
for (const sec of sections) {
|
||||||
|
const h3 = sec.querySelector(':scope > h3');
|
||||||
|
const tabId = tabIdForSection(h3 ? h3.textContent : '');
|
||||||
|
buckets.get(tabId).push(sec);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build the tab nav. Skip empty buckets so the nav reflects
|
||||||
|
// what the chat actually has (e.g. no Guest tab when 1:1).
|
||||||
|
const nav = document.createElement('nav');
|
||||||
|
nav.className = 'drawer-tabs';
|
||||||
|
nav.setAttribute('role', 'tablist');
|
||||||
|
|
||||||
|
const panes = document.createElement('div');
|
||||||
|
panes.className = 'drawer-tab-panes';
|
||||||
|
|
||||||
|
let firstActive = null;
|
||||||
|
for (const group of TAB_GROUPS) {
|
||||||
|
const items = buckets.get(group.id);
|
||||||
|
if (!items.length) continue;
|
||||||
|
|
||||||
|
const btn = document.createElement('button');
|
||||||
|
btn.type = 'button';
|
||||||
|
btn.className = 'drawer-tab';
|
||||||
|
btn.setAttribute('role', 'tab');
|
||||||
|
btn.id = `drawer-tab-${group.id}`;
|
||||||
|
btn.dataset.tabTarget = group.id;
|
||||||
|
btn.textContent = group.label;
|
||||||
|
btn.setAttribute('aria-controls', `drawer-pane-${group.id}`);
|
||||||
|
nav.appendChild(btn);
|
||||||
|
|
||||||
|
const pane = document.createElement('section');
|
||||||
|
pane.className = 'drawer-tab-pane';
|
||||||
|
pane.id = `drawer-pane-${group.id}`;
|
||||||
|
pane.setAttribute('role', 'tabpanel');
|
||||||
|
pane.setAttribute('aria-labelledby', `drawer-tab-${group.id}`);
|
||||||
|
// Move the section nodes into the pane (preserves any HTMX
|
||||||
|
// event listeners and the sections' interactive forms).
|
||||||
|
for (const sec of items) pane.appendChild(sec);
|
||||||
|
panes.appendChild(pane);
|
||||||
|
|
||||||
|
if (!firstActive) firstActive = group.id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Replace the existing content with [nav][panes].
|
||||||
|
content.innerHTML = '';
|
||||||
|
content.appendChild(nav);
|
||||||
|
content.appendChild(panes);
|
||||||
|
|
||||||
|
// Tab click handler.
|
||||||
|
nav.addEventListener('click', (e) => {
|
||||||
|
const target = e.target;
|
||||||
|
if (!(target instanceof HTMLElement)) return;
|
||||||
|
const tabId = target.dataset.tabTarget;
|
||||||
|
if (!tabId) return;
|
||||||
|
activateTab(content, tabId);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (firstActive) activateTab(content, firstActive);
|
||||||
|
}
|
||||||
|
|
||||||
|
function activateTab(content, tabId) {
|
||||||
|
content.querySelectorAll('.drawer-tab').forEach((btn) => {
|
||||||
|
const isActive = btn.dataset.tabTarget === tabId;
|
||||||
|
btn.classList.toggle('is-active', isActive);
|
||||||
|
btn.setAttribute('aria-selected', String(isActive));
|
||||||
|
btn.setAttribute('tabindex', isActive ? '0' : '-1');
|
||||||
|
});
|
||||||
|
content.querySelectorAll('.drawer-tab-pane').forEach((pane) => {
|
||||||
|
const isActive = pane.id === `drawer-pane-${tabId}`;
|
||||||
|
pane.toggleAttribute('hidden', !isActive);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run after every HTMX swap into the panel body. Covers the
|
||||||
|
// initial open AND any subsequent server-driven re-render
|
||||||
|
// (e.g. an in-drawer form submit that returns refreshed HTML).
|
||||||
|
body.addEventListener('htmx:afterSwap', () => {
|
||||||
|
buildTabs();
|
||||||
|
bindCloseTargets(modal);
|
||||||
|
});
|
||||||
|
})();
|
||||||
</script>
|
</script>
|
||||||
<script>
|
<script>
|
||||||
// Streaming UX (T34): typing indicator, Stop button, send-lock,
|
// Streaming UX (T34): typing indicator, Stop button, send-lock,
|
||||||
@@ -66,6 +298,44 @@ document.querySelector('.drawer-toggle')?.addEventListener('click', (e) => {
|
|||||||
let isStreaming = false;
|
let isStreaming = false;
|
||||||
let typingEl = null;
|
let typingEl = null;
|
||||||
|
|
||||||
|
// Sticky-bottom autoscroll: scroll the timeline to the latest
|
||||||
|
// message when new content arrives, but ONLY if the user is
|
||||||
|
// already pinned to the bottom. Once they scroll up to read older
|
||||||
|
// turns, we leave their position alone until they manually scroll
|
||||||
|
// back down.
|
||||||
|
//
|
||||||
|
// ``isPinnedToBottom`` flips on every scroll event based on
|
||||||
|
// distance-from-bottom (with a small tolerance so a few pixels of
|
||||||
|
// overshoot from a layout shift doesn't unpin). A MutationObserver
|
||||||
|
// catches every node added to the timeline — covers the SSE-
|
||||||
|
// injected ``turn_html`` swap, the optimistic ``appendUserTurn``
|
||||||
|
// render, and the streaming typing-indicator updates.
|
||||||
|
const STICK_TOLERANCE_PX = 64;
|
||||||
|
let isPinnedToBottom = true;
|
||||||
|
|
||||||
|
function distanceFromBottom() {
|
||||||
|
return timeline.scrollHeight - timeline.scrollTop - timeline.clientHeight;
|
||||||
|
}
|
||||||
|
function scrollToBottom() {
|
||||||
|
timeline.scrollTop = timeline.scrollHeight;
|
||||||
|
}
|
||||||
|
// Initial state: stick to the bottom on page load so the latest
|
||||||
|
// turn is visible without manual scrolling.
|
||||||
|
requestAnimationFrame(scrollToBottom);
|
||||||
|
|
||||||
|
timeline.addEventListener('scroll', () => {
|
||||||
|
isPinnedToBottom = distanceFromBottom() <= STICK_TOLERANCE_PX;
|
||||||
|
}, { passive: true });
|
||||||
|
|
||||||
|
const timelineObserver = new MutationObserver(() => {
|
||||||
|
if (isPinnedToBottom) scrollToBottom();
|
||||||
|
});
|
||||||
|
timelineObserver.observe(timeline, {
|
||||||
|
childList: true,
|
||||||
|
subtree: true,
|
||||||
|
characterData: true, // streaming token-by-token edits
|
||||||
|
});
|
||||||
|
|
||||||
function ensureTypingEl() {
|
function ensureTypingEl() {
|
||||||
if (typingEl) return typingEl;
|
if (typingEl) return typingEl;
|
||||||
typingEl = document.createElement('div');
|
typingEl = document.createElement('div');
|
||||||
@@ -162,13 +432,62 @@ document.querySelector('.drawer-toggle')?.addEventListener('click', (e) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
form.addEventListener('submit', () => {
|
// Enter-to-send (Shift+Enter for newline). Submits via the form's
|
||||||
isStreaming = true;
|
// own submit event so all the optimistic-render + fetch logic
|
||||||
|
// below applies uniformly to keyboard and click submissions.
|
||||||
|
if (textarea) {
|
||||||
|
textarea.addEventListener('keydown', (e) => {
|
||||||
|
if (e.key === 'Enter' && !e.shiftKey && !e.isComposing) {
|
||||||
|
e.preventDefault();
|
||||||
|
if (typeof form.requestSubmit === 'function') {
|
||||||
|
form.requestSubmit();
|
||||||
|
} else {
|
||||||
|
form.dispatchEvent(new Event('submit', { cancelable: true }));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Render the user's prose optimistically as a turn-you DOM node.
|
||||||
|
// Without this the user can't see what they just sent until the page
|
||||||
|
// reloads — the server persists ``user_turn`` events but doesn't
|
||||||
|
// publish a turn_html for them (the SSE channel is bot-output-only).
|
||||||
|
function appendUserTurn(prose) {
|
||||||
|
const div = document.createElement('div');
|
||||||
|
div.className = 'turn turn-you';
|
||||||
|
const strong = document.createElement('strong');
|
||||||
|
strong.textContent = 'you';
|
||||||
|
const p = document.createElement('p');
|
||||||
|
p.textContent = prose;
|
||||||
|
div.appendChild(strong);
|
||||||
|
div.appendChild(p);
|
||||||
|
// Sending a message means the user wants to see it land — force
|
||||||
|
// sticky-bottom even if they were scrolled up reading older
|
||||||
|
// turns. The MutationObserver handles the actual scroll.
|
||||||
|
isPinnedToBottom = true;
|
||||||
|
timeline.appendChild(div);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Intercept the form submit and POST via fetch so we can:
|
||||||
|
// 1. Render the user's prose immediately (optimistic).
|
||||||
|
// 2. Clear the textarea immediately.
|
||||||
|
// 3. Keep the page state intact while the bot streams its
|
||||||
|
// response over SSE — vanilla form POST + 204 leaves the
|
||||||
|
// browser in a half-loaded state with the textarea unflushed.
|
||||||
|
form.addEventListener('submit', async (e) => {
|
||||||
|
e.preventDefault();
|
||||||
|
if (isStreaming) return;
|
||||||
|
const prose = textarea ? (textarea.value || '').trim() : '';
|
||||||
|
if (!prose) return;
|
||||||
|
|
||||||
|
appendUserTurn(prose);
|
||||||
|
if (textarea) {
|
||||||
|
textarea.value = '';
|
||||||
|
textarea.readOnly = true;
|
||||||
|
}
|
||||||
if (sendBtn) sendBtn.disabled = true;
|
if (sendBtn) sendBtn.disabled = true;
|
||||||
// readOnly (not disabled) — disabled fields are excluded from the
|
isStreaming = true;
|
||||||
// form submission, which would send prose="" and trigger the
|
|
||||||
// server's empty-prose 400.
|
|
||||||
if (textarea) textarea.readOnly = true;
|
|
||||||
if (!shell.querySelector('.stop-streaming')) {
|
if (!shell.querySelector('.stop-streaming')) {
|
||||||
const stopBtn = document.createElement('button');
|
const stopBtn = document.createElement('button');
|
||||||
stopBtn.type = 'button';
|
stopBtn.type = 'button';
|
||||||
@@ -186,6 +505,25 @@ document.querySelector('.drawer-toggle')?.addEventListener('click', (e) => {
|
|||||||
});
|
});
|
||||||
form.parentElement.insertBefore(stopBtn, form);
|
form.parentElement.insertBefore(stopBtn, form);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fire the actual POST. The bot's response arrives via SSE
|
||||||
|
// (``turn_html`` event swaps into the timeline; ``unlock()`` runs
|
||||||
|
// on receipt to clear streaming state and re-enable the form).
|
||||||
|
try {
|
||||||
|
const body = new URLSearchParams({ prose }).toString();
|
||||||
|
const resp = await fetch(form.action, {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
|
||||||
|
body,
|
||||||
|
});
|
||||||
|
if (!resp.ok && resp.status !== 204) {
|
||||||
|
showBanner('send failed (HTTP ' + resp.status + ') — try again');
|
||||||
|
unlock();
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
showBanner('send failed — check your connection');
|
||||||
|
unlock();
|
||||||
|
}
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
</script>
|
</script>
|
||||||
|
|||||||
+27
-5
@@ -5,9 +5,9 @@ from fastapi.responses import RedirectResponse, HTMLResponse
|
|||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
|
||||||
from chat.db.connection import open_db
|
from chat.db.connection import open_db
|
||||||
from chat.eventlog.log import append_event
|
from chat.eventlog.log import append_and_apply
|
||||||
from chat.eventlog.projector import project
|
from chat.state.entities import get_bot, list_bots
|
||||||
from chat.state.entities import list_bots
|
from chat.state.world import get_chat
|
||||||
|
|
||||||
TEMPLATES = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
|
TEMPLATES = Jinja2Templates(directory=str(Path(__file__).resolve().parent.parent / "templates"))
|
||||||
|
|
||||||
@@ -108,11 +108,33 @@ async def bot_create(
|
|||||||
"initial_relationship_to_you": initial_relationship_to_you.strip(),
|
"initial_relationship_to_you": initial_relationship_to_you.strip(),
|
||||||
"kickoff_prose": kickoff_prose.strip(),
|
"kickoff_prose": kickoff_prose.strip(),
|
||||||
}
|
}
|
||||||
append_event(conn, kind="bot_authored", payload=payload)
|
# Per-event apply (NOT project()) — see docs/audits/2026-04-27-project-callers.md.
|
||||||
project(conn)
|
# ``project()`` replays the full log, which trips raw-INSERT handlers like
|
||||||
|
# ``_apply_chat_created`` once a second bot's events are present.
|
||||||
|
append_and_apply(conn, kind="bot_authored", payload=payload)
|
||||||
return RedirectResponse(url=f"/bots/{payload['id']}/kickoff", status_code=303)
|
return RedirectResponse(url=f"/bots/{payload['id']}/kickoff", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/bots/{bot_id}")
|
||||||
|
async def bot_detail(bot_id: str, conn=Depends(get_conn)):
|
||||||
|
"""Click-through from the bots list. Routes to the bot's existing
|
||||||
|
chat when there is one (the v1 model is one-chat-per-host-bot,
|
||||||
|
keyed by ``chat_<bot_id>``), otherwise to the kickoff page so the
|
||||||
|
user can author the chat's opening state. 404 if the bot itself
|
||||||
|
doesn't exist.
|
||||||
|
|
||||||
|
Defined AFTER the ``/bots/new`` and ``/bots/{bot_id}/kickoff``
|
||||||
|
routes — FastAPI matches in declaration order, and a path
|
||||||
|
parameter would otherwise swallow ``/bots/new``.
|
||||||
|
"""
|
||||||
|
if get_bot(conn, bot_id) is None:
|
||||||
|
raise HTTPException(status_code=404, detail="bot not found")
|
||||||
|
chat_id = f"chat_{bot_id}"
|
||||||
|
if get_chat(conn, chat_id) is not None:
|
||||||
|
return RedirectResponse(url=f"/chats/{chat_id}", status_code=303)
|
||||||
|
return RedirectResponse(url=f"/bots/{bot_id}/kickoff", status_code=303)
|
||||||
|
|
||||||
|
|
||||||
@router.post("/bots/{bot_id}/reset")
|
@router.post("/bots/{bot_id}/reset")
|
||||||
async def reset_bot_route(
|
async def reset_bot_route(
|
||||||
bot_id: str,
|
bot_id: str,
|
||||||
|
|||||||
+103
-18
@@ -17,8 +17,7 @@ from fastapi import APIRouter, Depends, Form, HTTPException, Request
|
|||||||
from fastapi.responses import HTMLResponse, RedirectResponse
|
from fastapi.responses import HTMLResponse, RedirectResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
|
||||||
from chat.eventlog.log import append_event
|
from chat.eventlog.log import append_and_apply
|
||||||
from chat.eventlog.projector import project
|
|
||||||
from chat.llm.client import LLMClient
|
from chat.llm.client import LLMClient
|
||||||
from chat.services.kickoff import parse_kickoff
|
from chat.services.kickoff import parse_kickoff
|
||||||
from chat.state.entities import get_bot, get_you
|
from chat.state.entities import get_bot, get_you
|
||||||
@@ -32,14 +31,97 @@ router = APIRouter()
|
|||||||
|
|
||||||
|
|
||||||
def get_llm_client(request: Request) -> LLMClient:
|
def get_llm_client(request: Request) -> LLMClient:
|
||||||
"""Production LLM client. Tests override this via ``app.dependency_overrides``."""
|
"""Production LLM client. Tests override this via ``app.dependency_overrides``.
|
||||||
|
|
||||||
|
Returns a :class:`chat.llm.router.RoutedLLMClient` that splits
|
||||||
|
traffic: the narrative model goes to Featherless, the classifier
|
||||||
|
+ embeddings go to the local MLX server (``mlx-omni-server``).
|
||||||
|
Both backends share the OpenAI-compatible surface, so the routing
|
||||||
|
is invisible to call sites — they just pass ``model=...`` and the
|
||||||
|
router picks the backend.
|
||||||
|
"""
|
||||||
settings = request.app.state.settings
|
settings = request.app.state.settings
|
||||||
from chat.llm.featherless import FeatherlessClient
|
from chat.llm.featherless import FeatherlessClient
|
||||||
|
from chat.llm.local_mlx import LocalMLXClient
|
||||||
|
from chat.llm.router import RoutedLLMClient
|
||||||
|
|
||||||
return FeatherlessClient(
|
narrative = FeatherlessClient(
|
||||||
api_key=settings.featherless_api_key,
|
api_key=settings.featherless_api_key,
|
||||||
base_url=settings.featherless_base_url,
|
base_url=settings.featherless_base_url,
|
||||||
)
|
)
|
||||||
|
# Dedicated classifier client when a provider pin is configured —
|
||||||
|
# routes Llama-3.1-8B (or whatever ``classifier_model`` is) onto a
|
||||||
|
# specific upstream like Cerebras for ~10x throughput. When the
|
||||||
|
# pin is empty, ``classifier`` is None and the router falls back
|
||||||
|
# to the narrative client for classifier traffic.
|
||||||
|
classifier = None
|
||||||
|
if settings.classifier_provider_order:
|
||||||
|
classifier = FeatherlessClient(
|
||||||
|
api_key=settings.featherless_api_key,
|
||||||
|
base_url=settings.featherless_base_url,
|
||||||
|
default_extra_body={
|
||||||
|
"provider": {"order": list(settings.classifier_provider_order)}
|
||||||
|
},
|
||||||
|
)
|
||||||
|
local = LocalMLXClient(base_url=settings.local_mlx_base_url)
|
||||||
|
return RoutedLLMClient(
|
||||||
|
narrative=narrative,
|
||||||
|
classifier=classifier,
|
||||||
|
local=local,
|
||||||
|
narrative_model=settings.narrative_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _coerce_iso_time(value: str) -> str:
|
||||||
|
"""Permissive parser that returns a canonical ISO 8601 datetime.
|
||||||
|
|
||||||
|
The kickoff classifier (chat/services/kickoff.py) returns
|
||||||
|
``initial_time_iso`` as a free-form string; in practice it emits
|
||||||
|
things like ``"Sun 2024-05-12 07:00:00"``,
|
||||||
|
``"Tuesday, May 14, 2024 7:00 AM"``, or proper ISO. The strict
|
||||||
|
``datetime.fromisoformat`` would 400 on those, so this helper
|
||||||
|
tries a sequence of common classifier-emitted formats and
|
||||||
|
returns a canonical ``YYYY-MM-DDTHH:MM:SS+00:00`` form.
|
||||||
|
|
||||||
|
Raises ``ValueError`` when nothing parses, so the caller can 400
|
||||||
|
cleanly.
|
||||||
|
"""
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
s = (value or "").strip()
|
||||||
|
if not s:
|
||||||
|
return s
|
||||||
|
# Strict ISO first (covers "2026-04-26T20:00:00+00:00" and friends).
|
||||||
|
try:
|
||||||
|
dt = datetime.fromisoformat(s)
|
||||||
|
except ValueError:
|
||||||
|
dt = None
|
||||||
|
if dt is None:
|
||||||
|
# Common classifier-emitted formats, in rough frequency order.
|
||||||
|
formats = [
|
||||||
|
"%a %Y-%m-%d %H:%M:%S", # Sun 2024-05-12 07:00:00
|
||||||
|
"%A %Y-%m-%d %H:%M:%S", # Sunday 2024-05-12 07:00:00
|
||||||
|
"%Y-%m-%d %H:%M:%S", # 2024-05-12 07:00:00
|
||||||
|
"%Y-%m-%d %H:%M", # 2024-05-12 07:00
|
||||||
|
"%Y-%m-%d", # 2024-05-12 (date only)
|
||||||
|
"%a %b %d %Y %H:%M:%S", # Sun May 12 2024 07:00:00
|
||||||
|
"%A, %B %d, %Y %I:%M %p", # Tuesday, May 14, 2024 7:00 AM
|
||||||
|
"%B %d, %Y %I:%M %p", # May 14, 2024 7:00 AM
|
||||||
|
"%a %b %d %H:%M:%S %Y", # Sun May 12 07:00:00 2024 (asctime-ish)
|
||||||
|
]
|
||||||
|
for fmt in formats:
|
||||||
|
try:
|
||||||
|
dt = datetime.strptime(s, fmt)
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
if dt is None:
|
||||||
|
raise ValueError(f"could not parse {value!r} as a datetime")
|
||||||
|
# Naive datetimes assumed UTC (the v1 model is single-user, single
|
||||||
|
# timezone — keeping it consistent with chat_state.time defaults).
|
||||||
|
if dt.tzinfo is None:
|
||||||
|
dt = dt.replace(tzinfo=timezone.utc)
|
||||||
|
return dt.isoformat(timespec="seconds")
|
||||||
|
|
||||||
|
|
||||||
def _parse_holding(text: str) -> list[str]:
|
def _parse_holding(text: str) -> list[str]:
|
||||||
@@ -157,11 +239,13 @@ async def kickoff_post(
|
|||||||
if bot is None:
|
if bot is None:
|
||||||
raise HTTPException(status_code=404, detail=f"bot not found: {bot_id}")
|
raise HTTPException(status_code=404, detail=f"bot not found: {bot_id}")
|
||||||
|
|
||||||
# Loose ISO 8601 validation. ``datetime.fromisoformat`` accepts the offset
|
# Permissive datetime parsing — the classifier emits a variety of
|
||||||
# form ``2026-04-26T20:00:00+00:00`` we use; reject anything it can't parse.
|
# human-readable formats ("Sun 2024-05-12 07:00:00",
|
||||||
|
# "Tuesday, May 14, 2024 7:00 AM", proper ISO, etc.). We coerce
|
||||||
|
# to canonical ISO and only 400 if NOTHING parses.
|
||||||
if initial_time_iso.strip():
|
if initial_time_iso.strip():
|
||||||
try:
|
try:
|
||||||
datetime.fromisoformat(initial_time_iso.strip())
|
initial_time_iso = _coerce_iso_time(initial_time_iso)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
@@ -178,8 +262,14 @@ async def kickoff_post(
|
|||||||
).fetchone()
|
).fetchone()
|
||||||
container_id = next_container_row[0]
|
container_id = next_container_row[0]
|
||||||
|
|
||||||
|
# Use ``append_and_apply`` per event (live-path pattern) rather than
|
||||||
|
# appending all-then-project. ``project()`` replays the *entire*
|
||||||
|
# event log; non-idempotent handlers like ``_apply_chat_created``
|
||||||
|
# (raw INSERT into chats) then 500 with UNIQUE constraint failures
|
||||||
|
# for any chats that already exist from prior kickoffs.
|
||||||
|
|
||||||
# 1. chat_created
|
# 1. chat_created
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="chat_created",
|
kind="chat_created",
|
||||||
payload={
|
payload={
|
||||||
@@ -192,7 +282,7 @@ async def kickoff_post(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 2. container_created
|
# 2. container_created
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="container_created",
|
kind="container_created",
|
||||||
payload={
|
payload={
|
||||||
@@ -208,7 +298,7 @@ async def kickoff_post(
|
|||||||
bot_interruptible = bool(bot_activity_action_interruptible)
|
bot_interruptible = bool(bot_activity_action_interruptible)
|
||||||
|
|
||||||
# 3. activity_change for "you"
|
# 3. activity_change for "you"
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="activity_change",
|
kind="activity_change",
|
||||||
payload={
|
payload={
|
||||||
@@ -229,7 +319,7 @@ async def kickoff_post(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 4. activity_change for bot
|
# 4. activity_change for bot
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="activity_change",
|
kind="activity_change",
|
||||||
payload={
|
payload={
|
||||||
@@ -250,7 +340,7 @@ async def kickoff_post(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 5. scene_opened
|
# 5. scene_opened
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="scene_opened",
|
kind="scene_opened",
|
||||||
payload={
|
payload={
|
||||||
@@ -267,7 +357,7 @@ async def kickoff_post(
|
|||||||
facts = _parse_facts(edge_seed_knowledge_facts)
|
facts = _parse_facts(edge_seed_knowledge_facts)
|
||||||
if edge_seed_summary.strip():
|
if edge_seed_summary.strip():
|
||||||
facts.insert(0, f"[summary] {edge_seed_summary.strip()}")
|
facts.insert(0, f"[summary] {edge_seed_summary.strip()}")
|
||||||
append_event(
|
append_and_apply(
|
||||||
conn,
|
conn,
|
||||||
kind="edge_update",
|
kind="edge_update",
|
||||||
payload={
|
payload={
|
||||||
@@ -278,9 +368,4 @@ async def kickoff_post(
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Project all events at once. ``bot_authored`` (already in log from prior
|
|
||||||
# POST) is idempotent (INSERT OR REPLACE); the new events project cleanly
|
|
||||||
# because they're being applied for the first time.
|
|
||||||
project(conn)
|
|
||||||
|
|
||||||
return RedirectResponse(url=f"/chats/{chat_id}", status_code=303)
|
return RedirectResponse(url=f"/chats/{chat_id}", status_code=303)
|
||||||
|
|||||||
@@ -4,8 +4,7 @@ from fastapi import APIRouter, Depends, Form, HTTPException, Request
|
|||||||
from fastapi.responses import HTMLResponse
|
from fastapi.responses import HTMLResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
|
||||||
from chat.eventlog.log import append_event
|
from chat.eventlog.log import append_and_apply
|
||||||
from chat.eventlog.projector import project
|
|
||||||
from chat.state.entities import get_you
|
from chat.state.entities import get_you
|
||||||
from chat.web.bots import get_conn
|
from chat.web.bots import get_conn
|
||||||
|
|
||||||
@@ -40,8 +39,10 @@ async def settings_post(
|
|||||||
"pronouns": pronouns.strip(),
|
"pronouns": pronouns.strip(),
|
||||||
"persona": persona.strip(),
|
"persona": persona.strip(),
|
||||||
}
|
}
|
||||||
append_event(conn, kind="you_authored", payload=payload)
|
# Per-event apply (NOT project()) — see docs/audits/2026-04-27-project-callers.md.
|
||||||
project(conn)
|
# ``project()`` replays the full log, which trips raw-INSERT handlers like
|
||||||
|
# ``_apply_chat_created`` once chat events are present.
|
||||||
|
append_and_apply(conn, kind="you_authored", payload=payload)
|
||||||
|
|
||||||
return TEMPLATES.TemplateResponse(
|
return TEMPLATES.TemplateResponse(
|
||||||
request,
|
request,
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ from chat.services.multi_state_update import compute_state_updates_for_present
|
|||||||
from chat.services.prompt import (
|
from chat.services.prompt import (
|
||||||
assemble_narrative_prompt,
|
assemble_narrative_prompt,
|
||||||
consume_pending_meanwhile_digests,
|
consume_pending_meanwhile_digests,
|
||||||
|
trim_to_max_beats,
|
||||||
)
|
)
|
||||||
from chat.services.rewind import compute_rewind_preview, execute_rewind
|
from chat.services.rewind import compute_rewind_preview, execute_rewind
|
||||||
from chat.services.scene_close import detect_scene_close
|
from chat.services.scene_close import detect_scene_close
|
||||||
@@ -482,6 +483,11 @@ async def post_turn(
|
|||||||
_in_flight_tasks.pop(chat_id, None)
|
_in_flight_tasks.pop(chat_id, None)
|
||||||
|
|
||||||
primary_text = "".join(primary_accumulated)
|
primary_text = "".join(primary_accumulated)
|
||||||
|
# Belt-and-suspenders: trim to 3 beats max even if the model
|
||||||
|
# ignored the "HARD CAP: 2-3 beats" prompt instruction. Roleplay-
|
||||||
|
# tuned narrators are reliably verbose; a physical max_tokens
|
||||||
|
# truncates mid-word, this trims at a beat boundary.
|
||||||
|
primary_text = trim_to_max_beats(primary_text, max_beats=3)
|
||||||
|
|
||||||
# 7. Append the assistant_turn with the final text. (See note above on
|
# 7. Append the assistant_turn with the final text. (See note above on
|
||||||
# why we skip ``project`` for these transcript-only event kinds.)
|
# why we skip ``project`` for these transcript-only event kinds.)
|
||||||
@@ -677,6 +683,10 @@ async def post_turn(
|
|||||||
_in_flight_tasks.pop(chat_id, None)
|
_in_flight_tasks.pop(chat_id, None)
|
||||||
|
|
||||||
interjection_text = "".join(interject_accumulated)
|
interjection_text = "".join(interject_accumulated)
|
||||||
|
# Same beat-cap as the primary turn — interjections are
|
||||||
|
# by definition short, but Cydonia-class narrators ignore
|
||||||
|
# that. 2 beats is plenty for a chime-in.
|
||||||
|
interjection_text = trim_to_max_beats(interjection_text, max_beats=2)
|
||||||
|
|
||||||
# Capture the event id (T86 follow-up) so the SSE fragment
|
# Capture the event id (T86 follow-up) so the SSE fragment
|
||||||
# below carries ``id="turn-<n>"`` for in-place swap.
|
# below carries ``id="turn-<n>"`` for in-place swap.
|
||||||
|
|||||||
@@ -0,0 +1,205 @@
|
|||||||
|
# Audit: `project()` callers and non-idempotent projector handlers
|
||||||
|
|
||||||
|
**Date:** 2026-04-27
|
||||||
|
**Triggering incident:** commit `0f8bf94` — `kickoff_post` 500'd with
|
||||||
|
`sqlite3.IntegrityError: UNIQUE constraint failed: chats.id` after a
|
||||||
|
second bot's kickoff. Root cause: the route appended events with
|
||||||
|
`append_event()` and then called `project(conn)`, which replays the
|
||||||
|
*entire* event log. The `chat_created` handler in `chat/state/world.py`
|
||||||
|
uses raw `INSERT INTO chats ...` (no `OR REPLACE`/`OR IGNORE`), so on a
|
||||||
|
DB that already had a first bot's chat row, the replay re-hit that row
|
||||||
|
and raised.
|
||||||
|
|
||||||
|
This audit walks the rest of the live request paths to make sure no
|
||||||
|
other route has the same shape, and inventories every projector handler
|
||||||
|
that uses raw `INSERT` so the trade-offs are documented for future
|
||||||
|
hardening passes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 1 — `project()` callers
|
||||||
|
|
||||||
|
`grep -rn "project(" chat/ --include="*.py"` (excluding the definition
|
||||||
|
itself in `chat/eventlog/projector.py:17` and the local `project_id`
|
||||||
|
type variables that the regex doesn't actually catch):
|
||||||
|
|
||||||
|
| File:line | Caller | Classification |
|
||||||
|
|---|---|---|
|
||||||
|
| `chat/web/bots.py:113` | `bot_create` route — append `bot_authored` then `project(conn)` | **Unsafe (live path) — fixed** |
|
||||||
|
| `chat/web/settings.py:44` | `settings_post` route — append `you_authored` then `project(conn)` | **Unsafe (live path) — fixed** |
|
||||||
|
| `chat/services/rewind.py:110` | `execute_rewind` — clears every projected table then re-projects from the truncated log | **Safe (replay-only)** |
|
||||||
|
| `chat/eventlog/projector.py:17` | Definition site, not a call | n/a |
|
||||||
|
| `tests/test_*.py` (~50 tests) | Test setup pattern: append a sequence of synthetic events into a fresh DB, then `project(conn)` to materialise | **Safe (replay-only)** — projects against an empty/fresh DB; not a live request path |
|
||||||
|
|
||||||
|
### Safe (replay-only)
|
||||||
|
|
||||||
|
- **`chat/services/rewind.py:110`** — `execute_rewind` is the canonical
|
||||||
|
"rebuild the projection" entry point. Lines 95–104 explicitly
|
||||||
|
`DELETE FROM` every projected table (`memories`, `activity`, `scenes`,
|
||||||
|
`containers`, `chat_state`, `chats`, `edges`, `bots`, `you_entity`,
|
||||||
|
`classifier_failures`) before calling `project(conn)`. The handler
|
||||||
|
registry then walks the truncated log against empty tables, so even
|
||||||
|
the raw-INSERT handlers run safely on a clean slate. The module
|
||||||
|
docstring (lines 1–21) calls out exactly why a full replay (rather
|
||||||
|
than a "revert delta") is the right move here: the `edge_update`
|
||||||
|
handler is a delta accumulator with no clean inverse. **Do not
|
||||||
|
change.**
|
||||||
|
|
||||||
|
- **Test suite** — every `from chat.eventlog.projector import project`
|
||||||
|
in `tests/` is a setup helper. They open a fresh in-memory or
|
||||||
|
tmp-path DB, append a hand-crafted sequence of events, and call
|
||||||
|
`project(conn)` once. There is no second-replay risk because the DB
|
||||||
|
starts empty. These are not live paths.
|
||||||
|
|
||||||
|
### Unsafe (live-path) — fixed in this audit
|
||||||
|
|
||||||
|
Both fixes follow the pattern established by `0f8bf94`: drop the
|
||||||
|
`append_event` + `project` pair in favour of `append_and_apply` (defined
|
||||||
|
in `chat/eventlog/log.py:32`), which appends and runs *only the
|
||||||
|
brand-new event* through its registered handler.
|
||||||
|
|
||||||
|
- **`chat/web/bots.py:113` — `bot_create`**
|
||||||
|
Was: `append_event(conn, kind="bot_authored", ...); project(conn)`.
|
||||||
|
Now: `append_and_apply(conn, kind="bot_authored", ...)`.
|
||||||
|
In isolation, `_apply_bot_authored` is itself idempotent (`INSERT OR
|
||||||
|
REPLACE INTO bots`), so the *route* didn't fail today. The bug is
|
||||||
|
latent: as soon as any kickoff ran first (which produces
|
||||||
|
`chat_created` events), the next call to `bot_create` would replay
|
||||||
|
that prior `chat_created` and trip the same UNIQUE constraint. We
|
||||||
|
saw this happen in `0f8bf94` — fixing the symmetric route prevents
|
||||||
|
the next variant of the same incident.
|
||||||
|
Removed unused imports: `append_event`, `project`.
|
||||||
|
|
||||||
|
- **`chat/web/settings.py:44` — `settings_post`**
|
||||||
|
Was: `append_event(conn, kind="you_authored", ...); project(conn)`.
|
||||||
|
Now: `append_and_apply(conn, kind="you_authored", ...)`.
|
||||||
|
Same shape as `bot_create`. `_apply_you_authored` is idempotent on
|
||||||
|
its own (`INSERT OR REPLACE INTO you_entity`), but `project()` walks
|
||||||
|
the *whole* log, including any `chat_created` / `container_created`
|
||||||
|
/ `scene_opened` events that have accumulated. Editing the user's
|
||||||
|
own settings on a non-empty DB would 500 with the same UNIQUE
|
||||||
|
constraint error — not because the new event is unsafe, but because
|
||||||
|
the replay is. Fixed by per-event apply.
|
||||||
|
Removed unused imports: `append_event`, `project`.
|
||||||
|
|
||||||
|
### Unsafe — still to fix
|
||||||
|
|
||||||
|
None. The two unsafe live-path call sites identified above were both
|
||||||
|
fixed in this commit. Future hardening: a CI lint that flags
|
||||||
|
`project(` outside `chat/services/rewind.py` and `tests/` would catch a
|
||||||
|
regression, but that's out of scope here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 2 — non-idempotent projector handler inventory
|
||||||
|
|
||||||
|
Output of `grep -n "INSERT INTO\|INSERT OR REPLACE\|INSERT OR IGNORE"
|
||||||
|
chat/state/*.py`, classified.
|
||||||
|
|
||||||
|
### Replay-safe handlers
|
||||||
|
|
||||||
|
These either use `INSERT OR REPLACE` / `INSERT OR IGNORE` (so a second
|
||||||
|
apply is a no-op or an overwrite of identical data), or are pure
|
||||||
|
`UPDATE` against rows the prior event created.
|
||||||
|
|
||||||
|
| Handler | File | Statement | Why safe |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `_apply_bot_authored` | `chat/state/entities.py:12` | `INSERT OR REPLACE INTO bots` | `id` is the natural PK; replay overwrites with identical payload. |
|
||||||
|
| `_apply_you_authored` | `chat/state/entities.py:29` | `INSERT OR REPLACE INTO you_entity` | Singleton row keyed on `id=1`. |
|
||||||
|
| `_apply_activity_change` | `chat/state/world.py:98` | `INSERT OR REPLACE INTO activity` | Activity is keyed on `entity_id` — last write wins is exactly the intended semantics. |
|
||||||
|
| `_apply_thread_opened` | `chat/state/threads.py:12` | `INSERT OR IGNORE INTO threads` | `thread_id` is the natural PK. |
|
||||||
|
| `_apply_event_planned` | `chat/state/events.py:16` | `INSERT OR IGNORE INTO events` | `event_id` is the natural PK. |
|
||||||
|
| `_apply_branch_created` | `chat/state/branches.py:27` | `INSERT OR IGNORE INTO branches` | Branch `name` is unique. |
|
||||||
|
| `_apply_group_node_initialized` | `chat/state/group_node.py:12` | `INSERT OR REPLACE INTO group_node` | One row per `chat_id`. |
|
||||||
|
| `_apply_embedding_indexed` | `chat/state/embeddings.py:28` | `INSERT OR REPLACE INTO embeddings` | One vector per `memory_id`. |
|
||||||
|
| Pure-`UPDATE` handlers | various — `_apply_time_skip_*`, `_apply_guest_added`/`_removed`, `_apply_scene_closed`, `_apply_memory_significance_set`, `_apply_memory_pin_changed`, `_apply_meanwhile_scene_closed`, `_apply_meanwhile_digest_consumed`, `_apply_thread_updated`, `_apply_event_started`/`_completed`/`_cancelled` (etc.), `_apply_group_node_updated` | n/a | Idempotent: re-applying the same UPDATE produces the same row state. |
|
||||||
|
|
||||||
|
### Unsafe-on-replay handlers (raw `INSERT`)
|
||||||
|
|
||||||
|
| Handler | File | Statement | Failure mode on replay |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `_apply_chat_created` | `chat/state/world.py:14` | `INSERT INTO chats`, `INSERT INTO chat_state` | `chats.id` is PK — second insert raises `IntegrityError: UNIQUE constraint failed: chats.id`. **This is the `0f8bf94` bug.** `chat_state.chat_id` is also unique; would raise too. |
|
||||||
|
| `_apply_container_created` | `chat/state/world.py:78` | `INSERT INTO containers` | `containers.id` is `INTEGER PRIMARY KEY AUTOINCREMENT` — replay does NOT raise (a new id is assigned), but it silently creates a duplicate row, fragmenting downstream lookups by `(chat_id, name)`. **Silent corruption, not a crash.** |
|
||||||
|
| `_apply_scene_opened` | `chat/state/world.py:115` | `INSERT INTO scenes` | Same shape: autoincrement `id`. Replay creates a duplicate scene row and re-points `chat_state.active_scene_id` to the new copy. **Silent corruption.** |
|
||||||
|
| `_apply_memory_written` | `chat/state/memory.py:14` | `INSERT INTO memories` | Autoincrement `id`. Replay duplicates the memory; FTS5 trigger then double-indexes the same `pov_summary`. **Silent corruption + double-counting in retrieval.** |
|
||||||
|
| `_apply_meanwhile_scene_started` | `chat/state/meanwhile.py:29` | `INSERT INTO scenes` (with explicit `scene_id`) | Caller supplies `scene_id` (deterministic). Replay raises `IntegrityError: UNIQUE constraint failed: scenes.id`. **Hard crash, like `chat_created`.** |
|
||||||
|
| `_apply_meanwhile_digest_created` | `chat/state/meanwhile.py:67` | `INSERT INTO meanwhile_digest_pending` | Autoincrement `id`. Replay creates a duplicate pending digest, surfacing the same summary twice in the next you-scene's prompt. **Silent corruption.** |
|
||||||
|
| `_apply_edge_update` | `chat/state/edges.py:12` | `INSERT OR IGNORE INTO edges` followed by `UPDATE … SET affinity = ? + delta` | The `INSERT OR IGNORE` is fine, but the handler is *delta-shaped* — each replay re-adds `affinity_delta` and `trust_delta`, and re-extends `knowledge_json`. **Silent corruption: scores drift up; knowledge facts duplicate.** Already called out in `chat/eventlog/log.py:39-46` as the canonical reason `append_and_apply` exists. |
|
||||||
|
|
||||||
|
### Trade-offs — why we are NOT switching every handler to `INSERT OR REPLACE`
|
||||||
|
|
||||||
|
This is the part the audit is here to nail down before someone "fixes
|
||||||
|
it" with a one-line s/`INSERT INTO`/`INSERT OR REPLACE INTO`/.
|
||||||
|
|
||||||
|
1. **Autoincrement-id handlers (`containers`, `scenes`, `memories`,
|
||||||
|
`meanwhile_digest_pending`)** — `INSERT OR REPLACE` doesn't help.
|
||||||
|
Each event's payload doesn't carry the row's eventual id — the id
|
||||||
|
comes from `lastrowid` *at projection time*. There is no key for
|
||||||
|
`OR REPLACE` to match on. The fix here is either (a) make the event
|
||||||
|
carry a deterministic id derived from the event's own id (large
|
||||||
|
refactor — payload schemas, downstream FK lookups, FTS rowid
|
||||||
|
alignment), or (b) keep the handler raw-INSERT and ensure every
|
||||||
|
live path uses `append_and_apply` (the path we're on). We are on
|
||||||
|
path (b), and this audit makes it explicit.
|
||||||
|
|
||||||
|
2. **`chat_created`** — `chats.id` IS keyed on the natural PK, so
|
||||||
|
`INSERT OR REPLACE INTO chats ...` would technically work for the
|
||||||
|
chat row. *But* it would silently overwrite `chat_state` columns
|
||||||
|
that other events legitimately mutate later: `chat_state.time` is
|
||||||
|
bumped by `time_skip_elision`, `active_scene_id` is set/cleared by
|
||||||
|
`scene_opened`/`scene_closed`. On replay the
|
||||||
|
`chat_created` overwrite would clobber those subsequent updates,
|
||||||
|
then later events would re-set them — *if* the events themselves
|
||||||
|
appear in order (they do today). It would work in practice, but it
|
||||||
|
would erase the invariant that "each handler is responsible for one
|
||||||
|
table-shape change" and make the projector's correctness depend on
|
||||||
|
strict event-order replay through `chat_state`. Not worth the
|
||||||
|
subtle coupling; keep the raw INSERT and treat replay as an
|
||||||
|
explicit "wipe + replay" operation (the rewind path does exactly
|
||||||
|
that).
|
||||||
|
|
||||||
|
3. **`meanwhile_scene_started`** — could be made idempotent (the
|
||||||
|
payload supplies `scene_id`), but it shares the `scenes` table with
|
||||||
|
`_apply_scene_opened` (autoincrement) — making one half of the
|
||||||
|
table writers `OR REPLACE` and the other half raw-INSERT is asking
|
||||||
|
for a future bug. Keep both raw, lean on `append_and_apply`.
|
||||||
|
|
||||||
|
4. **`edge_update`** — fundamentally cannot be made idempotent under
|
||||||
|
replay without either changing the event schema (carry absolute
|
||||||
|
values, not deltas) or recording per-event-id "already applied"
|
||||||
|
flags. Either is a multi-week project. The current contract is
|
||||||
|
"edge_update is a delta event; never apply it twice"; the
|
||||||
|
`append_and_apply` rule enforces that contract from the call site.
|
||||||
|
|
||||||
|
**Conclusion:** the handler layer is *correctly* non-idempotent for
|
||||||
|
event-sourcing semantics. The defect class lives in the *caller* layer
|
||||||
|
(routes that mistakenly call `project()` instead of `append_and_apply`).
|
||||||
|
This audit fixes the two known offenders and pins the contract with a
|
||||||
|
regression test (see Step 3).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Step 3 — regression test
|
||||||
|
|
||||||
|
Added `tests/test_chat_created_non_idempotent.py`. The test:
|
||||||
|
|
||||||
|
1. Opens a fresh DB and runs the migration chain.
|
||||||
|
2. Appends one `chat_created` event and projects — first projection
|
||||||
|
succeeds.
|
||||||
|
3. Appends a *second* `chat_created` for the same chat id and projects
|
||||||
|
again — asserts that the second projection raises
|
||||||
|
`sqlite3.IntegrityError`.
|
||||||
|
|
||||||
|
The point isn't that the test catches a future "make it idempotent"
|
||||||
|
change automatically; it's that any such change MUST update this test,
|
||||||
|
forcing a deliberate review of all the trade-offs documented above.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files changed
|
||||||
|
|
||||||
|
- `chat/web/bots.py` — swap `append_event`+`project` → `append_and_apply`,
|
||||||
|
drop unused imports.
|
||||||
|
- `chat/web/settings.py` — same swap.
|
||||||
|
- `tests/test_chat_created_non_idempotent.py` — new regression test.
|
||||||
|
- `docs/audits/2026-04-27-project-callers.md` — this file.
|
||||||
Executable
+38
@@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Start the local mlx-omni-server that serves the classifier + embedding
|
||||||
|
# models. The chat app's RoutedLLMClient routes everything except the
|
||||||
|
# narrative model to this server; with no MLX server running, classifier
|
||||||
|
# calls fail and embeddings degrade to the zero-vector fallback.
|
||||||
|
#
|
||||||
|
# Run in the foreground:
|
||||||
|
# ./scripts/start_mlx_server.sh
|
||||||
|
# Run as a background daemon (logs to data/mlx-server.log):
|
||||||
|
# ./scripts/start_mlx_server.sh --daemon
|
||||||
|
#
|
||||||
|
# Models are pulled from Hugging Face on first request; expect a delay
|
||||||
|
# the first time you exercise the classifier or embedding path.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
VENV="${REPO_ROOT}/.mlx-venv"
|
||||||
|
LOG="${REPO_ROOT}/data/mlx-server.log"
|
||||||
|
PORT="${MLX_PORT:-10240}"
|
||||||
|
HOST="${MLX_HOST:-127.0.0.1}"
|
||||||
|
|
||||||
|
if [ ! -x "${VENV}/bin/mlx-omni-server" ]; then
|
||||||
|
echo "error: mlx-omni-server not installed in ${VENV}" >&2
|
||||||
|
echo "create the venv with:" >&2
|
||||||
|
echo " python3.12 -m venv ${VENV} && ${VENV}/bin/pip install mlx-omni-server" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "${1:-}" = "--daemon" ]; then
|
||||||
|
mkdir -p "$(dirname "${LOG}")"
|
||||||
|
nohup "${VENV}/bin/mlx-omni-server" --host "${HOST}" --port "${PORT}" \
|
||||||
|
>>"${LOG}" 2>&1 &
|
||||||
|
echo "mlx-omni-server started in background (pid $!)"
|
||||||
|
echo "logs: ${LOG}"
|
||||||
|
else
|
||||||
|
exec "${VENV}/bin/mlx-omni-server" --host "${HOST}" --port "${PORT}"
|
||||||
|
fi
|
||||||
@@ -0,0 +1,69 @@
|
|||||||
|
"""Pin the contract: ``_apply_chat_created`` is NOT replay-safe.
|
||||||
|
|
||||||
|
See ``docs/audits/2026-04-27-project-callers.md`` for the full audit.
|
||||||
|
The handler at ``chat/state/world.py:_apply_chat_created`` uses raw
|
||||||
|
``INSERT INTO chats ...`` and ``INSERT INTO chat_state ...`` with no
|
||||||
|
``OR REPLACE``/``OR IGNORE``. Running ``project()`` twice over the same
|
||||||
|
``chat_created`` event MUST raise ``sqlite3.IntegrityError`` on the
|
||||||
|
second pass — this is the bug that produced the 500 fixed in commit
|
||||||
|
``0f8bf94`` (and the latent equivalents fixed in this commit).
|
||||||
|
|
||||||
|
Pinning the contract here means any future "make it idempotent" change
|
||||||
|
to the handler MUST update this test, which forces a deliberate review
|
||||||
|
of the trade-offs: most notably, that ``chat_state`` columns mutated by
|
||||||
|
later events (``time_skip_elision`` bumps ``time``;
|
||||||
|
``scene_opened``/``scene_closed`` toggle ``active_scene_id``) would be
|
||||||
|
silently overwritten by an ``INSERT OR REPLACE`` on every replay. The
|
||||||
|
audit explains why we keep the handler raw-INSERT and enforce the rule
|
||||||
|
at the call site via ``append_and_apply`` instead.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from chat.db.connection import open_db
|
||||||
|
from chat.db.migrate import apply_migrations
|
||||||
|
from chat.eventlog.log import append_event
|
||||||
|
from chat.eventlog.projector import project
|
||||||
|
import chat.state.world # noqa: F401 — import registers the handler
|
||||||
|
|
||||||
|
|
||||||
|
def _chat_payload():
|
||||||
|
return {
|
||||||
|
"id": "chat_bot_a",
|
||||||
|
"host_bot_id": "bot_a",
|
||||||
|
"guest_bot_id": None,
|
||||||
|
"initial_time": "2026-04-27T12:00:00+00:00",
|
||||||
|
"narrative_anchor": "Day 1 noon",
|
||||||
|
"weather": "clear",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_chat_created_handler_is_not_replay_safe(tmp_path):
|
||||||
|
"""A second projection over an extra ``chat_created`` for the same id raises.
|
||||||
|
|
||||||
|
This is the exact failure shape from incident ``0f8bf94``: a raw
|
||||||
|
INSERT against ``chats.id`` (PK) trips ``UNIQUE constraint failed``
|
||||||
|
on the second pass. If this test ever starts FAILING (i.e. the
|
||||||
|
second project() succeeds), someone has changed the handler to be
|
||||||
|
idempotent — read the audit before approving.
|
||||||
|
"""
|
||||||
|
db = tmp_path / "t.db"
|
||||||
|
apply_migrations(db)
|
||||||
|
with open_db(db) as conn:
|
||||||
|
# First chat_created + first project: must succeed.
|
||||||
|
append_event(conn, kind="chat_created", payload=_chat_payload())
|
||||||
|
project(conn)
|
||||||
|
|
||||||
|
# Append a SECOND chat_created with the same id. project() will
|
||||||
|
# walk both, re-INSERT the same chats row, and trip the UNIQUE
|
||||||
|
# constraint on chats.id.
|
||||||
|
append_event(conn, kind="chat_created", payload=_chat_payload())
|
||||||
|
with pytest.raises(sqlite3.IntegrityError) as exc_info:
|
||||||
|
project(conn)
|
||||||
|
# Match on the column to make sure we caught the *intended*
|
||||||
|
# constraint, not some unrelated FK/check failure that happens
|
||||||
|
# to also be an IntegrityError.
|
||||||
|
assert "chats.id" in str(exc_info.value)
|
||||||
@@ -1,10 +1,12 @@
|
|||||||
"""Tests for FeatherlessClient (Phase 4.5+).
|
"""Tests for FeatherlessClient (Phase 4.5+).
|
||||||
|
|
||||||
Phase 4.5 adds an ``embed()`` method to the LLMClient Protocol (T112).
|
Phase 4.5 adds an ``embed()`` method to the LLMClient Protocol (T112).
|
||||||
Featherless does not expose an OpenAI-compatible ``/v1/embeddings``
|
Featherless's OpenAI-compatible surface routes ``/v1/embeddings`` but
|
||||||
endpoint, so its implementation deliberately raises
|
every request returns HTTP 500 ``{"type": "completions_error"}`` (the
|
||||||
``NotImplementedError`` to surface the gap clearly. The
|
router accepts the URL but the backend has no embedding handler), and
|
||||||
``generate_embedding`` wrapper catches this and degrades to the
|
``/v1/models`` lists no embedding-class models. The implementation
|
||||||
|
raises ``NotImplementedError`` rather than ship a request that always
|
||||||
|
errors; ``generate_embedding`` catches it and degrades to the
|
||||||
zero-vector fallback (the existing T107 warning path).
|
zero-vector fallback (the existing T107 warning path).
|
||||||
|
|
||||||
If/when Featherless ships embeddings, swap the body for a real call to
|
If/when Featherless ships embeddings, swap the body for a real call to
|
||||||
@@ -20,10 +22,11 @@ from chat.llm.featherless import FeatherlessClient
|
|||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_featherless_embed_raises_not_implemented():
|
async def test_featherless_embed_raises_not_implemented():
|
||||||
"""Featherless does not expose ``/v1/embeddings`` — embed() must
|
"""Featherless's ``/v1/embeddings`` always 500s with
|
||||||
raise ``NotImplementedError`` so callers (``generate_embedding``)
|
``"completions_error"`` and its model catalog has no embedding
|
||||||
can degrade to the fallback zero vector + warning rather than
|
class — embed() must raise ``NotImplementedError`` so callers
|
||||||
silently producing useless output."""
|
(``generate_embedding``) can degrade to the fallback zero vector
|
||||||
|
+ warning rather than silently producing useless output."""
|
||||||
client = FeatherlessClient(api_key="test-key")
|
client = FeatherlessClient(api_key="test-key")
|
||||||
with pytest.raises(NotImplementedError) as excinfo:
|
with pytest.raises(NotImplementedError) as excinfo:
|
||||||
await client.embed("hello world", model="bge-small-en-v1.5")
|
await client.embed("hello world", model="bge-small-en-v1.5")
|
||||||
|
|||||||
@@ -0,0 +1,84 @@
|
|||||||
|
"""Tests for LocalMLXClient (Phase 4.5+).
|
||||||
|
|
||||||
|
Talks to a local mlx-omni-server over the OpenAI-compatible surface.
|
||||||
|
We don't spin up a real server in tests — instead we monkey-patch the
|
||||||
|
underlying ``AsyncOpenAI`` instance to assert on the request shape and
|
||||||
|
return canned responses. The semaphore behavior is shared with
|
||||||
|
FeatherlessClient (same pattern), so we don't re-test that here.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from chat.llm.client import Message
|
||||||
|
from chat.llm.local_mlx import LocalMLXClient
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeChatCompletions:
|
||||||
|
def __init__(self, response):
|
||||||
|
self.response = response
|
||||||
|
self.calls = []
|
||||||
|
|
||||||
|
async def create(self, **kw):
|
||||||
|
self.calls.append(kw)
|
||||||
|
return self.response
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeEmbeddings:
|
||||||
|
def __init__(self, vector):
|
||||||
|
self.vector = vector
|
||||||
|
self.calls = []
|
||||||
|
|
||||||
|
async def create(self, **kw):
|
||||||
|
self.calls.append(kw)
|
||||||
|
return SimpleNamespace(data=[SimpleNamespace(embedding=self.vector)])
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_local_mlx_client_generate_calls_chat_completions():
|
||||||
|
client = LocalMLXClient(base_url="http://localhost:10240/v1")
|
||||||
|
fake_response = SimpleNamespace(
|
||||||
|
choices=[SimpleNamespace(message=SimpleNamespace(content="hello"))]
|
||||||
|
)
|
||||||
|
fake_chat = _FakeChatCompletions(fake_response)
|
||||||
|
client._client.chat = SimpleNamespace(completions=fake_chat)
|
||||||
|
|
||||||
|
out = await client.generate(
|
||||||
|
[Message(role="user", content="hi")],
|
||||||
|
model="mlx-community/Hermes-3-Llama-3.1-8B-8bit",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert out == "hello"
|
||||||
|
assert len(fake_chat.calls) == 1
|
||||||
|
assert fake_chat.calls[0]["model"] == "mlx-community/Hermes-3-Llama-3.1-8B-8bit"
|
||||||
|
assert fake_chat.calls[0]["messages"] == [{"role": "user", "content": "hi"}]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_local_mlx_client_embed_returns_vector():
|
||||||
|
"""``embed()`` actually works on this client (unlike FeatherlessClient
|
||||||
|
which raises NotImplementedError) — the local MLX server has a real
|
||||||
|
``/v1/embeddings`` endpoint backed by an MLX-quantized model.
|
||||||
|
"""
|
||||||
|
client = LocalMLXClient()
|
||||||
|
canned = [0.1, 0.2, 0.3, 0.4]
|
||||||
|
fake_embeddings = _FakeEmbeddings(canned)
|
||||||
|
client._client.embeddings = fake_embeddings
|
||||||
|
|
||||||
|
out = await client.embed("hello", model="mlx-community/bge-small-en-v1.5-bf16")
|
||||||
|
|
||||||
|
assert out == canned
|
||||||
|
assert fake_embeddings.calls[0]["model"] == "mlx-community/bge-small-en-v1.5-bf16"
|
||||||
|
assert fake_embeddings.calls[0]["input"] == "hello"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_local_mlx_client_default_base_url():
|
||||||
|
"""Default base_url targets ``mlx-omni-server`` on its standard port."""
|
||||||
|
client = LocalMLXClient()
|
||||||
|
# AsyncOpenAI normalizes trailing-slash differences; just check the
|
||||||
|
# configured host:port appears in the underlying client config.
|
||||||
|
assert "127.0.0.1:10240" in str(client._client.base_url)
|
||||||
+64
-8
@@ -21,7 +21,11 @@ import chat.state.world # noqa: F401
|
|||||||
import chat.state.events # noqa: F401
|
import chat.state.events # noqa: F401
|
||||||
import chat.state.threads # noqa: F401
|
import chat.state.threads # noqa: F401
|
||||||
from chat.llm.client import Message
|
from chat.llm.client import Message
|
||||||
from chat.services.prompt import _witness_role_for, assemble_narrative_prompt
|
from chat.services.prompt import (
|
||||||
|
_witness_role_for,
|
||||||
|
assemble_narrative_prompt,
|
||||||
|
trim_to_max_beats,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _seed_basic(conn) -> None:
|
def _seed_basic(conn) -> None:
|
||||||
@@ -565,8 +569,12 @@ def test_tight_budget_drops_guest_activity_bullet_first(tmp_path):
|
|||||||
speaker_bot_id="bot_a",
|
speaker_bot_id="bot_a",
|
||||||
recent_dialogue=dialogue,
|
recent_dialogue=dialogue,
|
||||||
retrieved_memory_summaries=[],
|
retrieved_memory_summaries=[],
|
||||||
budget_soft=250,
|
# Closing instruction grew with the asterisk-format spec
|
||||||
budget_hard=340,
|
# (Phase 4.6 narrative-style fix). Budget bumped enough to
|
||||||
|
# accommodate the larger MUST floor while still exercising
|
||||||
|
# the SHOULD-tier trim path.
|
||||||
|
budget_soft=480,
|
||||||
|
budget_hard=510,
|
||||||
)
|
)
|
||||||
body = msgs[0].content
|
body = msgs[0].content
|
||||||
# Speaker bullet survives (MUST-tier floor).
|
# Speaker bullet survives (MUST-tier floor).
|
||||||
@@ -696,13 +704,15 @@ def test_nice_trim_order_documented(tmp_path):
|
|||||||
# Soft tuned so the all-NICE config (with the heavy previous
|
# Soft tuned so the all-NICE config (with the heavy previous
|
||||||
# scene summary) overflows, but dropping just previous-scene
|
# scene summary) overflows, but dropping just previous-scene
|
||||||
# fits comfortably. Hard set high so SHOULD-tier never trims.
|
# fits comfortably. Hard set high so SHOULD-tier never trims.
|
||||||
|
# Soft bumped (was 400) to make room for the larger closing
|
||||||
|
# instruction shipped with the asterisk-format spec.
|
||||||
msgs = assemble_narrative_prompt(
|
msgs = assemble_narrative_prompt(
|
||||||
conn,
|
conn,
|
||||||
chat_id="chat_bot_a",
|
chat_id="chat_bot_a",
|
||||||
speaker_bot_id="bot_a",
|
speaker_bot_id="bot_a",
|
||||||
recent_dialogue=dialogue,
|
recent_dialogue=dialogue,
|
||||||
retrieved_memory_summaries=memories,
|
retrieved_memory_summaries=memories,
|
||||||
budget_soft=400,
|
budget_soft=540,
|
||||||
budget_hard=8000,
|
budget_hard=8000,
|
||||||
)
|
)
|
||||||
body = msgs[0].content
|
body = msgs[0].content
|
||||||
@@ -748,8 +758,12 @@ def test_assemble_with_tight_budget_drops_guest_activity_first(tmp_path):
|
|||||||
# group node + other edges) push it well over 380. budget_hard
|
# group node + other edges) push it well over 380. budget_hard
|
||||||
# is set just above MUST core so SHOULD-tier blocks must be
|
# is set just above MUST core so SHOULD-tier blocks must be
|
||||||
# trimmed away.
|
# trimmed away.
|
||||||
budget_soft=250,
|
# Closing instruction grew with the asterisk-format spec
|
||||||
budget_hard=340,
|
# (Phase 4.6 narrative-style fix). Budget bumped enough to
|
||||||
|
# accommodate the larger MUST floor while still exercising
|
||||||
|
# the SHOULD-tier trim path.
|
||||||
|
budget_soft=480,
|
||||||
|
budget_hard=510,
|
||||||
)
|
)
|
||||||
body = msgs[0].content
|
body = msgs[0].content
|
||||||
# MUST: speaker identity, edge to addressee, last 4 dialogue turns.
|
# MUST: speaker identity, edge to addressee, last 4 dialogue turns.
|
||||||
@@ -759,10 +773,11 @@ def test_assemble_with_tight_budget_drops_guest_activity_first(tmp_path):
|
|||||||
assert f"line-{i:02d}" in body
|
assert f"line-{i:02d}" in body
|
||||||
# Guest activity (SHOULD-tier) must be dropped under tight budget.
|
# Guest activity (SHOULD-tier) must be dropped under tight budget.
|
||||||
assert "smirking-distinctively" not in body
|
assert "smirking-distinctively" not in body
|
||||||
# Token budget honoured.
|
# Token budget honoured. Bumped (was 340) for the larger closing
|
||||||
|
# instruction that ships the asterisk-format spec.
|
||||||
import tiktoken
|
import tiktoken
|
||||||
enc = tiktoken.get_encoding("cl100k_base")
|
enc = tiktoken.get_encoding("cl100k_base")
|
||||||
assert len(enc.encode(body)) <= 340
|
assert len(enc.encode(body)) <= 510
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -859,3 +874,44 @@ def test_witness_role_for_none_host_returns_host():
|
|||||||
# Sanity check: existing semantics preserved.
|
# Sanity check: existing semantics preserved.
|
||||||
assert _witness_role_for("bot_a", "bot_a") == "host"
|
assert _witness_role_for("bot_a", "bot_a") == "host"
|
||||||
assert _witness_role_for("bot_a", "bot_b") == "guest"
|
assert _witness_role_for("bot_a", "bot_b") == "guest"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# trim_to_max_beats — caps verbose narrative output to N beats
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_passthrough_when_under_cap():
|
||||||
|
assert trim_to_max_beats("", 3) == ""
|
||||||
|
assert trim_to_max_beats("plain text", 3) == "plain text"
|
||||||
|
two = "*She nods* okay. *She turns* see you."
|
||||||
|
assert trim_to_max_beats(two, 3) == two
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_passthrough_at_exactly_cap():
|
||||||
|
three = "*A* one. *B* two. *C* three."
|
||||||
|
assert trim_to_max_beats(three, 3) == three
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_cuts_at_fourth_beat():
|
||||||
|
"""Cydonia-style 4-beat output trimmed at the start of the 4th
|
||||||
|
asterisk action; trailing whitespace stripped."""
|
||||||
|
four = "*A* one. *B* two. *C* three. *D* four."
|
||||||
|
assert trim_to_max_beats(four, 3) == "*A* one. *B* two. *C* three."
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_handles_runaway_six_beats():
|
||||||
|
"""The exact failure mode that motivated this — verbose narrator
|
||||||
|
rambling for 6 beats when the prompt asked for 2-3."""
|
||||||
|
six = "*A* 1 *B* 2 *C* 3 *D* 4 *E* 5 *F* 6"
|
||||||
|
assert trim_to_max_beats(six, 3) == "*A* 1 *B* 2 *C* 3"
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_respects_lower_cap():
|
||||||
|
four = "*A* one. *B* two. *C* three. *D* four."
|
||||||
|
assert trim_to_max_beats(four, 2) == "*A* one. *B* two."
|
||||||
|
assert trim_to_max_beats(four, 1) == "*A* one."
|
||||||
|
|
||||||
|
|
||||||
|
def test_trim_to_max_beats_zero_returns_empty():
|
||||||
|
assert trim_to_max_beats("*A* one. *B* two.", 0) == ""
|
||||||
|
|||||||
@@ -0,0 +1,122 @@
|
|||||||
|
"""Tests for RoutedLLMClient (Phase 4.5+).
|
||||||
|
|
||||||
|
Splits traffic across two underlying clients based on the ``model``
|
||||||
|
kwarg. We use simple stub clients to assert the router picks the
|
||||||
|
correct backend for each call.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import AsyncIterator, Sequence
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from chat.llm.client import Message
|
||||||
|
from chat.llm.router import RoutedLLMClient
|
||||||
|
|
||||||
|
|
||||||
|
class _StubClient:
|
||||||
|
def __init__(self, name: str):
|
||||||
|
self.name = name
|
||||||
|
self.generate_calls: list[str] = []
|
||||||
|
self.stream_calls: list[str] = []
|
||||||
|
self.embed_calls: list[str] = []
|
||||||
|
|
||||||
|
async def generate(self, messages, *, model, **params) -> str:
|
||||||
|
self.generate_calls.append(model)
|
||||||
|
return f"{self.name}:{model}"
|
||||||
|
|
||||||
|
async def stream(self, messages, *, model, **params) -> AsyncIterator[str]:
|
||||||
|
self.stream_calls.append(model)
|
||||||
|
yield f"{self.name}:{model}"
|
||||||
|
|
||||||
|
async def embed(self, text, *, model) -> list[float]:
|
||||||
|
self.embed_calls.append(model)
|
||||||
|
return [1.0, 2.0]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_router_generate_routes_remote_model_to_remote_backend():
|
||||||
|
"""Any model id NOT starting with a local prefix goes to the remote
|
||||||
|
backend — narrative model, remote classifiers, anything else."""
|
||||||
|
narrative = _StubClient("narrative")
|
||||||
|
local = _StubClient("local")
|
||||||
|
router = RoutedLLMClient(
|
||||||
|
narrative=narrative,
|
||||||
|
local=local,
|
||||||
|
narrative_model="provider/big-model",
|
||||||
|
local_prefixes=("mlx-community/",),
|
||||||
|
)
|
||||||
|
|
||||||
|
out = await router.generate(
|
||||||
|
[Message(role="user", content="hi")], model="provider/big-model"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert out == "narrative:provider/big-model"
|
||||||
|
assert narrative.generate_calls == ["provider/big-model"]
|
||||||
|
assert local.generate_calls == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_router_generate_routes_local_prefix_to_local_backend():
|
||||||
|
"""Models prefixed with a local prefix (e.g. ``mlx-community/``)
|
||||||
|
go to the local MLX backend regardless of whether the rest of the
|
||||||
|
path looks like a remote provider id."""
|
||||||
|
narrative = _StubClient("narrative")
|
||||||
|
local = _StubClient("local")
|
||||||
|
router = RoutedLLMClient(
|
||||||
|
narrative=narrative,
|
||||||
|
local=local,
|
||||||
|
narrative_model="provider/big-model",
|
||||||
|
local_prefixes=("mlx-community/",),
|
||||||
|
)
|
||||||
|
|
||||||
|
out = await router.generate(
|
||||||
|
[Message(role="user", content="hi")],
|
||||||
|
model="mlx-community/Hermes-3-Llama-3.1-8B-8bit",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert out == "local:mlx-community/Hermes-3-Llama-3.1-8B-8bit"
|
||||||
|
assert local.generate_calls == ["mlx-community/Hermes-3-Llama-3.1-8B-8bit"]
|
||||||
|
assert narrative.generate_calls == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_router_stream_dispatches_by_prefix():
|
||||||
|
narrative = _StubClient("narrative")
|
||||||
|
local = _StubClient("local")
|
||||||
|
router = RoutedLLMClient(
|
||||||
|
narrative=narrative,
|
||||||
|
local=local,
|
||||||
|
narrative_model="provider/big-model",
|
||||||
|
local_prefixes=("mlx-community/",),
|
||||||
|
)
|
||||||
|
|
||||||
|
chunks_remote = [c async for c in router.stream(
|
||||||
|
[Message(role="user", content="hi")], model="provider/big-model"
|
||||||
|
)]
|
||||||
|
chunks_local = [c async for c in router.stream(
|
||||||
|
[Message(role="user", content="hi")],
|
||||||
|
model="mlx-community/Hermes-3-Llama-3.1-8B-8bit",
|
||||||
|
)]
|
||||||
|
|
||||||
|
assert chunks_remote == ["narrative:provider/big-model"]
|
||||||
|
assert chunks_local == ["local:mlx-community/Hermes-3-Llama-3.1-8B-8bit"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_router_embed_always_routes_to_local():
|
||||||
|
"""Embeddings always run locally — the remote provider doesn't
|
||||||
|
expose a working ``/v1/embeddings``, so the router never sends
|
||||||
|
embed calls there even if the model name happens to look 'remote'."""
|
||||||
|
narrative = _StubClient("narrative")
|
||||||
|
local = _StubClient("local")
|
||||||
|
router = RoutedLLMClient(
|
||||||
|
narrative=narrative, local=local, narrative_model="big-model"
|
||||||
|
)
|
||||||
|
|
||||||
|
out = await router.embed("hello", model="any-embedding-model")
|
||||||
|
|
||||||
|
assert out == [1.0, 2.0]
|
||||||
|
assert local.embed_calls == ["any-embedding-model"]
|
||||||
|
assert narrative.embed_calls == []
|
||||||
@@ -73,11 +73,25 @@ async def test_parse_turn_empty_prose_short_circuits_without_classifier_call():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_turn_raises_when_classifier_fails_twice():
|
async def test_parse_turn_falls_back_to_whole_prose_when_classifier_fails():
|
||||||
|
"""A flapping classifier (3 invalid responses) no longer 500s the
|
||||||
|
request. ``parse_turn`` returns the original prose as a single
|
||||||
|
``dialogue`` segment so the turn flow can keep moving — the
|
||||||
|
narrative will still fire on the prose, just without finer-grained
|
||||||
|
segment classification.
|
||||||
|
|
||||||
|
The old contract was ``RuntimeError`` (no default), but in
|
||||||
|
production that took down the whole turn endpoint with a 500 the
|
||||||
|
moment any classifier provider hiccuped — particularly painful in
|
||||||
|
multi-bot scenes where every user turn pays the parse_turn cost.
|
||||||
|
"""
|
||||||
mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
|
mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
|
||||||
with pytest.raises(RuntimeError):
|
result = await parse_turn(
|
||||||
await parse_turn(
|
mock,
|
||||||
mock,
|
model="m",
|
||||||
model="m",
|
prose='*shrugs* "whatever"',
|
||||||
prose='*shrugs* "whatever"',
|
)
|
||||||
)
|
assert len(result.segments) == 1
|
||||||
|
assert result.segments[0].kind == "dialogue"
|
||||||
|
assert result.segments[0].text == '*shrugs* "whatever"'
|
||||||
|
assert result.intent == "narrative"
|
||||||
|
|||||||
Reference in New Issue
Block a user