de7f6624f0
Four changes that compound: 1) **SQLite busy_timeout 5.0s -> 0.1s** in chat/db/connection.py. Root cause of the bulk of the slowness. The embedding worker contends for the WAL write lock while the request handler holds an open transaction; conn.execute's busy-wait does NOT release the GIL, so every state_update LLM call after the narrative was silently freezing the asyncio event loop for ~5s. With 0.1s the worker fails fast and logs (already handled), the chat keeps moving, and any missed embedding can be backfilled out of band. Also takes the test suite from ~290s -> 13s as a bonus. 2) **Parallel state-update pairs** in multi_state_update.py. Each directed (src, tgt) pair becomes a coroutine in asyncio.gather instead of a sequential for-loop. Returned order is preserved. 3) **Classifier on OpenRouter, provider-pinned to Cerebras**. New prefix-based router: model id with mlx-community/ -> local MLX, model == narrative_model -> narrative remote, else -> classifier remote. Settings.classifier_provider_order populates extra_body for the classifier client only (FeatherlessClient now accepts default_extra_body to merge into every chat.completions.create). Llama-3.1-8B on Cerebras runs at ~423 tok/s, ~10x the default provider. narrative still routes to mistral-nemo:nitro (Friendli). 4) **Cap classify max_tokens at 512**. A misbehaving classifier (response_format=json_object ignored) could otherwise generate thousands of tokens of prose before classify's JSON validation trips the retry. 512 is generous; usual completions are 50-150. CHAT_LLM_TIMING=1 env var enables per-call timing logs on stderr; zero overhead when unset. Useful for finding the slow link. Suite: 464 passed in 13s (was 290s).
84 lines
3.9 KiB
Python
84 lines
3.9 KiB
Python
from __future__ import annotations
|
|
import os
|
|
import tomllib
|
|
from pathlib import Path
|
|
from pydantic import BaseModel, Field
|
|
|
|
# Repository root: two directory levels above this module's resolved path.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Default on-disk locations, both under ``<repo>/data``. They are used
# when no override is supplied through the environment or config file.
DEFAULT_CONFIG = REPO_ROOT.joinpath("data", "config.toml")
DEFAULT_DB = REPO_ROOT.joinpath("data", "chat.db")
|
|
|
|
class Settings(BaseModel):
    """Typed application configuration.

    Every field except ``featherless_api_key`` carries a default, so the
    API key is the only value that must be supplied when constructing
    the model. ``load_settings`` in this module builds an instance from
    the TOML config file plus environment-variable overrides.
    """

    # Credentials and endpoint for the remote OpenAI-compatible API.
    # The key has no default and is therefore required.
    featherless_api_key: str
    featherless_base_url: str = "https://api.featherless.ai/v1"

    # Model ids. ``classifier_fallbacks`` is presumably tried in order
    # when the primary classifier model fails — confirm against the
    # caller that consumes it.
    narrative_model: str = "dphn/Dolphin-Mistral-24B-Venice-Edition"
    classifier_model: str = "NousResearch/Hermes-3-Llama-3.1-8B"
    classifier_fallbacks: list[str] = Field(
        default_factory=lambda: [
            "cognitivecomputations/dolphin-2.9.4-llama3-8b",
            "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated",
        ]
    )

    # NOTE(review): presumably the prefix that marks out-of-character
    # input — confirm against the code that reads it.
    ooc_marker: str = "(("
    # NOTE(review): presumably the number of items fetched per retrieval
    # query — confirm.
    retrieval_k: int = 4
    # NOTE(review): hard/soft prompt budgets for narrative calls; units
    # look like tokens — confirm where they are enforced.
    narrative_budget_hard: int = 8000
    narrative_budget_soft: int = 6000

    # Cap on each generated bot response. The asterisk-action format
    # (see ``_closing_instruction`` in chat/services/prompt.py) targets
    # 2-4 short interleaved action+dialogue beats; ~250 tokens fits that
    # without leaving room for the model to drift into multi-paragraph
    # inner-monologue prose. Bump back up if you want longer scenes;
    # drop to 150 for very terse banter.
    narrative_max_tokens: int = 250

    # Sampling temperature for narrative generation. 0.7 = grounded /
    # consistent; 0.85 = creative-but-in-character (default); 1.0 = wide
    # variety, can drift; >1.0 = often off-the-rails.
    narrative_temperature: float = 0.85

    # NOTE(review): classifier-side counterpart of the narrative hard
    # budget, and a per-call timeout in seconds (per the ``_s`` suffix)
    # — confirm where each is applied.
    classifier_budget_hard: int = 4000
    classifier_timeout_s: float = 30.0

    # Featherless free tier and lower paid tiers cap concurrent connections.
    # Set this to your account's max-concurrent-connections limit.
    featherless_max_concurrent: int = 2

    # Storage locations. Both defaults live under ``<repo>/data``.
    db_path: Path = DEFAULT_DB
    data_dir: Path = REPO_ROOT / "data"

    # Address the server binds to; the default is loopback-only.
    bind_host: str = "127.0.0.1"
    bind_port: int = 8000

    # Local MLX server (e.g. ``mlx-omni-server``) — serves any model
    # whose id starts with one of ``local_prefixes`` (default
    # ``"mlx-community/"``). The :class:`RoutedLLMClient` inspects the
    # ``model`` kwarg at call time: local-prefix -> local, else -> remote.
    # ``embed()`` always routes local.
    # NOTE(review): ``local_prefixes`` is not a field of this model, so
    # it is presumably configured on the router itself — confirm.
    local_mlx_base_url: str = "http://127.0.0.1:10240/v1"
    local_mlx_max_concurrent: int = 1

    # Optional OpenRouter-style provider pinning for the classifier
    # client. Maps to the ``provider`` field on chat.completions.create
    # via ``extra_body``; the FeatherlessClient (which is just an
    # AsyncOpenAI wrapper) merges it into every call. Useful for forcing
    # Llama-3.1-8B classifier traffic onto Cerebras (~423 tok/s, 10x
    # the default Nebius). Empty list = no pin (provider is
    # OpenRouter's choice).
    classifier_provider_order: list[str] = Field(default_factory=list)

    # T112 (Phase 4.5): embedding model identifier. Default is the
    # deterministic local pseudo so fresh installs / tests don't need
    # any external infra. Override via config.toml to a real model id
    # (e.g. ``"mlx-community/bge-small-en-v1.5-bf16"``) once a local
    # MLX server is running.
    embedding_model: str = "pseudo-sha256-384"
|
|
|
|
def load_settings() -> Settings:
    """Build the application ``Settings`` from the config file and environment.

    Sources, from lowest to highest precedence:

    1. Field defaults declared on ``Settings``.
    2. Keys in the TOML file named by ``CHAT_CONFIG_PATH`` (falling back
       to ``DEFAULT_CONFIG``). A missing file is treated as empty.
    3. The ``CHAT_DB_PATH`` and ``CHAT_DATA_DIR`` environment variables,
       which override ``db_path`` and ``data_dir`` respectively.

    When ``data_dir`` is not given by any source but ``db_path`` is, the
    data directory is derived from the database file's parent directory.

    Returns:
        The validated ``Settings`` instance.

    Raises:
        tomllib.TOMLDecodeError: If the config file is not valid TOML.
        UnicodeDecodeError: If the config file is not valid UTF-8.
        Exception: Whatever ``Settings`` raises when validation fails,
            e.g. when ``featherless_api_key`` is absent.
    """
    config_path = Path(os.environ.get("CHAT_CONFIG_PATH", DEFAULT_CONFIG))

    raw: dict = {}
    if config_path.exists():
        # TOML documents are defined as UTF-8. Decode explicitly instead
        # of relying on the platform's locale encoding, which is not
        # UTF-8 on every system and would corrupt or reject non-ASCII
        # values in the config.
        raw = tomllib.loads(config_path.read_text(encoding="utf-8"))

    # Environment variables take precedence over the config file.
    if "CHAT_DB_PATH" in os.environ:
        raw["db_path"] = Path(os.environ["CHAT_DB_PATH"])

    if "CHAT_DATA_DIR" in os.environ:
        raw["data_dir"] = Path(os.environ["CHAT_DATA_DIR"])
    elif "data_dir" not in raw and "db_path" in raw:
        # T31: when ``CHAT_DB_PATH`` is overridden (typical in tests) but
        # ``data_dir`` isn't, derive ``data_dir`` from the db's parent so
        # snapshot/auxiliary files stay alongside the test db rather than
        # leaking into the real repo data dir.
        raw["data_dir"] = Path(raw["db_path"]).parent

    return Settings(**raw)
|