# Commit d161e7b8e9
#
# Bot replies were running long (4 paragraphs of action+dialogue beats per
# turn) because we never set max_tokens on the narrative call.
#
# Three changes; the first two are tunable knobs in Settings (set them in
# data/config.toml to override):
#
# - narrative_max_tokens: int = 400
#   Hard cap on each generated response. ~400 tokens ≈ 1–2 short paragraphs.
#   Drop to 200 for terse banter, bump to 800+ for longer scenes.
# - narrative_temperature: float = 0.85
#   Sampling temperature. 0.7 = grounded/consistent (slightly stiff),
#   0.85 = creative-but-in-character (default), 1.0 = wide variety,
#   >1.0 = often off-the-rails.
# - The prompt's closing instruction now nudges: "Keep your response to a
#   single beat — one or two short paragraphs at most. Don't monologue; leave
#   room for the other person to react."
#
# Both turns.py (post_turn) and regenerate.py forward the params to
# client.stream(). FeatherlessClient already passes **params through to the
# OpenAI-compat endpoint.
#
# Note: temperature doesn't control length — that is a common misconception.
# max_tokens is the actual length cap. Lower temperature makes word choice
# more predictable (slightly stiffer voice), not shorter. Both knobs are
# useful for different goals.
#
# File at this commit: 59 lines, 2.4 KiB, Python.
from __future__ import annotations

import os
import tomllib
from pathlib import Path

from pydantic import BaseModel, Field


# Repository root: the directory two levels above this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Default locations used when no override is supplied: the TOML config file
# and the database file both live under <repo>/data.
DEFAULT_CONFIG = REPO_ROOT / "data" / "config.toml"
DEFAULT_DB = REPO_ROOT / "data" / "chat.db"


class Settings(BaseModel):
    """Application configuration, validated by pydantic.

    ``featherless_api_key`` is the only field without a default and must be
    supplied. Every other field falls back to the default declared here.

    Numeric tunables carry range constraints so that an unusable value
    (for example ``narrative_max_tokens = 0``) is rejected when the settings
    object is built instead of surfacing later as a failed request.
    """

    featherless_api_key: str
    featherless_base_url: str = "https://api.featherless.ai/v1"
    narrative_model: str = "dphn/Dolphin-Mistral-24B-Venice-Edition"
    classifier_model: str = "NousResearch/Hermes-3-Llama-3.1-8B"
    # A factory is used so each Settings instance gets its own list.
    # NOTE(review): presumably tried in order when classifier_model is
    # unavailable — confirm against the classifier client.
    classifier_fallbacks: list[str] = Field(
        default_factory=lambda: [
            "cognitivecomputations/dolphin-2.9.4-llama3-8b",
            "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated",
        ]
    )
    ooc_marker: str = "(("
    retrieval_k: int = 4
    narrative_budget_hard: int = 8000
    narrative_budget_soft: int = 6000
    # Cap on each generated bot response. ~400 tokens ≈ 1–2 short paragraphs.
    # Bump if you want longer scenes; drop to 200 for terse banter.
    # Must be positive: a zero or negative cap cannot produce a reply.
    narrative_max_tokens: int = Field(default=400, gt=0)
    # Sampling temperature for narrative generation. 0.7 = grounded /
    # consistent; 0.85 = creative-but-in-character (default); 1.0 = wide
    # variety, can drift; >1.0 = often off-the-rails.
    # Negative temperatures are meaningless, so they are rejected here.
    narrative_temperature: float = Field(default=0.85, ge=0.0)
    classifier_budget_hard: int = 4000
    # Timeout in seconds (per the ``_s`` suffix); must be positive.
    classifier_timeout_s: float = Field(default=30.0, gt=0)
    # Featherless free tier and lower paid tiers cap concurrent connections.
    # Set this to your account's max-concurrent-connections limit.
    # At least one connection is required to make any request at all.
    featherless_max_concurrent: int = Field(default=2, ge=1)
    db_path: Path = DEFAULT_DB
    data_dir: Path = REPO_ROOT / "data"
    bind_host: str = "127.0.0.1"
    # Valid TCP port range; 0 is kept legal (OS-assigned port).
    bind_port: int = Field(default=8000, ge=0, le=65535)


def load_settings() -> Settings:
    """Build :class:`Settings` from the TOML config file and the environment.

    Sources, from lowest to highest precedence:

    1. Field defaults declared on ``Settings``.
    2. The TOML file named by ``CHAT_CONFIG_PATH`` (``DEFAULT_CONFIG`` when
       the variable is unset). A path that does not exist is treated as an
       empty config.
    3. ``CHAT_DB_PATH`` and ``CHAT_DATA_DIR`` environment variables, which
       override ``db_path`` and ``data_dir`` respectively.

    When ``data_dir`` is given by neither the config file nor
    ``CHAT_DATA_DIR`` but ``db_path`` is set (by the environment *or* the
    config file), ``data_dir`` is derived as the parent directory of
    ``db_path``.

    Returns:
        The validated settings object.

    Raises:
        tomllib.TOMLDecodeError: If the config file is not valid TOML.
        Exception: Whatever ``Settings(**raw)`` raises for missing or invalid
            values (e.g. no ``featherless_api_key`` provided).
    """
    config_path = Path(os.environ.get("CHAT_CONFIG_PATH", DEFAULT_CONFIG))

    raw: dict = {}
    if config_path.exists():
        # TOML is UTF-8 by specification. Reading in binary mode lets tomllib
        # do the decoding, instead of relying on the locale's default
        # encoding (which breaks non-ASCII values on non-UTF-8 locales).
        with config_path.open("rb") as config_file:
            raw = tomllib.load(config_file)

    if "CHAT_DB_PATH" in os.environ:
        raw["db_path"] = Path(os.environ["CHAT_DB_PATH"])

    if "CHAT_DATA_DIR" in os.environ:
        raw["data_dir"] = Path(os.environ["CHAT_DATA_DIR"])
    elif "data_dir" not in raw and "db_path" in raw:
        # T31: when ``CHAT_DB_PATH`` is overridden (typical in tests) but
        # ``data_dir`` isn't, derive ``data_dir`` from the db's parent so
        # snapshot/auxiliary files stay alongside the test db rather than
        # leaking into the real repo data dir.
        raw["data_dir"] = Path(raw["db_path"]).parent

    return Settings(**raw)