from __future__ import annotations import os import tomllib from pathlib import Path from pydantic import BaseModel, Field REPO_ROOT = Path(__file__).resolve().parent.parent DEFAULT_CONFIG = REPO_ROOT / "data" / "config.toml" DEFAULT_DB = REPO_ROOT / "data" / "chat.db" class Settings(BaseModel): featherless_api_key: str featherless_base_url: str = "https://api.featherless.ai/v1" narrative_model: str = "dphn/Dolphin-Mistral-24B-Venice-Edition" classifier_model: str = "NousResearch/Hermes-3-Llama-3.1-8B" classifier_fallbacks: list[str] = Field( default_factory=lambda: [ "cognitivecomputations/dolphin-2.9.4-llama3-8b", "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", ] ) ooc_marker: str = "((" retrieval_k: int = 4 narrative_budget_hard: int = 8000 narrative_budget_soft: int = 6000 # Cap on each generated bot response. The asterisk-action format # (see ``_closing_instruction`` in chat/services/prompt.py) targets # 2-4 short interleaved action+dialogue beats; ~250 tokens fits that # without leaving room for the model to drift into multi-paragraph # inner-monologue prose. Bump back up if you want longer scenes; # drop to 150 for very terse banter. narrative_max_tokens: int = 250 # Sampling temperature for narrative generation. 0.7 = grounded / # consistent; 0.85 = creative-but-in-character (default); 1.0 = wide # variety, can drift; >1.0 = often off-the-rails. narrative_temperature: float = 0.85 classifier_budget_hard: int = 4000 classifier_timeout_s: float = 30.0 # Featherless free tier and lower paid tiers cap concurrent connections. # Set this to your account's max-concurrent-connections limit. featherless_max_concurrent: int = 2 db_path: Path = DEFAULT_DB data_dir: Path = REPO_ROOT / "data" bind_host: str = "127.0.0.1" bind_port: int = 8000 # Local MLX server (e.g. ``mlx-omni-server``) — serves any model # whose id starts with one of ``local_prefixes`` (default # ``"mlx-community/"``). The :class:`RoutedLLMClient` inspects the # ``model`` kwarg at call time: local-prefix -> local, else -> remote. # ``embed()`` always routes local. local_mlx_base_url: str = "http://127.0.0.1:10240/v1" local_mlx_max_concurrent: int = 1 # Optional OpenRouter-style provider pinning for the classifier # client. Maps to the ``provider`` field on chat.completions.create # via ``extra_body``; the FeatherlessClient (which is just an # AsyncOpenAI wrapper) merges it into every call. Useful for forcing # Llama-3.1-8B classifier traffic onto Cerebras (~423 tok/s, 10x # the default Nebius). Empty list = no pin (provider is # OpenRouter's choice). classifier_provider_order: list[str] = Field(default_factory=list) # T112 (Phase 4.5): embedding model identifier. Default is the # deterministic local pseudo so fresh installs / tests don't need # any external infra. Override via config.toml to a real model id # (e.g. ``"mlx-community/bge-small-en-v1.5-bf16"``) once a local # MLX server is running. embedding_model: str = "pseudo-sha256-384" def load_settings() -> Settings: config_path = Path(os.environ.get("CHAT_CONFIG_PATH", DEFAULT_CONFIG)) raw: dict = {} if config_path.exists(): raw = tomllib.loads(config_path.read_text()) if "CHAT_DB_PATH" in os.environ: raw["db_path"] = Path(os.environ["CHAT_DB_PATH"]) if "CHAT_DATA_DIR" in os.environ: raw["data_dir"] = Path(os.environ["CHAT_DATA_DIR"]) elif "data_dir" not in raw and "db_path" in raw: # T31: when ``CHAT_DB_PATH`` is overridden (typical in tests) but # ``data_dir`` isn't, derive ``data_dir`` from the db's parent so # snapshot/auxiliary files stay alongside the test db rather than # leaking into the real repo data dir. raw["data_dir"] = Path(raw["db_path"]).parent return Settings(**raw)