diff --git a/chat/config.py b/chat/config.py index 332c95c..1af77ca 100644 --- a/chat/config.py +++ b/chat/config.py @@ -25,15 +25,20 @@ class Settings(BaseModel): narrative_budget_soft: int = 6000 # Cap on each generated bot response. The asterisk-action format # (see ``_closing_instruction`` in chat/services/prompt.py) targets - # 2-4 short interleaved action+dialogue beats; ~250 tokens fits that - # without leaving room for the model to drift into multi-paragraph - # inner-monologue prose. Bump back up if you want longer scenes; - # drop to 150 for very terse banter. - narrative_max_tokens: int = 250 + # 2-3 short interleaved action+dialogue beats. Verbose roleplay + # narrators (Cydonia, Magnum) ignore the prompt's cap and keep + # going; ``trim_to_max_beats`` in chat/services/prompt.py handles + # the actual cap by trimming at a beat boundary post-stream. This + # max_tokens setting just gives the third beat enough room to + # complete naturally before max_tokens cuts mid-action: 160 fits + # 3 substantive beats with margin. Bump to 250 for longer beats (the + # beat count stays capped by the trim); drop to 80 for terse banter. + narrative_max_tokens: int = 160 # Sampling temperature for narrative generation. 0.7 = grounded / - # consistent; 0.85 = creative-but-in-character (default); 1.0 = wide - # variety, can drift; >1.0 = often off-the-rails. - narrative_temperature: float = 0.85 + # instruction-compliant (current — Cydonia is verbose-by-default and + # a lower temperature helps it respect the 2-3-beat cap); + # 0.85 = creative; 1.0 = wide variety; >1.0 = often off-the-rails. + narrative_temperature: float = 0.7 classifier_budget_hard: int = 4000 classifier_timeout_s: float = 30.0 # Featherless free tier and lower paid tiers cap concurrent connections. 
diff --git a/chat/services/prompt.py b/chat/services/prompt.py index cae9fee..e313337 100644 --- a/chat/services/prompt.py +++ b/chat/services/prompt.py @@ -325,6 +325,36 @@ def _build_open_threads_block(threads: list[dict]) -> str | None: return "\n".join(lines) +def trim_to_max_beats(text: str, max_beats: int = 3) -> str: + """Truncate ``text`` to at most ``max_beats`` asterisk-action beats. + + A "beat" is one ``*action*`` markdown-italic block plus the dialogue + that follows it; counting ``*`` characters works as a deterministic + boundary detector since each complete beat contributes exactly two + asterisks (open + close). The (2*max_beats + 1)th asterisk is the + opening of an over-the-cap beat; we trim immediately before it and + strip trailing whitespace. + + Belt-and-suspenders for verbose roleplay-tuned narrators (Cydonia, + Magnum, etc.) that reliably ignore "HARD CAP: 2-3 beats" prompt + instructions and keep going. A physical max_tokens cap helps but + truncates mid-word; this trims at a beat boundary instead. + + Idempotent and safe on outputs with fewer beats than the cap (just + returns the text unchanged after a single pass). + """ + if max_beats <= 0: + return "" + target = max_beats * 2 + count = 0 + for i, ch in enumerate(text): + if ch == "*": + count += 1 + if count > target: + return text[:i].rstrip() + return text + + def _closing_instruction(speaker_name: str, addressee_name: str) -> str: return ( f"Continue as {speaker_name}. Format strictly:\n" @@ -333,17 +363,21 @@ def _closing_instruction(speaker_name: str, addressee_name: str) -> str: "thoughts inside asterisks.\n" "- Speak dialogue as plain text between action beats, no quote " "marks. Keep speech fragmented, not paragraphs.\n" - "- Interleave 2-4 short beats (action, brief speech, action, brief " - "speech). Each beat is one concrete gesture or sensory image — no " + "- HARD CAP: 2-3 beats per response. A beat is one *asterisk " + "action* paired with a short dialogue fragment. 
After the " + "third beat, STOP — do not add a fourth, do not summarize, do " + f"not narrate {addressee_name}'s reaction. Long responses break " + "the scene's rhythm.\n" + "- Each beat is one concrete gesture or sensory image. No " "explanation, no inner monologue, no stage-direction adverbs.\n" "- Trailing ellipses (...) are fine for emotional weight.\n" - "Example: *She turns with soapy hands to cup your face* That's how " - "I know it's real... *She kisses you softly* You love me when I'm " - "messy... *She rests her forehead against yours* ...and every " - "moment in between.\n" + "EXAMPLE (3 beats, stops cleanly):\n" + "*She turns with soapy hands to cup your face* That's how I know " + "it's real... *She kisses you softly* You love me when I'm messy... " + "*She smiles tearfully* ...and every moment in between.\n" f"Show only what {addressee_name} could externally observe of " - f"{speaker_name}; never narrate {addressee_name}'s actions or " - "thoughts. One response — leave room to react." + f"{speaker_name}; never narrate {addressee_name}'s actions, " + "thoughts, or speech. One response — leave room to react." ) diff --git a/chat/web/turns.py b/chat/web/turns.py index 623390d..9b3076b 100644 --- a/chat/web/turns.py +++ b/chat/web/turns.py @@ -67,6 +67,7 @@ from chat.services.multi_state_update import compute_state_updates_for_present from chat.services.prompt import ( assemble_narrative_prompt, consume_pending_meanwhile_digests, + trim_to_max_beats, ) from chat.services.rewind import compute_rewind_preview, execute_rewind from chat.services.scene_close import detect_scene_close @@ -482,6 +483,11 @@ async def post_turn( _in_flight_tasks.pop(chat_id, None) primary_text = "".join(primary_accumulated) + # Belt-and-suspenders: trim to 3 beats max even if the model + # ignored the "HARD CAP: 2-3 beats" prompt instruction. Roleplay- + # tuned narrators are reliably verbose; a physical max_tokens + # cap truncates mid-word; this trims at a beat boundary. 
+ primary_text = trim_to_max_beats(primary_text, max_beats=3) # 7. Append the assistant_turn with the final text. (See note above on # why we skip ``project`` for these transcript-only event kinds.) @@ -677,6 +683,10 @@ async def post_turn( _in_flight_tasks.pop(chat_id, None) interjection_text = "".join(interject_accumulated) + # Same beat-cap as the primary turn — interjections are + # by definition short, but Cydonia-class narrators ignore + # that. 2 beats is plenty for a chime-in. + interjection_text = trim_to_max_beats(interjection_text, max_beats=2) # Capture the event id (T86 follow-up) so the SSE fragment # below carries ``id="turn-"`` for in-place swap. diff --git a/tests/test_prompt.py b/tests/test_prompt.py index be12271..721923f 100644 --- a/tests/test_prompt.py +++ b/tests/test_prompt.py @@ -21,7 +21,11 @@ import chat.state.world # noqa: F401 import chat.state.events # noqa: F401 import chat.state.threads # noqa: F401 from chat.llm.client import Message -from chat.services.prompt import _witness_role_for, assemble_narrative_prompt +from chat.services.prompt import ( + _witness_role_for, + assemble_narrative_prompt, + trim_to_max_beats, +) def _seed_basic(conn) -> None: @@ -569,8 +573,8 @@ def test_tight_budget_drops_guest_activity_bullet_first(tmp_path): # (Phase 4.6 narrative-style fix). Budget bumped enough to # accommodate the larger MUST floor while still exercising # the SHOULD-tier trim path. - budget_soft=440, - budget_hard=460, + budget_soft=480, + budget_hard=510, ) body = msgs[0].content # Speaker bullet survives (MUST-tier floor). @@ -758,8 +762,8 @@ def test_assemble_with_tight_budget_drops_guest_activity_first(tmp_path): # (Phase 4.6 narrative-style fix). Budget bumped enough to # accommodate the larger MUST floor while still exercising # the SHOULD-tier trim path. - budget_soft=440, - budget_hard=460, + budget_soft=480, + budget_hard=510, ) body = msgs[0].content # MUST: speaker identity, edge to addressee, last 4 dialogue turns. 
@@ -773,7 +777,7 @@ def test_assemble_with_tight_budget_drops_guest_activity_first(tmp_path): # instruction that ships the asterisk-format spec. import tiktoken enc = tiktoken.get_encoding("cl100k_base") - assert len(enc.encode(body)) <= 460 + assert len(enc.encode(body)) <= 510 # --------------------------------------------------------------------------- @@ -870,3 +874,44 @@ def test_witness_role_for_none_host_returns_host(): # Sanity check: existing semantics preserved. assert _witness_role_for("bot_a", "bot_a") == "host" assert _witness_role_for("bot_a", "bot_b") == "guest" + + +# --------------------------------------------------------------------------- +# trim_to_max_beats — caps verbose narrative output to N beats +# --------------------------------------------------------------------------- + + +def test_trim_to_max_beats_passthrough_when_under_cap(): + assert trim_to_max_beats("", 3) == "" + assert trim_to_max_beats("plain text", 3) == "plain text" + two = "*She nods* okay. *She turns* see you." + assert trim_to_max_beats(two, 3) == two + + +def test_trim_to_max_beats_passthrough_at_exactly_cap(): + three = "*A* one. *B* two. *C* three." + assert trim_to_max_beats(three, 3) == three + + +def test_trim_to_max_beats_cuts_at_fourth_beat(): + """Cydonia-style 4-beat output trimmed at the start of the 4th + asterisk action; trailing whitespace stripped.""" + four = "*A* one. *B* two. *C* three. *D* four." + assert trim_to_max_beats(four, 3) == "*A* one. *B* two. *C* three." + + +def test_trim_to_max_beats_handles_runaway_six_beats(): + """The exact failure mode that motivated this — verbose narrator + rambling for 6 beats when the prompt asked for 2-3.""" + six = "*A* 1 *B* 2 *C* 3 *D* 4 *E* 5 *F* 6" + assert trim_to_max_beats(six, 3) == "*A* 1 *B* 2 *C* 3" + + +def test_trim_to_max_beats_respects_lower_cap(): + four = "*A* one. *B* two. *C* three. *D* four." + assert trim_to_max_beats(four, 2) == "*A* one. *B* two." 
+ assert trim_to_max_beats(four, 1) == "*A* one." + + +def test_trim_to_max_beats_zero_returns_empty(): + assert trim_to_max_beats("*A* one. *B* two.", 0) == ""