fix: parse_turn falls back gracefully + classify logs flapping classifiers

The turn endpoint was 500ing in multi-bot scenes whenever the
classifier provider hiccuped on parse_turn — particularly visible
after a guest was added and bots started exchanging turns. The
traceback was 'classify failed for schema ParsedTurn with no default'
because parse_turn was the only classify caller without a default.

Two changes:

- chat/services/turn_parse.py: parse_turn now passes a default that
  wraps the whole prose as one 'dialogue' segment. The narrative
  still fires on the prose; we lose finer-grained segment kinds
  (action vs dialogue vs ooc) on this turn, but the request returns
  cleanly. Updated the existing test that pinned the old
  RuntimeError contract.

- chat/llm/classify.py: when retries are exhausted, log a WARNING
  with the schema name, last error type, and a snippet of the last
  raw text the model returned. Surfaces flapping classifiers in the
  uvicorn log for diagnosis without taking down the request.

Suite: 471 passed in 11.7s.
This commit is contained in:
Joseph Doherty
2026-04-27 15:07:39 -04:00
parent a11255a5e6
commit 49be3cf4b9
3 changed files with 53 additions and 11 deletions
+18 -1
View File
@@ -1,11 +1,13 @@
from __future__ import annotations from __future__ import annotations
import json import json
import asyncio import asyncio
import logging
from typing import TypeVar from typing import TypeVar
from pydantic import BaseModel, ValidationError from pydantic import BaseModel, ValidationError
from .client import LLMClient, Message from .client import LLMClient, Message
T = TypeVar("T", bound=BaseModel) T = TypeVar("T", bound=BaseModel)
_log = logging.getLogger(__name__)
REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai") REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
@@ -47,6 +49,8 @@ async def classify(
# several seconds on tokens we'll never use. Classifier responses # several seconds on tokens we'll never use. Classifier responses
# are small JSON objects — 512 tokens is generous; usual completions # are small JSON objects — 512 tokens is generous; usual completions
# are 50-150. # are 50-150.
last_text = None
last_error: BaseException | None = None
for attempt in range(3): for attempt in range(3):
try: try:
text = await asyncio.wait_for( text = await asyncio.wait_for(
@@ -58,16 +62,29 @@ async def classify(
), ),
timeout=timeout_s, timeout=timeout_s,
) )
last_text = text
cleaned = _strip_json_fences(text) cleaned = _strip_json_fences(text)
if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"): if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
raise ValueError("refusal-shaped response") raise ValueError("refusal-shaped response")
return schema.model_validate_json(cleaned) return schema.model_validate_json(cleaned)
except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError): except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError) as exc:
last_error = exc
msgs[0] = Message( msgs[0] = Message(
role="system", role="system",
content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.", content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
) )
continue continue
# Log when we're falling back so flapping classifiers are
# diagnosable without taking down the request.
snippet = (last_text or "")[:200].replace("\n", " ")
_log.warning(
"classify(%s) exhausted 3 attempts; last_error=%s last_text=%r; "
"falling back to %s",
schema.__name__,
type(last_error).__name__ if last_error else "?",
snippet,
"default" if default is not None else "RuntimeError (no default)",
)
if default is None: if default is None:
raise RuntimeError(f"classify failed for schema {schema.__name__} with no default") raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
return default return default
+14 -3
View File
@@ -107,13 +107,23 @@ async def parse_turn(
without an LLM call (the classifier would error on empty input without an LLM call (the classifier would error on empty input
anyway, and the result is unambiguous). anyway, and the result is unambiguous).
Raises ``RuntimeError`` if the classifier fails twice — no default Falls back to a single dialogue-shaped segment containing the
is supplied, since the caller (T19's turn flow) is responsible for whole prose if the classifier flaps after retries — the turn flow
surfacing the error to the user. can keep moving (the narrative will still fire on the prose) at
the cost of finer-grained segment classification. The original
code raised ``RuntimeError`` here, which 500'd the whole request
and was particularly painful in multi-bot scenes where every
user turn paid the classifier round-trip.
""" """
if not prose.strip(): if not prose.strip():
return ParsedTurn(segments=[]) return ParsedTurn(segments=[])
fallback = ParsedTurn(
segments=[TurnSegment(kind="dialogue", text=prose)],
intent="narrative",
landing_state_hint="",
)
user_prompt = f"INPUT:\n{prose}" user_prompt = f"INPUT:\n{prose}"
return await classify( return await classify(
client, client,
@@ -121,5 +131,6 @@ async def parse_turn(
system=_SYSTEM_PROMPT, system=_SYSTEM_PROMPT,
user=user_prompt, user=user_prompt,
schema=ParsedTurn, schema=ParsedTurn,
default=fallback,
timeout_s=timeout_s, timeout_s=timeout_s,
) )
+21 -7
View File
@@ -73,11 +73,25 @@ async def test_parse_turn_empty_prose_short_circuits_without_classifier_call():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_turn_raises_when_classifier_fails_twice(): async def test_parse_turn_falls_back_to_whole_prose_when_classifier_fails():
"""A flapping classifier (3 invalid responses) no longer 500s the
request. ``parse_turn`` returns the original prose as a single
``dialogue`` segment so the turn flow can keep moving — the
narrative will still fire on the prose, just without finer-grained
segment classification.
The old contract was ``RuntimeError`` (no default), but in
production that took down the whole turn endpoint with a 500 the
moment any classifier provider hiccuped — particularly painful in
multi-bot scenes where every user turn pays the parse_turn cost.
"""
mock = MockLLMClient(canned=["nope", "still nope", "nope3"]) mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
with pytest.raises(RuntimeError): result = await parse_turn(
await parse_turn( mock,
mock, model="m",
model="m", prose='*shrugs* "whatever"',
prose='*shrugs* "whatever"', )
) assert len(result.segments) == 1
assert result.segments[0].kind == "dialogue"
assert result.segments[0].text == '*shrugs* "whatever"'
assert result.intent == "narrative"