fix: parse_turn falls back gracefully + classify logs flapping classifiers

The turn endpoint was 500ing in multi-bot scenes whenever the classifier provider hiccuped on parse_turn — particularly visible after a guest was added and bots started exchanging turns. The traceback was 'classify failed for schema ParsedTurn with no default' because parse_turn was the only classify caller without a default. Two changes: - chat/services/turn_parse.py: parse_turn now passes a default that wraps the whole prose as one 'dialogue' segment. The narrative still fires on the prose; we lose finer-grained segment kinds (action vs dialogue vs ooc) on this turn, but the request returns cleanly. Updated the existing test that pinned the old RuntimeError contract. - chat/llm/classify.py: when retries are exhausted, log a WARNING with the schema name, last error type, and a snippet of the last raw text the model returned. Surfaces flapping classifiers in the uvicorn log for diagnosis without taking down the request. Suite: 471 passed in 11.7s.
2026-04-27 15:07:39 -04:00
parent a11255a5e6
commit 49be3cf4b9
3 changed files with 53 additions and 11 deletions
@@ -1,11 +1,13 @@
 from __future__ import annotations
 import json
 import asyncio
 import logging
 from typing import TypeVar
 from pydantic import BaseModel, ValidationError
 from .client import LLMClient, Message
 T = TypeVar("T", bound=BaseModel)
 _log = logging.getLogger(__name__)
 REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
@@ -47,6 +49,8 @@ async def classify(
    # several seconds on tokens we'll never use. Classifier responses
    # are small JSON objects — 512 tokens is generous; usual completions
    # are 50-150.
    last_text = None
    last_error: BaseException | None = None
    for attempt in range(3):
        try:
            text = await asyncio.wait_for(
@@ -58,16 +62,29 @@ async def classify(
                ),
                timeout=timeout_s,
            )
            last_text = text
            cleaned = _strip_json_fences(text)
            if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
                raise ValueError("refusal-shaped response")
            return schema.model_validate_json(cleaned)
-        except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError):
+        except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError) as exc:
            last_error = exc
            msgs[0] = Message(
                role="system",
                content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
            )
            continue
    # Log when we're falling back so flapping classifiers are
    # diagnosable without taking down the request.
    snippet = (last_text or "")[:200].replace("\n", " ")
    _log.warning(
        "classify(%s) exhausted 3 attempts; last_error=%s last_text=%r; "
        "falling back to %s",
        schema.__name__,
        type(last_error).__name__ if last_error else "?",
        snippet,
        "default" if default is not None else "RuntimeError (no default)",
    )
    if default is None:
        raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
    return default
@@ -107,13 +107,23 @@ async def parse_turn(
    without an LLM call (the classifier would error on empty input
    anyway, and the result is unambiguous).
-    Raises ``RuntimeError`` if the classifier fails twice — no default
+    Falls back to a single dialogue-shaped segment containing the
-    is supplied, since the caller (T19's turn flow) is responsible for
+    whole prose if the classifier flaps after retries — the turn flow
-    surfacing the error to the user.
+    can keep moving (the narrative will still fire on the prose) at
    the cost of finer-grained segment classification. The original
    code raised ``RuntimeError`` here, which 500'd the whole request
    and was particularly painful in multi-bot scenes where every
    user turn paid the classifier round-trip.
    """
    if not prose.strip():
        return ParsedTurn(segments=[])
    fallback = ParsedTurn(
        segments=[TurnSegment(kind="dialogue", text=prose)],
        intent="narrative",
        landing_state_hint="",
    )
    user_prompt = f"INPUT:\n{prose}"
    return await classify(
        client,
@@ -121,5 +131,6 @@ async def parse_turn(
        system=_SYSTEM_PROMPT,
        user=user_prompt,
        schema=ParsedTurn,
        default=fallback,
        timeout_s=timeout_s,
    )
@@ -73,11 +73,25 @@ async def test_parse_turn_empty_prose_short_circuits_without_classifier_call():
@pytest.mark.asyncio
-async def test_parse_turn_raises_when_classifier_fails_twice():
+async def test_parse_turn_falls_back_to_whole_prose_when_classifier_fails():
    """A flapping classifier (3 invalid responses) no longer 500s the
    request. ``parse_turn`` returns the original prose as a single
    ``dialogue`` segment so the turn flow can keep moving — the
    narrative will still fire on the prose, just without finer-grained
    segment classification.
    The old contract was ``RuntimeError`` (no default), but in
    production that took down the whole turn endpoint with a 500 the
    moment any classifier provider hiccuped — particularly painful in
    multi-bot scenes where every user turn pays the parse_turn cost.
    """
    mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
-    with pytest.raises(RuntimeError):
+    result = await parse_turn(
-        await parse_turn(
+        mock,
-            mock,
+        model="m",
-            model="m",
+        prose='*shrugs* "whatever"',
-            prose='*shrugs* "whatever"',
+    )
-        )
+    assert len(result.segments) == 1
    assert result.segments[0].kind == "dialogue"
    assert result.segments[0].text == '*shrugs* "whatever"'
    assert result.intent == "narrative"