fix: parse_turn falls back gracefully + classify logs flapping classifiers
The turn endpoint was 500ing in multi-bot scenes whenever the classifier provider hiccuped on parse_turn — particularly visible after a guest was added and bots started exchanging turns. The traceback was 'classify failed for schema ParsedTurn with no default' because parse_turn was the only classify caller without a default. Two changes: - chat/services/turn_parse.py: parse_turn now passes a default that wraps the whole prose as one 'dialogue' segment. The narrative still fires on the prose; we lose finer-grained segment kinds (action vs dialogue vs ooc) on this turn, but the request returns cleanly. Updated the existing test that pinned the old RuntimeError contract. - chat/llm/classify.py: when retries are exhausted, log a WARNING with the schema name, last error type, and a snippet of the last raw text the model returned. Surfaces flapping classifiers in the uvicorn log for diagnosis without taking down the request. Suite: 471 passed in 11.7s.
This commit is contained in:
+18
-1
@@ -1,11 +1,13 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import logging
|
||||||
from typing import TypeVar
|
from typing import TypeVar
|
||||||
from pydantic import BaseModel, ValidationError
|
from pydantic import BaseModel, ValidationError
|
||||||
from .client import LLMClient, Message
|
from .client import LLMClient, Message
|
||||||
|
|
||||||
T = TypeVar("T", bound=BaseModel)
|
T = TypeVar("T", bound=BaseModel)
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
|
REFUSAL_PATTERNS = ("i can't", "i cannot", "i'm sorry, but", "as an ai")
|
||||||
|
|
||||||
@@ -47,6 +49,8 @@ async def classify(
|
|||||||
# several seconds on tokens we'll never use. Classifier responses
|
# several seconds on tokens we'll never use. Classifier responses
|
||||||
# are small JSON objects — 512 tokens is generous; usual completions
|
# are small JSON objects — 512 tokens is generous; usual completions
|
||||||
# are 50-150.
|
# are 50-150.
|
||||||
|
last_text = None
|
||||||
|
last_error: BaseException | None = None
|
||||||
for attempt in range(3):
|
for attempt in range(3):
|
||||||
try:
|
try:
|
||||||
text = await asyncio.wait_for(
|
text = await asyncio.wait_for(
|
||||||
@@ -58,16 +62,29 @@ async def classify(
|
|||||||
),
|
),
|
||||||
timeout=timeout_s,
|
timeout=timeout_s,
|
||||||
)
|
)
|
||||||
|
last_text = text
|
||||||
cleaned = _strip_json_fences(text)
|
cleaned = _strip_json_fences(text)
|
||||||
if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
|
if any(p in cleaned.lower()[:80] for p in REFUSAL_PATTERNS) and not cleaned.lstrip().startswith("{"):
|
||||||
raise ValueError("refusal-shaped response")
|
raise ValueError("refusal-shaped response")
|
||||||
return schema.model_validate_json(cleaned)
|
return schema.model_validate_json(cleaned)
|
||||||
except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError):
|
except (ValidationError, ValueError, json.JSONDecodeError, asyncio.TimeoutError) as exc:
|
||||||
|
last_error = exc
|
||||||
msgs[0] = Message(
|
msgs[0] = Message(
|
||||||
role="system",
|
role="system",
|
||||||
content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
|
content=system + schema_block + "\n\nRespond with valid JSON ONLY. No prose, no markdown fences.",
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
# Log when we're falling back so flapping classifiers are
|
||||||
|
# diagnosable without taking down the request.
|
||||||
|
snippet = (last_text or "")[:200].replace("\n", " ")
|
||||||
|
_log.warning(
|
||||||
|
"classify(%s) exhausted 3 attempts; last_error=%s last_text=%r; "
|
||||||
|
"falling back to %s",
|
||||||
|
schema.__name__,
|
||||||
|
type(last_error).__name__ if last_error else "?",
|
||||||
|
snippet,
|
||||||
|
"default" if default is not None else "RuntimeError (no default)",
|
||||||
|
)
|
||||||
if default is None:
|
if default is None:
|
||||||
raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
|
raise RuntimeError(f"classify failed for schema {schema.__name__} with no default")
|
||||||
return default
|
return default
|
||||||
|
|||||||
@@ -107,13 +107,23 @@ async def parse_turn(
|
|||||||
without an LLM call (the classifier would error on empty input
|
without an LLM call (the classifier would error on empty input
|
||||||
anyway, and the result is unambiguous).
|
anyway, and the result is unambiguous).
|
||||||
|
|
||||||
Raises ``RuntimeError`` if the classifier fails twice — no default
|
Falls back to a single dialogue-shaped segment containing the
|
||||||
is supplied, since the caller (T19's turn flow) is responsible for
|
whole prose if the classifier flaps after retries — the turn flow
|
||||||
surfacing the error to the user.
|
can keep moving (the narrative will still fire on the prose) at
|
||||||
|
the cost of finer-grained segment classification. The original
|
||||||
|
code raised ``RuntimeError`` here, which 500'd the whole request
|
||||||
|
and was particularly painful in multi-bot scenes where every
|
||||||
|
user turn paid the classifier round-trip.
|
||||||
"""
|
"""
|
||||||
if not prose.strip():
|
if not prose.strip():
|
||||||
return ParsedTurn(segments=[])
|
return ParsedTurn(segments=[])
|
||||||
|
|
||||||
|
fallback = ParsedTurn(
|
||||||
|
segments=[TurnSegment(kind="dialogue", text=prose)],
|
||||||
|
intent="narrative",
|
||||||
|
landing_state_hint="",
|
||||||
|
)
|
||||||
|
|
||||||
user_prompt = f"INPUT:\n{prose}"
|
user_prompt = f"INPUT:\n{prose}"
|
||||||
return await classify(
|
return await classify(
|
||||||
client,
|
client,
|
||||||
@@ -121,5 +131,6 @@ async def parse_turn(
|
|||||||
system=_SYSTEM_PROMPT,
|
system=_SYSTEM_PROMPT,
|
||||||
user=user_prompt,
|
user=user_prompt,
|
||||||
schema=ParsedTurn,
|
schema=ParsedTurn,
|
||||||
|
default=fallback,
|
||||||
timeout_s=timeout_s,
|
timeout_s=timeout_s,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -73,11 +73,25 @@ async def test_parse_turn_empty_prose_short_circuits_without_classifier_call():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_parse_turn_raises_when_classifier_fails_twice():
|
async def test_parse_turn_falls_back_to_whole_prose_when_classifier_fails():
|
||||||
|
"""A flapping classifier (3 invalid responses) no longer 500s the
|
||||||
|
request. ``parse_turn`` returns the original prose as a single
|
||||||
|
``dialogue`` segment so the turn flow can keep moving — the
|
||||||
|
narrative will still fire on the prose, just without finer-grained
|
||||||
|
segment classification.
|
||||||
|
|
||||||
|
The old contract was ``RuntimeError`` (no default), but in
|
||||||
|
production that took down the whole turn endpoint with a 500 the
|
||||||
|
moment any classifier provider hiccuped — particularly painful in
|
||||||
|
multi-bot scenes where every user turn pays the parse_turn cost.
|
||||||
|
"""
|
||||||
mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
|
mock = MockLLMClient(canned=["nope", "still nope", "nope3"])
|
||||||
with pytest.raises(RuntimeError):
|
result = await parse_turn(
|
||||||
await parse_turn(
|
mock,
|
||||||
mock,
|
model="m",
|
||||||
model="m",
|
prose='*shrugs* "whatever"',
|
||||||
prose='*shrugs* "whatever"',
|
)
|
||||||
)
|
assert len(result.segments) == 1
|
||||||
|
assert result.segments[0].kind == "dialogue"
|
||||||
|
assert result.segments[0].text == '*shrugs* "whatever"'
|
||||||
|
assert result.intent == "narrative"
|
||||||
|
|||||||
Reference in New Issue
Block a user