Files
chat/tests/test_kickoff.py
T
Joseph Doherty 5aab98e4d7 fix: classifier robustness — schema in prompt, retries, kickoff fallback
The kickoff parse-and-confirm route was 500-ing intermittently because
Hermes-3 + Featherless's response_format={"type":"json_object"} only
guarantees JSON output, NOT a particular schema. The model was inventing
its own field names (sceneTime, entities, settingDetails) instead of
the KickoffParse fields, causing Pydantic validation to fail on both
classify() retries.

Three changes:

1. Include the Pydantic JSON schema in the system prompt so the model
   knows exactly which keys to produce. Affects every classify() call
   (kickoff parse, turn parse, scene-close detect, significance,
   state-update, scene summarize). Strip ```json fences if the model
   wraps its output. Bump retries 2 → 3 (model is stochastic; one extra
   attempt closes most of the remaining gap).

2. parse_kickoff() now passes a default empty KickoffParse so the
   route degrades to a fillable form instead of 500 when the classifier
   ultimately fails. The confirm form is the human-in-the-loop; an
   empty form is strictly better UX than a stack trace.

3. Tests updated: bumped canned-failure arrays from 2 → 3 entries to
   match the new attempt count; renamed kickoff test from
   "raises_when_classifier_fails_twice" to
   "falls_back_to_empty_when_classifier_fails" reflecting the new
   degraded-but-usable behavior.

Verified live with all 3 sample bots (maya/eli/sam) — kickoff route
returns 200 across multiple attempts. Full suite: 168 passed.
2026-04-26 15:03:13 -04:00

143 lines
5.2 KiB
Python

import json
import pytest
from chat.llm.mock import MockLLMClient
from chat.services.kickoff import (
ActivityShape,
KickoffParse,
parse_kickoff,
)
def _full_kickoff_json() -> str:
    """Return a JSON string for a fully-populated kickoff payload.

    Every key is given an explicit value, including the ``attention`` and
    ``holding`` entries of both activities, so the string can be used as a
    canned classifier response in which nothing has to be defaulted.
    """
    return json.dumps(
        {
            "container_name": "office bullpen, late evening",
            "container_type": "office",
            "container_properties": {
                "moving": False,
                "public": False,
                "audible_range": "room",
            },
            "you_activity": {
                "posture": "sitting at your desk",
                "action_verb": "finishing emails",
                "action_interruptible": True,
                "action_required_attention": "low",
                "action_expected_duration": "15m",
                "attention": "the screen",
                "holding": ["coffee mug"],
            },
            "bot_activity": {
                "posture": "sitting at her desk",
                "action_verb": "pretending to work",
                "action_interruptible": True,
                "action_required_attention": "low",
                "action_expected_duration": "indefinite",
                "attention": "you, in glances",
                "holding": [],
            },
            "initial_time_iso": "2026-04-26T19:42:00",
            "edge_seed_summary": "coworkers; aware of each other; no shared history beyond the office",
            "edge_seed_knowledge_facts": [
                "they work on the same floor",
                "it is unusual to be the only two left",
            ],
        }
    )
@pytest.mark.asyncio
async def test_parse_kickoff_happy_path_populates_fields():
    """A fully-populated classifier response is parsed into a ``KickoffParse``.

    The mock client is seeded with the single payload built by
    ``_full_kickoff_json``; the assertions check that the container fields,
    both nested activities, the edge-seed data and the initial timestamp
    come back with the values that payload supplied.
    """
    mock = MockLLMClient(canned=[_full_kickoff_json()])
    result = await parse_kickoff(
        mock,
        model="m",
        bot_name="BotA",
        bot_persona="reserved colleague who quietly notices things",
        initial_relationship_to_you="coworker, slight crush, never voiced",
        kickoff_prose=(
            "you stay late at the office; only you and BotA are there; "
            "she's at her desk pretending to work"
        ),
        you_name="You",
    )
    assert isinstance(result, KickoffParse)
    assert result.container_name == "office bullpen, late evening"
    assert result.container_type == "office"
    assert isinstance(result.you_activity, ActivityShape)
    assert result.you_activity.posture == "sitting at your desk"
    assert result.bot_activity.action_verb == "pretending to work"
    assert result.edge_seed_summary.startswith("coworkers")
    assert "they work on the same floor" in result.edge_seed_knowledge_facts
    assert result.initial_time_iso == "2026-04-26T19:42:00"
@pytest.mark.asyncio
async def test_parse_kickoff_applies_activity_defaults_for_missing_fields():
    """Activities that omit ``attention`` and ``holding`` get empty defaults.

    The canned payload leaves both keys out of ``you_activity`` and
    ``bot_activity``. The parsed result is expected to expose ``""`` and
    ``[]`` for them, and the two ``holding`` lists must be distinct objects.
    """
    minimal_payload = {
        "container_name": "kitchen",
        "container_type": "kitchen",
        "container_properties": {},
        "you_activity": {
            "posture": "standing",
            "action_verb": "boiling water",
            "action_interruptible": True,
            "action_required_attention": "low",
            "action_expected_duration": "5m",
        },
        "bot_activity": {
            "posture": "leaning on the counter",
            "action_verb": "scrolling phone",
            "action_interruptible": True,
            "action_required_attention": "low",
            "action_expected_duration": "10m",
        },
        "initial_time_iso": "2026-04-26T08:00:00",
        "edge_seed_summary": "roommates",
        "edge_seed_knowledge_facts": [],
    }
    mock = MockLLMClient(canned=[json.dumps(minimal_payload)])
    result = await parse_kickoff(
        mock,
        model="m",
        bot_name="BotA",
        bot_persona="laid-back roommate",
        initial_relationship_to_you="roommates of two years",
        kickoff_prose="morning in the kitchen; you're making tea while BotA scrolls her phone",
        you_name="You",
    )
    assert result.you_activity.attention == ""
    assert result.you_activity.holding == []
    assert result.bot_activity.attention == ""
    assert result.bot_activity.holding == []
    # mutating one default must not leak into the other (default_factory check)
    result.you_activity.holding.append("kettle")
    assert result.bot_activity.holding == []
@pytest.mark.asyncio
async def test_parse_kickoff_falls_back_to_empty_when_classifier_fails():
    """When the classifier fails three times, return an empty KickoffParse
    instead of raising — the confirm form lets the user fill in by hand.

    The mock is seeded with three responses that are not valid JSON, one per
    attempt.
    """
    mock = MockLLMClient(canned=["nope", "still nope", "still bad"])
    result = await parse_kickoff(
        mock,
        model="m",
        bot_name="BotA",
        bot_persona="x",
        initial_relationship_to_you="y",
        kickoff_prose="z",
        you_name="You",
    )
    assert isinstance(result, KickoffParse)
    assert result.container_name == ""
    assert result.container_type == ""
    assert result.edge_seed_summary == ""
    assert result.edge_seed_knowledge_facts == []
    # Activity defaults sane (action_interruptible defaults to True so the
    # confirm form's checkbox is in a reasonable initial state).
    assert result.you_activity.action_interruptible is True
    assert result.bot_activity.action_interruptible is True