merge: T93 cross-chat search service
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
"""Cross-chat search service (T93, Phase 4).
|
||||
|
||||
FTS5-based search across ALL owners and ALL chats. Used by the
|
||||
top-bar search UX (T100) for "where did I last see this character
|
||||
mention X?" queries. NO witness filter -- this is intentionally a
|
||||
power-user surface that surfaces memories across POVs.
|
||||
|
||||
Mirrors the FTS5 access pattern of ``chat.state.memory.search_memories``
|
||||
but drops both the ``owner_id = ?`` and the per-witness predicates so a
|
||||
single query can sweep every chat in the database. The composite
|
||||
re-rank is also dropped: callers want raw BM25 ordering for the
|
||||
"highest match strength wins" semantics expected of a global search box.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from sqlite3 import Connection
|
||||
|
||||
|
||||
def search_all_memories(
|
||||
conn: Connection,
|
||||
*,
|
||||
query: str,
|
||||
k: int = 20,
|
||||
) -> list[dict]:
|
||||
"""Search FTS5 across all owners and chats.
|
||||
|
||||
Returns rows with ``{memory_id, owner_id, chat_id, scene_id,
|
||||
pov_summary, significance, ts, fts_rank}``, sorted by FTS5 BM25
|
||||
rank ascending (lower rank = stronger match, surfaced first).
|
||||
|
||||
The ``memories`` table has no ``ts`` column; we expose ``created_at``
|
||||
(the projector-side row insertion timestamp) under that key so the
|
||||
UI does not have to know the storage name.
|
||||
|
||||
An empty / whitespace-only ``query`` short-circuits to ``[]`` to
|
||||
avoid an FTS5 ``MATCH ''`` syntax error and to keep the top-bar
|
||||
"no input yet" state from triggering a full-table scan.
|
||||
"""
|
||||
if not query or not query.strip():
|
||||
return []
|
||||
|
||||
# FTS5 MATCH against the same ``memories_fts`` virtual table that
|
||||
# backs ``state.memory.search_memories``; the JOIN pulls metadata
|
||||
# from the content table because the FTS index only stores
|
||||
# ``pov_summary``. ORDER BY rank ASC because BM25 in FTS5 returns
|
||||
# negative scores where lower is better.
|
||||
rows = conn.execute(
|
||||
"SELECT m.id, m.owner_id, m.chat_id, m.scene_id, "
|
||||
" m.pov_summary, m.significance, m.created_at, "
|
||||
" memories_fts.rank "
|
||||
"FROM memories_fts "
|
||||
"JOIN memories m ON m.id = memories_fts.rowid "
|
||||
"WHERE memories_fts MATCH ? "
|
||||
"ORDER BY memories_fts.rank ASC "
|
||||
"LIMIT ?",
|
||||
(query.strip(), k),
|
||||
).fetchall()
|
||||
|
||||
return [
|
||||
{
|
||||
"memory_id": r[0],
|
||||
"owner_id": r[1],
|
||||
"chat_id": r[2],
|
||||
"scene_id": r[3],
|
||||
"pov_summary": r[4],
|
||||
"significance": r[5],
|
||||
"ts": r[6],
|
||||
"fts_rank": r[7],
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
|
||||
|
||||
__all__ = ["search_all_memories"]
|
||||
@@ -0,0 +1,155 @@
|
||||
"""T93 (Phase 4): cross-chat FTS5 search across all owners and chats.
|
||||
|
||||
Verifies that ``chat.services.cross_chat_search.search_all_memories``:
|
||||
* surfaces matches across multiple owner_ids (the per-owner restriction
|
||||
used by ``state.memory.search_memories`` is intentionally absent),
|
||||
* applies no witness filter (admin/power-user surface),
|
||||
* orders results by FTS5 BM25 rank (lower = stronger match, surfaced
|
||||
first), and
|
||||
* honours the ``k`` LIMIT and the empty-query fast-path.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from chat.db.connection import open_db
|
||||
from chat.db.migrate import apply_migrations
|
||||
from chat.eventlog.log import append_event
|
||||
from chat.eventlog.projector import project
|
||||
from chat.services.cross_chat_search import search_all_memories
|
||||
import chat.state.memory # noqa: F401 (registers memory_written handler)
|
||||
|
||||
|
||||
def _seed(db, *, memory_specs):
|
||||
"""Apply migrations + project a list of memory_written events."""
|
||||
apply_migrations(db)
|
||||
with open_db(db) as conn:
|
||||
for spec in memory_specs:
|
||||
payload = {
|
||||
"owner_id": spec.get("owner_id", "bot_a"),
|
||||
"chat_id": spec.get("chat_id", "chat_bot_a"),
|
||||
"pov_summary": spec["pov_summary"],
|
||||
"witness_you": spec.get("witness_you", 1),
|
||||
"witness_host": spec.get("witness_host", 1),
|
||||
"witness_guest": spec.get("witness_guest", 0),
|
||||
"source": "direct",
|
||||
"reliability": 1.0,
|
||||
"significance": spec.get("significance", 1),
|
||||
"pinned": 0,
|
||||
"auto_pinned": 0,
|
||||
}
|
||||
append_event(conn, kind="memory_written", payload=payload)
|
||||
project(conn)
|
||||
|
||||
|
||||
def test_search_all_memories_returns_matches_across_owners(tmp_path):
|
||||
"""Cross-owner: a single query must surface memories from every owner.
|
||||
|
||||
The per-owner ``owner_id = ?`` predicate that ``search_memories`` uses
|
||||
is intentionally absent here, so a "rabbit" memory under ``bot_a`` and
|
||||
one under ``bot_b`` should both come back from a single call.
|
||||
"""
|
||||
db = tmp_path / "t.db"
|
||||
_seed(
|
||||
db,
|
||||
memory_specs=[
|
||||
{
|
||||
"owner_id": "bot_a",
|
||||
"chat_id": "chat_bot_a",
|
||||
"pov_summary": "the rabbit darted into the brambles",
|
||||
},
|
||||
{
|
||||
"owner_id": "bot_b",
|
||||
"chat_id": "chat_bot_b",
|
||||
"pov_summary": "a white rabbit watched from the hedge",
|
||||
},
|
||||
# Distractor: must not appear for "rabbit".
|
||||
{
|
||||
"owner_id": "bot_a",
|
||||
"chat_id": "chat_bot_a",
|
||||
"pov_summary": "the kettle whistled",
|
||||
},
|
||||
],
|
||||
)
|
||||
with open_db(db) as conn:
|
||||
out = search_all_memories(conn, query="rabbit")
|
||||
owners = {row["owner_id"] for row in out}
|
||||
assert owners == {"bot_a", "bot_b"}
|
||||
assert len(out) == 2
|
||||
# Returned shape contract.
|
||||
for row in out:
|
||||
assert set(row.keys()) >= {
|
||||
"memory_id",
|
||||
"owner_id",
|
||||
"chat_id",
|
||||
"scene_id",
|
||||
"pov_summary",
|
||||
"significance",
|
||||
"ts",
|
||||
"fts_rank",
|
||||
}
|
||||
|
||||
|
||||
def test_search_all_memories_orders_by_fts_rank(tmp_path):
|
||||
"""Stronger BM25 match must come first (rank ASC = lower is better)."""
|
||||
db = tmp_path / "t.db"
|
||||
_seed(
|
||||
db,
|
||||
memory_specs=[
|
||||
# Single occurrence -> weaker BM25 score.
|
||||
{
|
||||
"owner_id": "bot_a",
|
||||
"chat_id": "chat_bot_a",
|
||||
"pov_summary": "a rabbit appeared",
|
||||
},
|
||||
# Triple occurrence in a short row -> stronger BM25 score.
|
||||
{
|
||||
"owner_id": "bot_b",
|
||||
"chat_id": "chat_bot_b",
|
||||
"pov_summary": "rabbit rabbit rabbit",
|
||||
},
|
||||
],
|
||||
)
|
||||
with open_db(db) as conn:
|
||||
out = search_all_memories(conn, query="rabbit", k=5)
|
||||
assert len(out) == 2
|
||||
# Stronger match first; fts_rank monotonically non-decreasing
|
||||
# (lower-is-better, so ASC).
|
||||
assert out[0]["pov_summary"] == "rabbit rabbit rabbit"
|
||||
assert out[0]["fts_rank"] <= out[1]["fts_rank"]
|
||||
|
||||
|
||||
def test_search_all_memories_respects_k_limit(tmp_path):
|
||||
"""LIMIT ? must cap result count even when more matches exist."""
|
||||
db = tmp_path / "t.db"
|
||||
_seed(
|
||||
db,
|
||||
memory_specs=[
|
||||
{
|
||||
"owner_id": f"bot_{i}",
|
||||
"chat_id": f"chat_{i}",
|
||||
"pov_summary": f"rabbit sighting number {i}",
|
||||
}
|
||||
for i in range(10)
|
||||
],
|
||||
)
|
||||
with open_db(db) as conn:
|
||||
out = search_all_memories(conn, query="rabbit", k=3)
|
||||
assert len(out) == 3
|
||||
|
||||
|
||||
def test_search_all_memories_empty_query_returns_empty(tmp_path):
|
||||
"""Empty / whitespace-only query must short-circuit to []."""
|
||||
db = tmp_path / "t.db"
|
||||
_seed(
|
||||
db,
|
||||
memory_specs=[
|
||||
{
|
||||
"owner_id": "bot_a",
|
||||
"chat_id": "chat_bot_a",
|
||||
"pov_summary": "the rabbit darted into the brambles",
|
||||
},
|
||||
],
|
||||
)
|
||||
with open_db(db) as conn:
|
||||
assert search_all_memories(conn, query="") == []
|
||||
assert search_all_memories(conn, query=" ") == []
|
||||
Reference in New Issue
Block a user