feat: FTS5 memory retrieval with witness filter and ranking boosts
This commit is contained in:
+46
-5
@@ -87,6 +87,14 @@ def get_pinned(conn: Connection, owner_id: str) -> list[dict]:
|
||||
return [dict(zip(cols, row)) for row in rows]
|
||||
|
||||
|
||||
# Composite-score weights used by ``search_memories`` (T23, §8 retrieval).
|
||||
# FTS5 BM25 ``rank`` is *more negative* for better matches, so subtracting a
|
||||
# positive boost from it makes a stronger candidate's score lower (i.e. earlier
|
||||
# in an ascending sort). Hardcoded for v1 — tunable in a later pass.
|
||||
_SIGNIFICANCE_WEIGHT = 0.3
|
||||
_RECENCY_WEIGHT = 0.5
|
||||
|
||||
|
||||
def search_memories(
|
||||
conn: Connection,
|
||||
owner_id: str,
|
||||
@@ -97,16 +105,32 @@ def search_memories(
|
||||
"""FTS5 search over pov_summary, scoped by owner and witness role.
|
||||
|
||||
witness_role must be one of {"you", "host", "guest"} per the witness flags
|
||||
on each memory row. Returns up to k rows ordered by FTS5 bm25 rank.
|
||||
on each memory row. Returns up to ``k`` rows ranked by a composite score
|
||||
that combines the FTS5 BM25 rank with two boosts (§8 retrieval rules):
|
||||
|
||||
* **significance boost** — ``0.3 * significance`` (0..3 per §11.1).
|
||||
* **recency boost** — ``0.5 * (id / max_id)``, using the row id as a
|
||||
monotonic recency proxy (``max_id`` is the owner's highest row id). Newer memories therefore tilt above older ones
|
||||
when the BM25 rank and significance are otherwise tied.
|
||||
|
||||
BM25 returns negative scores (lower = better). Both boosts are subtracted
|
||||
so that stronger candidates yield smaller composite scores; the result is
|
||||
sorted ascending and truncated to ``k``. The unmodified ``fts_rank`` and a
|
||||
debug-friendly ``composite_score`` are kept on each returned dict.
|
||||
"""
|
||||
if witness_role not in _VALID_WITNESS_ROLES:
|
||||
raise ValueError(
|
||||
f"witness_role must be one of {sorted(_VALID_WITNESS_ROLES)}, "
|
||||
f"got {witness_role!r}"
|
||||
)
|
||||
if not query.strip():
|
||||
return []
|
||||
witness_col = f"witness_{witness_role}"
|
||||
cols = [c[1] for c in conn.execute("PRAGMA table_info(memories)").fetchall()]
|
||||
select_list = ", ".join(f"m.{c}" for c in cols)
|
||||
# Over-fetch from FTS so the Python-side re-rank has room to reorder
|
||||
# results that BM25 alone would have demoted past the top-k boundary.
|
||||
over_fetch = max(k * 4, 20)
|
||||
sql = (
|
||||
f"SELECT {select_list}, memories_fts.rank AS fts_rank "
|
||||
"FROM memories_fts "
|
||||
@@ -116,10 +140,27 @@ def search_memories(
|
||||
"ORDER BY memories_fts.rank "
|
||||
"LIMIT ?"
|
||||
)
|
||||
cur = conn.execute(sql, (owner_id, query, k))
|
||||
cur = conn.execute(sql, (owner_id, query, over_fetch))
|
||||
rows = cur.fetchall()
|
||||
out: list[dict] = []
|
||||
if not rows:
|
||||
return []
|
||||
|
||||
# Recency normalises against the current max id for this owner so the
|
||||
# boost magnitude is bounded regardless of dataset size.
|
||||
max_id_row = conn.execute(
|
||||
"SELECT MAX(id) FROM memories WHERE owner_id = ?", (owner_id,)
|
||||
).fetchone()
|
||||
max_id = max_id_row[0] if max_id_row and max_id_row[0] else 1
|
||||
|
||||
result_cols = cols + ["fts_rank"]
|
||||
enriched: list[dict] = []
|
||||
for row in rows:
|
||||
out.append(dict(zip(result_cols, row)))
|
||||
return out
|
||||
d = dict(zip(result_cols, row))
|
||||
fts_rank = d.get("fts_rank") or 0.0
|
||||
sig_boost = _SIGNIFICANCE_WEIGHT * (d.get("significance") or 0)
|
||||
recency_boost = _RECENCY_WEIGHT * ((d.get("id") or 0) / max_id)
|
||||
d["composite_score"] = fts_rank - sig_boost - recency_boost
|
||||
enriched.append(d)
|
||||
|
||||
enriched.sort(key=lambda x: x["composite_score"])
|
||||
return enriched[:k]
|
||||
|
||||
Reference in New Issue
Block a user