feat: embeddings table + projector handlers (pure-Python cosine, T88)

2026-04-27 02:22:32 -04:00
parent bffd9a2f38
commit 0ba374b790
3 changed files with 337 additions and 0 deletions
@@ -0,0 +1,105 @@
+"""Embeddings projector + readers (T88, Phase 4).
+
+Embeddings are stored as JSON-serialized float arrays in a regular
+SQLite table. Cosine similarity is computed in Python at query time
+(see chat/services/vector_search.py / T92). This deliberately avoids
+the sqlite-vec extension dependency — the host Python build doesn't
+support enable_load_extension. Phase 4.5+ may revisit if memory counts
+grow beyond pure-Python feasibility (~few thousand per query).
+"""
+
+from __future__ import annotations
+import json
+from sqlite3 import Connection
+
+from chat.eventlog.projector import on
+from chat.eventlog.log import Event
+
+
+@on("embedding_indexed")
+def _apply_embedding_indexed(conn: Connection, e: Event) -> None:
+    """Insert or replace the embedding for a memory.
+
+    Idempotent: re-projection or re-indexing replaces the prior vector.
+    """
+    p = e.payload
+    vector = p["vector"]
+    conn.execute(
+        "INSERT OR REPLACE INTO embeddings "
+        "(memory_id, vector_json, model, dim, indexed_at) "
+        "VALUES (?, ?, ?, ?, datetime('now'))",
+        (
+            int(p["memory_id"]),
+            json.dumps(list(vector)),
+            p["model"],
+            int(p.get("dim") or len(vector)),
+        ),
+    )
+
+
+@on("embedding_deindexed")
+def _apply_embedding_deindexed(conn: Connection, e: Event) -> None:
+    """Remove the embedding for a memory (used by reset cascade)."""
+    p = e.payload
+    conn.execute(
+        "DELETE FROM embeddings WHERE memory_id = ?",
+        (int(p["memory_id"]),),
+    )
+
+
+def get_embedding(conn: Connection, memory_id: int) -> dict | None:
+    row = conn.execute(
+        "SELECT memory_id, vector_json, model, dim, indexed_at "
+        "FROM embeddings WHERE memory_id = ?",
+        (memory_id,),
+    ).fetchone()
+    if not row:
+        return None
+    return {
+        "memory_id": row[0],
+        "vector": json.loads(row[1]),
+        "model": row[2],
+        "dim": row[3],
+        "indexed_at": row[4],
+    }
+
+
+def list_embeddings_for_owner(conn: Connection, owner_id: str) -> list[dict]:
+    """Return all embeddings for memories owned by ``owner_id``.
+
+    Used by vector search at query time (T92). The join carries the
+    fields the cosine ranker needs to assemble result rows without a
+    second round-trip: the POV summary text, significance, and witness
+    flags. The ``memories`` table has no separate ``text`` column —
+    ``pov_summary`` is the canonical narrative text per
+    ``chat/services/memory_write.py``.
+    """
+    rows = conn.execute(
+        "SELECT e.memory_id, e.vector_json, e.model, e.dim, "
+        "       m.pov_summary, m.significance, "
+        "       m.witness_you, m.witness_host, m.witness_guest "
+        "FROM embeddings e "
+        "JOIN memories m ON m.id = e.memory_id "
+        "WHERE m.owner_id = ?",
+        (owner_id,),
+    ).fetchall()
+    return [
+        {
+            "memory_id": r[0],
+            "vector": json.loads(r[1]),
+            "model": r[2],
+            "dim": r[3],
+            "pov_summary": r[4],
+            "significance": r[5],
+            "witness_you": r[6],
+            "witness_host": r[7],
+            "witness_guest": r[8],
+        }
+        for r in rows
+    ]
+
+
+# Public API: only the reader helpers are exported. The underscore-prefixed
+# projector handlers above are hooked up through the ``@on`` decorator and
+# are not meant to be imported by name.
+__all__ = [
+    "get_embedding",
+    "list_embeddings_for_owner",
+]