feat: embeddings table + projector handlers (pure-Python cosine, T88)
This commit is contained in:
@@ -0,0 +1,105 @@
|
||||
"""Embeddings projector + readers (T88, Phase 4).

Embeddings are stored as JSON-serialized float arrays in a regular
SQLite table. Cosine similarity is computed in Python at query time
(see chat/services/vector_search.py / T92). This deliberately avoids
the sqlite-vec extension dependency, because the host Python build does
not support enable_load_extension. Phase 4.5+ may revisit this if memory
counts grow beyond what pure Python can handle (roughly a few thousand
per query).
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import json
|
||||
from sqlite3 import Connection
|
||||
|
||||
from chat.eventlog.projector import on
|
||||
from chat.eventlog.log import Event
|
||||
|
||||
|
||||
@on("embedding_indexed")
def _apply_embedding_indexed(conn: Connection, e: Event) -> None:
    """Project an ``embedding_indexed`` event into the ``embeddings`` table.

    Reads ``memory_id``, ``vector``, ``model`` and (optionally) ``dim`` from
    the event payload and writes them with ``INSERT OR REPLACE``, so a later
    event for the same memory overwrites the earlier row. That relies on
    ``memory_id`` being a unique/primary key of ``embeddings`` (schema is
    not visible here -- confirm).

    ``indexed_at`` is stamped by SQLite with ``datetime('now')`` at write
    time, not taken from the event.
    """
    payload = e.payload
    raw_vector = payload["vector"]

    upsert_sql = (
        "INSERT OR REPLACE INTO embeddings "
        "(memory_id, vector_json, model, dim, indexed_at) "
        "VALUES (?, ?, ?, ?, datetime('now'))"
    )

    # Bound values are computed in the same order as the SQL placeholders.
    memory_id = int(payload["memory_id"])
    vector_json = json.dumps(list(raw_vector))
    model_name = payload["model"]
    # A missing or falsy ``dim`` falls back to the vector's own length.
    dimension = int(payload.get("dim") or len(raw_vector))

    conn.execute(upsert_sql, (memory_id, vector_json, model_name, dimension))
|
||||
|
||||
|
||||
@on("embedding_deindexed")
def _apply_embedding_deindexed(conn: Connection, e: Event) -> None:
    """Project an ``embedding_deindexed`` event by deleting the memory's row.

    The payload's ``memory_id`` is coerced to ``int`` and used as the only
    filter. Deleting an id that has no stored embedding is a no-op. The
    original note says this event is used by the reset cascade.
    """
    target_id = int(e.payload["memory_id"])
    delete_sql = "DELETE FROM embeddings WHERE memory_id = ?"
    conn.execute(delete_sql, (target_id,))
|
||||
|
||||
|
||||
def get_embedding(conn: Connection, memory_id: int) -> dict | None:
    """Fetch the stored embedding for one memory.

    Returns a dict with keys ``memory_id``, ``vector`` (the JSON column
    decoded back into a Python list), ``model``, ``dim`` and
    ``indexed_at``, or ``None`` when no row matches ``memory_id``.
    """
    lookup_sql = (
        "SELECT memory_id, vector_json, model, dim, indexed_at "
        "FROM embeddings WHERE memory_id = ?"
    )
    found = conn.execute(lookup_sql, (memory_id,)).fetchone()
    if found:
        stored_id, vector_json, model_name, dimension, stamped_at = found
        return {
            "memory_id": stored_id,
            "vector": json.loads(vector_json),
            "model": model_name,
            "dim": dimension,
            "indexed_at": stamped_at,
        }
    return None
|
||||
|
||||
|
||||
def list_embeddings_for_owner(conn: Connection, owner_id: str) -> list[dict]:
    """Return every embedding whose memory belongs to ``owner_id``.

    Used by vector search at query time (T92). Each result joins the
    embedding row with its ``memories`` row so the cosine ranker can build
    result rows in a single query: besides ``memory_id``, ``vector``
    (decoded from JSON), ``model`` and ``dim``, every dict carries
    ``pov_summary``, ``significance`` and the three ``witness_*`` flags.
    The ``memories`` table has no separate ``text`` column --
    ``pov_summary`` is the canonical narrative text per
    ``chat/services/memory_write.py``.

    The query has no ORDER BY, so row order is whatever SQLite yields.
    """
    join_sql = (
        "SELECT e.memory_id, e.vector_json, e.model, e.dim, "
        " m.pov_summary, m.significance, "
        " m.witness_you, m.witness_host, m.witness_guest "
        "FROM embeddings e "
        "JOIN memories m ON m.id = e.memory_id "
        "WHERE m.owner_id = ?"
    )
    # Output keys, in the same order as the selected columns.
    field_names = (
        "memory_id",
        "vector",
        "model",
        "dim",
        "pov_summary",
        "significance",
        "witness_you",
        "witness_host",
        "witness_guest",
    )

    fetched = conn.execute(join_sql, (owner_id,)).fetchall()

    results = []
    for record in fetched:
        entry = dict(zip(field_names, record))
        # The second column is the JSON text; swap it for the decoded list.
        entry["vector"] = json.loads(entry["vector"])
        results.append(entry)
    return results
|
||||
|
||||
|
||||
# Public surface: only the read helpers; the ``@on``-decorated handlers stay private.
__all__ = ["get_embedding", "list_embeddings_for_owner"]
|
||||
Reference in New Issue
Block a user