feat: backfill_embeddings script for existing memories (T97.4)

2026-04-27 02:51:48 -04:00
parent 9c63d6b24c
commit d85ed8aaa6
1 changed files with 97 additions and 0 deletions
@@ -0,0 +1,97 @@
+"""Backfill embeddings for memories that lack them (T97, Phase 4).
+
+Walks all memories where no row exists in the ``embeddings`` table. For
+each, calls :func:`chat.services.embeddings.generate_embedding` and emits
+an ``embedding_indexed`` event so the projector lands the vector.
+
+Phase 4 ships the deterministic local pseudo-embedding so this script
+runs synchronously without a network round-trip — the LLMClient argument
+is not needed on the pseudo path. Phase 4.5+ will need a real client.
+
+Run from the repo root:
+    .venv/bin/python scripts/backfill_embeddings.py [--limit N] [--dry-run]
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+
+from chat.config import load_settings
+from chat.db.connection import open_db
+from chat.db.migrate import apply_migrations
+from chat.eventlog.log import append_and_apply
+from chat.services.embeddings import (
+    FALLBACK_EMBEDDING_MODEL,
+    generate_embedding,
+)
+
+# Trigger projector handler registration so ``append_and_apply`` lands
+# the embedding rows correctly.
+import chat.state.embeddings  # noqa: F401
+import chat.state.entities  # noqa: F401
+import chat.state.memory  # noqa: F401
+import chat.state.world  # noqa: F401
+
+
+async def main() -> None:
+    """Backfill ``embedding_indexed`` events for memories without embeddings.
+
+    Steps, in order:
+
+    1. Parse ``--limit`` and ``--dry-run`` from the command line.
+    2. Ensure the database directory exists and call ``apply_migrations``.
+    3. Select every memory with no matching ``embeddings`` row, ordered by id,
+       and print how many were found. With ``--dry-run`` the function
+       returns here.
+    4. For each pending memory, await ``generate_embedding`` on its
+       ``pov_summary``. Results tagged ``FALLBACK_EMBEDDING_MODEL`` are
+       reported and skipped; every other result is recorded with
+       ``append_and_apply`` as an ``embedding_indexed`` event.
+
+    ``--limit`` caps the number of memories actually *indexed* in this run.
+    The cap is enforced while iterating rather than with a SQL ``LIMIT``:
+    skipped memories never receive an embedding row, so they are selected
+    again on every run and would otherwise use up the limit without any
+    progress being made. As a consequence the "Found N" line (and therefore
+    ``--dry-run``) reports all pending memories, not a capped subset.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--limit",
+        type=int,
+        default=None,
+        help="Cap the number of memories backfilled in this run.",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print the count of memories needing embeddings, then exit.",
+    )
+    args = parser.parse_args()
+
+    # ``None`` means "no cap". A negative value is also treated as "no cap",
+    # matching how a negative SQL LIMIT behaves in SQLite.
+    # NOTE(review): assumes the database behind ``open_db`` is SQLite -- confirm.
+    cap = args.limit if args.limit is not None and args.limit >= 0 else None
+
+    settings = load_settings()
+    settings.db_path.parent.mkdir(parents=True, exist_ok=True)
+    apply_migrations(settings.db_path)
+
+    with open_db(settings.db_path) as conn:
+        # Anti-join: keep only memories that have no embeddings row yet.
+        rows = conn.execute(
+            "SELECT m.id, m.pov_summary FROM memories m "
+            "LEFT JOIN embeddings e ON e.memory_id = m.id "
+            "WHERE e.memory_id IS NULL "
+            "ORDER BY m.id"
+        ).fetchall()
+        print(f"Found {len(rows)} memories needing embeddings.")
+        if args.dry_run:
+            return
+
+        indexed = 0
+        skipped = 0
+        for memory_id, text in rows:
+            # Only successfully indexed memories count against the cap, so
+            # skipped rows at the front of the id order cannot stall the run.
+            if cap is not None and indexed >= cap:
+                break
+            result = await generate_embedding(
+                client=None,  # pseudo path: no client needed
+                text=text or "",  # a NULL summary is passed as empty text
+            )
+            if result.model == FALLBACK_EMBEDDING_MODEL:
+                print(f"  Skipping memory_id={memory_id} (empty text)")
+                skipped += 1
+                continue
+            append_and_apply(
+                conn,
+                kind="embedding_indexed",
+                payload={
+                    "memory_id": memory_id,
+                    "model": result.model,
+                    "dim": result.dim,
+                    "vector": result.vector,
+                },
+            )
+            indexed += 1
+            print(f"  Indexed memory_id={memory_id}")
+        print(f"Done. Indexed {indexed}, skipped {skipped}.")
+
+
+if __name__ == "__main__":  # only when executed as a script, not on import
+    asyncio.run(main())  # drive the async ``main`` coroutine to completion