From d85ed8aaa6e90caa467d4edce942264fe4d135d7 Mon Sep 17 00:00:00 2001 From: Joseph Doherty Date: Mon, 27 Apr 2026 02:51:48 -0400 Subject: [PATCH] feat: backfill_embeddings script for existing memories (T97.4) --- scripts/backfill_embeddings.py | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 scripts/backfill_embeddings.py diff --git a/scripts/backfill_embeddings.py b/scripts/backfill_embeddings.py new file mode 100644 index 0000000..f5c15bb --- /dev/null +++ b/scripts/backfill_embeddings.py @@ -0,0 +1,97 @@ +"""Backfill embeddings for memories that lack them (T97, Phase 4). + +Walks all memories where no row exists in the ``embeddings`` table. For +each, calls :func:`chat.services.embeddings.generate_embedding` and emits +an ``embedding_indexed`` event so the projector lands the vector. + +Phase 4 ships the deterministic local pseudo-embedding so this script +runs synchronously without a network round-trip — the LLMClient argument +is not needed on the pseudo path. Phase 4.5+ will need a real client. + +Run from the repo root: + .venv/bin/python scripts/backfill_embeddings.py [--limit N] [--dry-run] +""" + +from __future__ import annotations + +import argparse +import asyncio + +from chat.config import load_settings +from chat.db.connection import open_db +from chat.db.migrate import apply_migrations +from chat.eventlog.log import append_and_apply +from chat.services.embeddings import ( + FALLBACK_EMBEDDING_MODEL, + generate_embedding, +) + +# Trigger projector handler registration so ``append_and_apply`` lands +# the embedding rows correctly. 
import chat.state.embeddings  # noqa: F401
import chat.state.entities  # noqa: F401
import chat.state.memory  # noqa: F401
import chat.state.world  # noqa: F401


async def main() -> None:
    """Backfill an embedding for each memory that has no ``embeddings`` row.

    Command-line flags:
        --limit N    Append ``LIMIT N`` to the selection query.
        --dry-run    Print how many memories were selected, then return
                     without generating or writing anything.

    Steps: load settings, create the database's parent directory if it is
    missing, apply migrations, then select ``(id, pov_summary)`` for every
    memory without a matching ``embeddings`` row, ordered by id. For each
    selected memory an embedding is generated and an ``embedding_indexed``
    event is appended via ``append_and_apply``. A result whose model equals
    ``FALLBACK_EMBEDDING_MODEL`` is reported as empty text and skipped.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Cap the number of memories backfilled in this run.",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the count of memories needing embeddings, then exit.",
    )
    opts = cli.parse_args()

    cfg = load_settings()
    # Make sure the directory holding the database exists before migrating.
    cfg.db_path.parent.mkdir(parents=True, exist_ok=True)
    apply_migrations(cfg.db_path)

    # Anti-join: keep only memories with no matching embeddings row.
    query = (
        "SELECT m.id, m.pov_summary "
        "FROM memories m LEFT JOIN embeddings e ON e.memory_id = m.id "
        "WHERE e.memory_id IS NULL ORDER BY m.id"
    )
    if opts.limit is not None:
        # int() keeps the interpolated fragment strictly numeric.
        query = f"{query} LIMIT {int(opts.limit)}"

    with open_db(cfg.db_path) as conn:
        pending = conn.execute(query).fetchall()
        print(f"Found {len(pending)} memories needing embeddings.")
        if args_dry_run := opts.dry_run:
            return

        n_indexed = 0
        n_skipped = 0
        for mem_id, summary in pending:
            # Pseudo-embedding path: no LLM client is required.
            embedding = await generate_embedding(
                client=None,
                text=summary if summary else "",
            )
            if embedding.model != FALLBACK_EMBEDDING_MODEL:
                append_and_apply(
                    conn,
                    kind="embedding_indexed",
                    payload={
                        "memory_id": mem_id,
                        "model": embedding.model,
                        "dim": embedding.dim,
                        "vector": embedding.vector,
                    },
                )
                n_indexed += 1
                print(f" Indexed memory_id={mem_id}")
            else:
                # NOTE(review): a skipped memory gets no event here, so it
                # presumably matches the query again on the next run and
                # still counts toward --limit — confirm that is acceptable.
                print(f" Skipping memory_id={mem_id} (empty text)")
                n_skipped += 1
        print(f"Done. Indexed {n_indexed}, skipped {n_skipped}.")


if __name__ == "__main__":
    asyncio.run(main())