feat: backfill_embeddings script for existing memories (T97.4)
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
"""Backfill embeddings for memories that lack them (T97, Phase 4).
|
||||
|
||||
Walks all memories where no row exists in the ``embeddings`` table. For
|
||||
each, calls :func:`chat.services.embeddings.generate_embedding` and emits
|
||||
an ``embedding_indexed`` event so the projector lands the vector.
|
||||
|
||||
Phase 4 ships the deterministic local pseudo-embedding so this script
|
||||
runs synchronously without a network round-trip — the LLMClient argument
|
||||
is not needed on the pseudo path. Phase 4.5+ will need a real client.
|
||||
|
||||
Run from the repo root:
|
||||
.venv/bin/python scripts/backfill_embeddings.py [--limit N] [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
|
||||
from chat.config import load_settings
|
||||
from chat.db.connection import open_db
|
||||
from chat.db.migrate import apply_migrations
|
||||
from chat.eventlog.log import append_and_apply
|
||||
from chat.services.embeddings import (
|
||||
FALLBACK_EMBEDDING_MODEL,
|
||||
generate_embedding,
|
||||
)
|
||||
|
||||
# Trigger projector handler registration so ``append_and_apply`` lands
|
||||
# the embedding rows correctly.
|
||||
import chat.state.embeddings # noqa: F401
|
||||
import chat.state.entities # noqa: F401
|
||||
import chat.state.memory # noqa: F401
|
||||
import chat.state.world # noqa: F401
|
||||
|
||||
|
||||
async def main() -> None:
    """Backfill embeddings for every memory that has no ``embeddings`` row.

    Command-line flags:
        --limit N   Append ``LIMIT N`` to the selection query.
        --dry-run   Report how many memories were selected, then return
                    without generating or writing anything.

    The configured database's parent directory is created and migrations
    are applied before the query runs; this happens in dry-run mode too.
    For each selected memory an embedding is generated from its
    ``pov_summary`` (an empty string when that column is falsy) and an
    ``embedding_indexed`` event is appended. Results tagged with
    ``FALLBACK_EMBEDDING_MODEL`` are counted as skipped and not indexed.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Cap the number of memories backfilled in this run.",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print the count of memories needing embeddings, then exit.",
    )
    opts = cli.parse_args()

    settings = load_settings()
    settings.db_path.parent.mkdir(parents=True, exist_ok=True)
    apply_migrations(settings.db_path)

    # Anti-join: keep only memories with no matching embeddings row.
    query = (
        "SELECT m.id, m.pov_summary FROM memories m "
        "LEFT JOIN embeddings e ON e.memory_id = m.id "
        "WHERE e.memory_id IS NULL "
        "ORDER BY m.id"
    )
    if opts.limit is not None:
        # The int() coercion keeps the interpolated fragment strictly numeric.
        query = f"{query} LIMIT {int(opts.limit)}"

    with open_db(settings.db_path) as conn:
        # Materialise the work list up front so the loop below never
        # iterates a live cursor while events are written on ``conn``.
        pending = conn.execute(query).fetchall()
        print(f"Found {len(pending)} memories needing embeddings.")
        if args_dry_run := opts.dry_run:
            return

        indexed = 0
        skipped = 0
        for memory_id, summary in pending:
            embedding = await generate_embedding(
                client=None,  # pseudo path: no client needed
                text=summary or "",
            )
            if embedding.model != FALLBACK_EMBEDDING_MODEL:
                event_payload = {
                    "memory_id": memory_id,
                    "model": embedding.model,
                    "dim": embedding.dim,
                    "vector": embedding.vector,
                }
                append_and_apply(
                    conn,
                    kind="embedding_indexed",
                    payload=event_payload,
                )
                indexed += 1
                print(f" Indexed memory_id={memory_id}")
            else:
                # A fallback-model result is not persisted; the message
                # attributes it to empty source text.
                print(f" Skipping memory_id={memory_id} (empty text)")
                skipped += 1
        print(f"Done. Indexed {indexed}, skipped {skipped}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run only when executed as a script (not on import): asyncio.run
    # drives the async ``main`` coroutine to completion.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user