merge: T91 embedding generation service (pseudo-embedding)

2026-04-27 02:32:53 -04:00
parent 1d6768e980 caa17b4174
commit a25c166174
2 changed files with 199 additions and 0 deletions
@@ -0,0 +1,108 @@
+"""Embedding generation service (T91, Phase 4).
+
+Wraps the embedding API call. For Phase 4's first cut we ship a
+deterministic local pseudo-embedding (hash-derived) so the vector
+retrieval pipeline can land without an external embedding endpoint
+or heavy local dependency. Phase 4.5+ swaps to a real model — the
+EmbeddingResult shape stays the same, only the generator changes.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import math
+import struct
+
+from pydantic import BaseModel
+
+from chat.llm.client import LLMClient
+
+
+DEFAULT_EMBEDDING_DIM = 384
+DEFAULT_EMBEDDING_MODEL = "pseudo-sha256-384"
+FALLBACK_EMBEDDING_MODEL = "fallback"
+
+
+class EmbeddingResult(BaseModel):
+    vector: list[float]
+    model: str
+    dim: int
+
+
+def _pseudo_embed(text: str, dim: int = DEFAULT_EMBEDDING_DIM) -> list[float]:
+    """Deterministic pseudo-embedding for Phase 4 first cut.
+
+    Hashes the text with SHA-256, then expands by re-hashing each
+    successive block with the previous block + a counter — this gives
+    ``dim * 4`` bytes of fresh entropy per input rather than naively
+    repeating the 32-byte digest (which would collapse the vector onto
+    only 8 unique floats and make distinct inputs cosine-similar).
+
+    Bytes are unpacked as little-endian int32s and rescaled to [-1, 1]
+    so we sidestep the float32 NaN/denormal values that ``struct.unpack
+    'f'`` would otherwise produce on raw hash bytes. The result is
+    unit-normalized so cosine similarity reduces to a dot product.
+
+    NOT semantically meaningful — just consistent for testing the
+    pipeline. Phase 4.5 should swap to a real embedding model.
+    """
+    needed = dim * 4  # 4 bytes per int32
+    seed = text.encode("utf-8")
+    chunks: list[bytes] = []
+    counter = 0
+    while sum(len(c) for c in chunks) < needed:
+        block = hashlib.sha256(seed + counter.to_bytes(4, "big")).digest()
+        chunks.append(block)
+        counter += 1
+    full = b"".join(chunks)[:needed]
+    ints = struct.unpack(f"<{dim}i", full)
+    # Map int32 to roughly [-1, 1] — exact bound doesn't matter since we
+    # normalize, but keeps values numerically tame.
+    raw = [x / 2147483648.0 for x in ints]
+    norm = math.sqrt(sum(x * x for x in raw)) or 1.0
+    return [x / norm for x in raw]
+
+
+async def generate_embedding(
+    client: LLMClient,
+    *,
+    text: str,
+    model: str = DEFAULT_EMBEDDING_MODEL,
+    dim: int = DEFAULT_EMBEDDING_DIM,
+    timeout_s: float = 30.0,
+) -> EmbeddingResult:
+    """Generate an embedding for the given text.
+
+    Phase 4 default uses a deterministic local pseudo-embedding. If
+    the LLMClient grows an ``embed(...)`` method in Phase 4.5, this
+    wrapper will route to it when ``model != "pseudo-sha256-384"``.
+
+    Falls back to a zero vector with ``model="fallback"`` on any
+    failure (callers detect the sentinel and skip indexing). For the
+    pseudo path, failure is structurally impossible — it's pure local
+    computation.
+    """
+    if not text or not text.strip():
+        # Empty input — return fallback so caller doesn't index empty rows.
+        return EmbeddingResult(
+            vector=[0.0] * dim, model=FALLBACK_EMBEDDING_MODEL, dim=dim
+        )
+
+    if model == DEFAULT_EMBEDDING_MODEL:
+        # Pure-local pseudo path — no LLMClient call.
+        return EmbeddingResult(vector=_pseudo_embed(text, dim), model=model, dim=dim)
+
+    # Future: real embedding via client.embed(...). Phase 4.5 work.
+    # For Phase 4, any non-default model falls through to fallback.
+    return EmbeddingResult(
+        vector=[0.0] * dim, model=FALLBACK_EMBEDDING_MODEL, dim=dim
+    )
+
+
+__all__ = [
+    "DEFAULT_EMBEDDING_DIM",
+    "DEFAULT_EMBEDDING_MODEL",
+    "FALLBACK_EMBEDDING_MODEL",
+    "EmbeddingResult",
+    "generate_embedding",
+]
@@ -0,0 +1,91 @@
+"""Tests for the embedding generation service (T91, Phase 4).
+
+Phase 4's first cut ships a deterministic local pseudo-embedding so the
+vector retrieval pipeline can land without an external embeddings API
+or a heavy local model dependency. These tests pin the contract:
+
+* the result has the right shape (vector length, ``dim`` metadata),
+* the default ``model`` string is reported back unchanged,
+* output is byte-identical for the same input (deterministic),
+* distinct inputs produce distinct vectors (so cosine actually
+  discriminates),
+* empty / whitespace-only input collapses to the ``"fallback"`` sentinel
+  with a zero vector — callers detect this and skip indexing,
+* the vector is unit-normalized so cosine similarity behaves.
+
+The pseudo path doesn't touch the LLMClient, so we pass an empty
+``MockLLMClient`` — any accidental call into it would raise
+``IndexError`` and surface as a regression.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from chat.llm.mock import MockLLMClient
+from chat.services.embeddings import (
+    DEFAULT_EMBEDDING_DIM,
+    DEFAULT_EMBEDDING_MODEL,
+    FALLBACK_EMBEDDING_MODEL,
+    EmbeddingResult,
+    generate_embedding,
+)
+
+
+def _client() -> MockLLMClient:
+    # Pseudo path never calls the client — empty canned list ensures any
+    # accidental call raises and surfaces the regression loudly.
+    return MockLLMClient(canned=[])
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_returns_vector_of_correct_dim():
+    result = await generate_embedding(_client(), text="hello")
+    assert isinstance(result, EmbeddingResult)
+    assert isinstance(result.vector, list)
+    assert len(result.vector) == DEFAULT_EMBEDDING_DIM == 384
+    assert result.dim == 384
+    assert all(isinstance(x, float) for x in result.vector)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_returns_correct_model_metadata():
+    result = await generate_embedding(_client(), text="hello")
+    assert result.model == DEFAULT_EMBEDDING_MODEL == "pseudo-sha256-384"
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_is_deterministic():
+    a = await generate_embedding(_client(), text="hello world")
+    b = await generate_embedding(_client(), text="hello world")
+    assert a.vector == b.vector
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_distinct_text_produces_distinct_vectors():
+    a = await generate_embedding(_client(), text="hello world")
+    b = await generate_embedding(_client(), text="totally different content")
+    assert a.vector != b.vector
+    # Sanity-check cosine similarity — both vectors are unit-normalized,
+    # so this reduces to a plain dot product.
+    cosine = sum(x * y for x, y in zip(a.vector, b.vector))
+    assert cosine < 0.99
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_empty_text_returns_fallback():
+    for empty in ("", "   ", "\n\t"):
+        result = await generate_embedding(_client(), text=empty)
+        assert result.model == FALLBACK_EMBEDDING_MODEL == "fallback"
+        assert result.dim == DEFAULT_EMBEDDING_DIM
+        assert len(result.vector) == DEFAULT_EMBEDDING_DIM
+        assert all(x == 0.0 for x in result.vector)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_unit_normalized():
+    result = await generate_embedding(_client(), text="some non-empty text")
+    norm_sq = sum(x * x for x in result.vector)
+    assert math.isclose(norm_sq, 1.0, abs_tol=1e-6)