diff --git a/chat/services/embeddings.py b/chat/services/embeddings.py
new file mode 100644
index 0000000..ece6eae
--- /dev/null
+++ b/chat/services/embeddings.py
@@ -0,0 +1,150 @@
+"""Embedding generation service (T91, Phase 4).
+
+Wraps the embedding API call. For Phase 4's first cut we ship a
+deterministic local pseudo-embedding (hash-derived) so the vector
+retrieval pipeline can land without an external embedding endpoint
+or heavy local dependency. Phase 4.5+ swaps to a real model — the
+EmbeddingResult shape stays the same, only the generator changes.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import math
+import struct
+
+from pydantic import BaseModel
+
+from chat.llm.client import LLMClient
+
+
+DEFAULT_EMBEDDING_DIM = 384
+DEFAULT_EMBEDDING_MODEL = "pseudo-sha256-384"
+FALLBACK_EMBEDDING_MODEL = "fallback"
+
+# Byte widths used when expanding SHA-256 output into packed int32s.
+_SHA256_DIGEST_BYTES = hashlib.sha256().digest_size
+_INT32_BYTES = 4
+# 2**31: dividing an int32 by this maps it into [-1, 1).
+_INT32_SCALE = 2147483648.0
+
+
+# Result envelope shared by the pseudo, fallback and (future) real paths.
+class EmbeddingResult(BaseModel):
+    # The embedding; all zeros when ``model`` is the fallback sentinel.
+    vector: list[float]
+    # Name of the generator that produced ``vector``.
+    model: str
+    # Dimensionality the caller asked for.
+    dim: int
+
+
+def _pseudo_embed(text: str, dim: int = DEFAULT_EMBEDDING_DIM) -> list[float]:
+    """Deterministic pseudo-embedding for Phase 4 first cut.
+
+    Expands the text into ``dim * 4`` bytes by hashing the UTF-8 text
+    followed by a 4-byte big-endian counter (0, 1, 2, ...) with SHA-256
+    and concatenating the digests. Each 32-byte block comes from its own
+    hash call rather than naively repeating one digest (which would
+    collapse the vector onto only 8 unique floats and make distinct
+    inputs cosine-similar).
+
+    Bytes are unpacked as little-endian int32s and rescaled to [-1, 1)
+    so we sidestep the float32 NaN/denormal values that ``struct.unpack
+    'f'`` would otherwise produce on raw hash bytes. The result is
+    unit-normalized so cosine similarity reduces to a dot product.
+
+    NOT semantically meaningful — just consistent for testing the
+    pipeline. Phase 4.5 should swap to a real embedding model.
+
+    Args:
+        text: Input string to embed.
+        dim: Number of components to produce; ``0`` yields ``[]``.
+
+    Returns:
+        A list of ``dim`` floats scaled to unit Euclidean norm.
+
+    Raises:
+        ValueError: If ``dim`` is negative.
+    """
+    if dim < 0:
+        raise ValueError(f"dim must be non-negative, got {dim}")
+
+    needed = dim * _INT32_BYTES
+    seed = text.encode("utf-8")
+    # Ceiling division gives the digest count up front, so the loop no
+    # longer re-measures everything generated so far on each pass.
+    n_blocks = -(-needed // _SHA256_DIGEST_BYTES)
+    stream = b"".join(
+        hashlib.sha256(seed + counter.to_bytes(4, "big")).digest()
+        for counter in range(n_blocks)
+    )
+    ints = struct.unpack(f"<{dim}i", stream[:needed])
+    # Map int32 to roughly [-1, 1] — exact bound doesn't matter since we
+    # normalize, but keeps values numerically tame.
+    raw = [x / _INT32_SCALE for x in ints]
+    # ``or 1.0`` guards the division if every component came out zero.
+    norm = math.sqrt(sum(x * x for x in raw)) or 1.0
+    return [x / norm for x in raw]
+
+
+def _fallback_result(dim: int) -> EmbeddingResult:
+    """Build the zero-vector sentinel result for ``dim`` components."""
+    return EmbeddingResult(
+        vector=[0.0] * dim, model=FALLBACK_EMBEDDING_MODEL, dim=dim
+    )
+
+
+async def generate_embedding(
+    client: LLMClient,
+    *,
+    text: str,
+    model: str = DEFAULT_EMBEDDING_MODEL,
+    dim: int = DEFAULT_EMBEDDING_DIM,
+    timeout_s: float = 30.0,
+) -> EmbeddingResult:
+    """Generate an embedding for the given text.
+
+    Phase 4 default uses a deterministic local pseudo-embedding. If
+    the LLMClient grows an ``embed(...)`` method in Phase 4.5, this
+    wrapper will route to it when ``model != "pseudo-sha256-384"``.
+
+    Returns a zero vector with ``model="fallback"`` for empty or
+    whitespace-only text and for any model other than the default
+    (callers detect the sentinel and skip indexing). The pseudo path
+    does no I/O — it's pure local computation.
+
+    Args:
+        client: Not used yet; reserved for the Phase 4.5 remote path.
+        text: Text to embed.
+        model: Embedding model name; only the default is implemented.
+        dim: Number of vector components to return.
+        timeout_s: Not used yet; reserved for the Phase 4.5 remote path.
+
+    Returns:
+        An ``EmbeddingResult`` whose ``model`` is either the requested
+        default or the ``"fallback"`` sentinel.
+
+    Raises:
+        ValueError: If ``dim`` is negative and the pseudo path is taken.
+    """
+    if not text or not text.strip():
+        # Empty input — return fallback so caller doesn't index empty rows.
+        return _fallback_result(dim)
+
+    if model == DEFAULT_EMBEDDING_MODEL:
+        # Pure-local pseudo path — no LLMClient call.
+        return EmbeddingResult(vector=_pseudo_embed(text, dim), model=model, dim=dim)
+
+    # Future: real embedding via client.embed(...). Phase 4.5 work.
+    # For Phase 4, any non-default model falls through to fallback.
+    return _fallback_result(dim)
+
+
+__all__ = [
+    "DEFAULT_EMBEDDING_DIM",
+    "DEFAULT_EMBEDDING_MODEL",
+    "FALLBACK_EMBEDDING_MODEL",
+    "EmbeddingResult",
+    "generate_embedding",
+]
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
new file mode 100644
index 0000000..b458681
--- /dev/null
+++ b/tests/test_embeddings.py
@@ -0,0 +1,117 @@
+"""Tests for the embedding generation service (T91, Phase 4).
+
+Phase 4's first cut ships a deterministic local pseudo-embedding so the
+vector retrieval pipeline can land without an external embeddings API
+or a heavy local model dependency. These tests pin the contract:
+
+* the result has the right shape (vector length, ``dim`` metadata),
+* the default ``model`` string is reported back unchanged,
+* output is byte-identical for the same input (deterministic),
+* distinct inputs produce distinct vectors (so cosine actually
+  discriminates),
+* empty / whitespace-only input collapses to the ``"fallback"`` sentinel
+  with a zero vector — callers detect this and skip indexing,
+* the vector is unit-normalized so cosine similarity behaves,
+* a non-default ``model`` also collapses to the ``"fallback"`` sentinel,
+* a custom ``dim`` is honoured, and a negative ``dim`` raises
+  ``ValueError``.
+
+The pseudo path doesn't touch the LLMClient, so we pass an empty
+``MockLLMClient`` — any accidental call into it would raise
+``IndexError`` and surface as a regression.
+"""
+
+from __future__ import annotations
+
+import math
+
+import pytest
+
+from chat.llm.mock import MockLLMClient
+from chat.services.embeddings import (
+    DEFAULT_EMBEDDING_DIM,
+    DEFAULT_EMBEDDING_MODEL,
+    FALLBACK_EMBEDDING_MODEL,
+    EmbeddingResult,
+    generate_embedding,
+)
+
+
+def _client() -> MockLLMClient:
+    # Pseudo path never calls the client — empty canned list ensures any
+    # accidental call raises and surfaces the regression loudly.
+    return MockLLMClient(canned=[])
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_returns_vector_of_correct_dim():
+    result = await generate_embedding(_client(), text="hello")
+    assert isinstance(result, EmbeddingResult)
+    assert isinstance(result.vector, list)
+    assert len(result.vector) == DEFAULT_EMBEDDING_DIM == 384
+    assert result.dim == 384
+    assert all(isinstance(x, float) for x in result.vector)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_returns_correct_model_metadata():
+    result = await generate_embedding(_client(), text="hello")
+    assert result.model == DEFAULT_EMBEDDING_MODEL == "pseudo-sha256-384"
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_is_deterministic():
+    a = await generate_embedding(_client(), text="hello world")
+    b = await generate_embedding(_client(), text="hello world")
+    assert a.vector == b.vector
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_distinct_text_produces_distinct_vectors():
+    a = await generate_embedding(_client(), text="hello world")
+    b = await generate_embedding(_client(), text="totally different content")
+    assert a.vector != b.vector
+    # Sanity-check cosine similarity — both vectors are unit-normalized,
+    # so this reduces to a plain dot product.
+    cosine = sum(x * y for x, y in zip(a.vector, b.vector))
+    assert cosine < 0.99
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_empty_text_returns_fallback():
+    for empty in ("", " ", "\n\t"):
+        result = await generate_embedding(_client(), text=empty)
+        assert result.model == FALLBACK_EMBEDDING_MODEL == "fallback"
+        assert result.dim == DEFAULT_EMBEDDING_DIM
+        assert len(result.vector) == DEFAULT_EMBEDDING_DIM
+        assert all(x == 0.0 for x in result.vector)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_unit_normalized():
+    result = await generate_embedding(_client(), text="some non-empty text")
+    norm_sq = sum(x * x for x in result.vector)
+    assert math.isclose(norm_sq, 1.0, abs_tol=1e-6)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_unknown_model_returns_fallback():
+    result = await generate_embedding(_client(), text="hello", model="nope")
+    assert result.model == FALLBACK_EMBEDDING_MODEL
+    assert result.dim == DEFAULT_EMBEDDING_DIM
+    assert result.vector == [0.0] * DEFAULT_EMBEDDING_DIM
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_honours_custom_dim():
+    # 10 int32s need 40 bytes, i.e. a truncated second SHA-256 block.
+    result = await generate_embedding(_client(), text="hello", dim=10)
+    assert result.dim == 10
+    assert len(result.vector) == 10
+    assert math.isclose(sum(x * x for x in result.vector), 1.0, abs_tol=1e-6)
+
+
+@pytest.mark.asyncio
+async def test_generate_embedding_negative_dim_raises():
+    with pytest.raises(ValueError):
+        await generate_embedding(_client(), text="hello", dim=-1)