# chat/chat/llm/featherless.py

from __future__ import annotations
import asyncio
from typing import AsyncIterator, Sequence
from openai import AsyncOpenAI
from .client import Message


class FeatherlessClient:
    """Client for Featherless's OpenAI-compatible API.

    Featherless caps concurrent connections per account (2 on free / lower
    paid tiers). A class-level semaphore gates every ``generate`` and
    ``stream`` call so the orchestrator never exceeds the configured cap,
    regardless of how many ``FeatherlessClient`` instances are alive.

    Configure once at app startup via :meth:`configure_concurrency`. The
    default is 2.
    """

    # Shared by every instance. Stays ``None`` until either
    # ``configure_concurrency`` or the first ``_sem()`` call creates it.
    _semaphore: asyncio.Semaphore | None = None

    @classmethod
    def configure_concurrency(cls, max_concurrent: int) -> None:
        """Install a fresh shared semaphore with ``max_concurrent`` slots.

        The value is coerced with ``int()`` and clamped to at least 1.
        Calls that already hold the previous semaphore keep (and later
        release) that old object, so call this before issuing requests.
        """
        cls._semaphore = asyncio.Semaphore(max(1, int(max_concurrent)))

    @classmethod
    def _sem(cls) -> asyncio.Semaphore:
        """Return the shared semaphore, creating the 2-slot default lazily."""
        if cls._semaphore is None:
            cls._semaphore = asyncio.Semaphore(2)
        return cls._semaphore

    def __init__(self, api_key: str, base_url: str = "https://api.featherless.ai/v1"):
        """Build the underlying ``AsyncOpenAI`` client for ``base_url``."""
        self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)

    @staticmethod
    def _payload(messages: Sequence[Message]) -> list[dict]:
        """Convert messages into the ``role``/``content`` dicts sent to the API."""
        return [{"role": m.role, "content": m.content} for m in messages]

    async def generate(self, messages: Sequence[Message], *, model: str, **params) -> str:
        """Run one non-streaming chat completion and return its text.

        Args:
            messages: Conversation turns; each needs ``role`` and ``content``.
            model: Model identifier forwarded to the API.
            **params: Extra keyword arguments forwarded verbatim to
                ``chat.completions.create``.

        Returns:
            The first choice's message content, or ``""`` when that
            content is ``None`` or empty.
        """
        async with self._sem():
            resp = await self._client.chat.completions.create(
                model=model,
                messages=self._payload(messages),
                **params,
            )
            return resp.choices[0].message.content or ""

    async def stream(self, messages: Sequence[Message], *, model: str, **params) -> AsyncIterator[str]:
        """Stream a chat completion, yielding each non-empty text delta.

        The shared semaphore slot is held for the whole lifetime of the
        stream, and the SDK stream is closed *before* the slot is
        released, so an abandoned or failed stream does not leave a
        connection open while another call takes the freed slot.

        Args:
            messages: Conversation turns; each needs ``role`` and ``content``.
            model: Model identifier forwarded to the API.
            **params: Extra keyword arguments forwarded verbatim to
                ``chat.completions.create`` (``stream=True`` is always set
                here, so do not pass ``stream`` yourself).

        Yields:
            Non-empty content fragments in arrival order.
        """
        async with self._sem():
            stream = await self._client.chat.completions.create(
                model=model,
                messages=self._payload(messages),
                stream=True,
                **params,
            )
            try:
                async for chunk in stream:
                    # Some chunks carry no choices at all (e.g. the trailing
                    # usage chunk when ``stream_options`` requests usage);
                    # indexing ``choices[0]`` on them would raise IndexError.
                    if not chunk.choices:
                        continue
                    text = chunk.choices[0].delta.content
                    if text:
                        yield text
            finally:
                # Runs on normal exhaustion, on errors, and when the
                # consumer stops iterating early (generator close).
                await stream.close()

    async def embed(self, text: str, *, model: str) -> list[float]:
        """Embeddings via Featherless — unsupported in practice.

        T112 (Phase 4.5) extends the LLMClient Protocol with ``embed()``
        for a future real-embedding swap. Featherless's OpenAI-compatible
        surface routes ``/v1/embeddings`` (no 404), but every request
        returns HTTP 500 ``{"error": {"type": "completions_error", ...}}``
        — including standard names like ``text-embedding-3-small`` and
        ``BAAI/bge-small-en-v1.5``. ``/v1/models`` confirms it: the
        catalog has no embedding-class entries, only chat/completion
        classes (``llama3-*``, ``gemma3-*``, ``glm5-*``, etc.).

        Rather than ship a request that always 500s, this implementation
        raises ``NotImplementedError``. The
        :func:`chat.services.embeddings.generate_embedding` wrapper
        catches it and degrades to the existing zero-vector fallback
        (with the T107 warning), so misconfigured callers fail loudly in
        logs but the request path keeps working.

        For real embeddings, configure a different provider (OpenAI
        direct, Cohere, Voyage, Together, self-hosted Ollama /
        sentence-transformers). The Mock + routing seam from T112 keeps
        the swap to a one-class change in ``chat/llm/``.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Featherless /v1/embeddings always returns 500 "
            '("completions_error") and the model catalog has no '
            "embedding class; configure a different embedding provider "
            "or stick with the default pseudo-sha256-384 model."
        )