b3d78c1603
Updates the docstring + test docstring for the NotImplementedError stub shipped in T112 (Phase 4.5). Original wording said Featherless 'does not expose /v1/embeddings'; verified the endpoint actually responds but always returns HTTP 500 with type='completions_error' for every model tried (text-embedding-3-small, BAAI/bge-small-en-v1.5, sentence-transformers/all-MiniLM-L6-v2, etc.) and /v1/models has no embedding-class entries. Stub behavior unchanged.
87 lines
3.7 KiB
Python
87 lines
3.7 KiB
Python
from __future__ import annotations
|
|
import asyncio
|
|
from typing import AsyncIterator, Sequence
|
|
from openai import AsyncOpenAI
|
|
from .client import Message
|
|
|
|
|
|
class FeatherlessClient:
    """Client for Featherless's OpenAI-compatible API.

    Featherless caps concurrent connections per account (2 on free / lower
    paid tiers). A class-level semaphore gates every ``generate`` and
    ``stream`` call so the orchestrator never exceeds the configured cap,
    regardless of how many ``FeatherlessClient`` instances are alive.

    Configure once at app startup via :meth:`configure_concurrency`. The
    default is 2.
    """

    # Shared by every instance; created lazily by ``_sem`` when
    # ``configure_concurrency`` has not been called first.
    _semaphore: asyncio.Semaphore | None = None

    @classmethod
    def configure_concurrency(cls, max_concurrent: int) -> None:
        """Replace the shared semaphore with one sized to ``max_concurrent``.

        The value is coerced with ``int()`` and clamped to at least 1, so a
        zero or negative setting cannot produce a semaphore that blocks
        every call.
        """
        cls._semaphore = asyncio.Semaphore(max(1, int(max_concurrent)))

    @classmethod
    def _sem(cls) -> asyncio.Semaphore:
        """Return the shared semaphore, creating the default (2) on first use."""
        if cls._semaphore is None:
            cls._semaphore = asyncio.Semaphore(2)
        return cls._semaphore

    def __init__(self, api_key: str, base_url: str = "https://api.featherless.ai/v1"):
        """Build the underlying ``AsyncOpenAI`` client for ``base_url``."""
        self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)

    async def generate(self, messages: Sequence[Message], *, model: str, **params) -> str:
        """Run one non-streaming chat completion and return its text.

        ``params`` is forwarded unchanged to ``chat.completions.create``.
        Returns the first choice's message content, or ``""`` when that
        content is empty or ``None``.
        """
        async with self._sem():
            resp = await self._client.chat.completions.create(
                model=model,
                messages=[{"role": m.role, "content": m.content} for m in messages],
                **params,
            )
            return resp.choices[0].message.content or ""

    async def stream(self, messages: Sequence[Message], *, model: str, **params) -> AsyncIterator[str]:
        """Stream a chat completion, yielding each non-empty content delta.

        ``params`` is forwarded unchanged to ``chat.completions.create``
        (with ``stream=True`` added). The shared semaphore slot is held for
        as long as the generator is being consumed, because the ``yield``
        sits inside the ``async with`` block.
        """
        async with self._sem():
            stream = await self._client.chat.completions.create(
                model=model,
                messages=[{"role": m.role, "content": m.content} for m in messages],
                stream=True,
                **params,
            )
            async for chunk in stream:
                # A chunk may carry no choices at all (e.g. a usage-only
                # chunk when the caller passes ``stream_options``); indexing
                # ``choices[0]`` on it would raise IndexError mid-stream.
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta.content
                if delta:
                    yield delta

    async def embed(self, text: str, *, model: str) -> list[float]:
        """Embeddings via Featherless — unsupported in practice.

        T112 (Phase 4.5) extends the LLMClient Protocol with ``embed()``
        for a future real-embedding swap. Featherless's OpenAI-compatible
        surface routes ``/v1/embeddings`` (no 404), but every request
        returns HTTP 500 ``{"error": {"type": "completions_error", ...}}``
        — including standard names like ``text-embedding-3-small`` and
        ``BAAI/bge-small-en-v1.5``. ``/v1/models`` confirms it: the
        catalog has no embedding-class entries, only chat/completion
        classes (``llama3-*``, ``gemma3-*``, ``glm5-*``, etc.).

        Rather than ship a request that always 500s, this implementation
        raises ``NotImplementedError``. The
        :func:`chat.services.embeddings.generate_embedding` wrapper
        catches it and degrades to the existing zero-vector fallback
        (with the T107 warning), so misconfigured callers fail loudly in
        logs but the request path keeps working.

        For real embeddings, configure a different provider (OpenAI
        direct, Cohere, Voyage, Together, self-hosted Ollama /
        sentence-transformers). The Mock + routing seam from T112 keeps
        the swap to a one-class change in ``chat/llm/``.

        Raises:
            NotImplementedError: always; no request is sent.
        """
        raise NotImplementedError(
            "Featherless /v1/embeddings always returns 500 "
            '("completions_error") and the model catalog has no '
            "embedding class; configure a different embedding provider "
            "or stick with the default pseudo-sha256-384 model."
        )
|