"""Provider-agnostic LLM interfaces, payloads, and normalized error taxonomy.
These contracts are shared across agent code and backend adapters so call sites
can stay provider-neutral while still supporting both chat-style and request-
object style execution paths.
"""
from __future__ import annotations
from collections.abc import Iterator, Mapping, Sequence
from dataclasses import dataclass, field
from datetime import datetime, timezone
from types import TracebackType
from typing import Literal, Protocol, Self
from ._tools import ToolSpec
# Closed vocabularies shared by the payloads below (PEP 695 `type` statements,
# Python 3.12+). Annotations in this file are lazy (`from __future__ import
# annotations`), so these aliases are used purely for typing.
type LLMRole = Literal["system", "user", "assistant", "tool"]
# Streaming responses emit incremental deltas followed by one "completed" event.
type LLMStreamEventKind = Literal["delta", "completed"]
# Tool-calling support level: provider-native, best-effort, or disabled.
type ToolCallingMode = Literal["native", "best_effort", "none"]
# Structured-JSON support level: native JSON mode, prompt-plus-validation, or none.
type JSONMode = Literal["native", "prompt+validate", "none"]
# Objective a routing policy optimizes for when selecting a backend.
type TaskPriority = Literal["latency", "quality", "cost", "balanced"]
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class LLMMessage:
"""One chat message in the provider-neutral completion format."""
role: LLMRole
"""Message role used by chat-compatible backends."""
content: str
"""Plain-text message content."""
name: str | None = None
"""Optional participant name, when supported by the provider."""
tool_call_id: str | None = None
"""Tool call identifier for tool-response messages."""
tool_name: str | None = None
"""Tool name associated with a tool-response message."""
[docs]
@dataclass(slots=True, kw_only=True)
class LLMChatParams:
"""Provider-neutral generation controls passed with chat requests."""
temperature: float | None = None
"""Sampling temperature override for the request."""
max_tokens: int | None = None
"""Maximum number of output tokens to generate."""
response_schema: dict[str, object] | None = None
"""Optional JSON schema describing required structured output."""
provider_options: dict[str, object] = field(default_factory=dict)
"""Provider-specific raw options forwarded to backend adapters."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class Usage:
"""Token accounting information for an LLM call."""
prompt_tokens: int | None = None
"""Prompt token count reported by the backend."""
completion_tokens: int | None = None
"""Completion token count reported by the backend."""
total_tokens: int | None = None
"""Total token count if reported by the backend."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class TaskProfile:
"""Routing hints for selecting a backend."""
priority: TaskPriority = "balanced"
"""Primary optimization objective for backend selection."""
max_cost_usd: float | None = None
"""Upper cost bound for a single request, when enforced."""
max_latency_ms: int | None = None
"""Upper latency target for a single request, when enforced."""
tags: tuple[str, ...] = ()
"""Free-form tags used by model selection policies."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class LLMRequest:
"""Provider-neutral request payload for LLM generation."""
messages: Sequence[LLMMessage]
"""Ordered conversation/messages sent to the model."""
model: str | None = None
"""Explicit model identifier override for this request."""
temperature: float | None = None
"""Sampling temperature override."""
max_tokens: int | None = None
"""Maximum output token limit."""
tools: Sequence[ToolSpec] = ()
"""Tool specifications exposed for model tool-calling."""
response_schema: dict[str, object] | None = None
"""Optional schema for structured output validation."""
response_format: dict[str, object] | None = None
"""Provider-specific response-format hints."""
metadata: dict[str, object] = field(default_factory=dict)
"""Caller metadata forwarded for tracing and diagnostics."""
provider_options: dict[str, object] = field(default_factory=dict)
"""Backend/provider-specific low-level options."""
task_profile: TaskProfile | None = None
"""Optional routing profile used by selector-aware clients."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class Provenance:
"""Provenance metadata for reproducibility and audit trails."""
backend_name: str
"""Configured backend instance name."""
backend_kind: str
"""Backend implementation family/type identifier."""
model_id: str
"""Resolved model identifier used for the request."""
base_url: str | None
"""Backend endpoint base URL, if network-backed."""
started_at: str
"""ISO timestamp when request execution started."""
completed_at: str
"""ISO timestamp when request execution completed."""
config_hash: str
"""Stable hash of backend configuration inputs."""
[docs]
@staticmethod
def now_iso() -> str:
"""Return the current UTC timestamp in ISO 8601 format.
Returns:
Current UTC timestamp as an ISO 8601 string.
"""
return datetime.now(timezone.utc).isoformat() # noqa: UP017
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class LLMResponse:
"""Normalized non-streaming response payload returned by a backend."""
text: str
"""Primary response text emitted by the model."""
model: str | None = None
"""Model identifier reported by the backend."""
provider: str | None = None
"""Provider/backend name that produced this response."""
finish_reason: str | None = None
"""Provider-specific completion reason."""
usage: Usage | dict[str, int] | None = None
"""Token usage counters when available."""
latency_ms: int | None = None
"""End-to-end latency in milliseconds."""
raw_output: dict[str, object] | None = None
"""Legacy/raw backend payload for debugging."""
tool_calls: tuple[ToolCall, ...] = ()
"""Tool calls requested by the model in this response."""
raw: dict[str, object] | None = None
"""Canonical raw backend payload snapshot."""
provenance: Provenance | None = None
"""Execution provenance metadata for auditability."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class LLMStreamEvent:
"""One event emitted from a streaming model response."""
kind: LLMStreamEventKind
"""Event kind, either incremental delta or stream completion."""
delta_text: str | None = None
"""Incremental text fragment for ``kind='delta'`` events."""
response: LLMResponse | None = None
"""Final assembled response for ``kind='completed'`` events."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class LLMDelta:
"""Incremental delta emitted by streaming model responses."""
text_delta: str | None = None
"""Incremental text token/segment."""
tool_call_delta: ToolCallDelta | None = None
"""Incremental tool-call payload, when provided by backend."""
usage_delta: Usage | None = None
"""Incremental usage counters emitted mid-stream."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class EmbeddingResult:
"""Embedding response payload returned by a backend."""
vectors: Sequence[Sequence[float]]
"""Embedding vectors in request input order."""
model_id: str | None = None
"""Model identifier used for embedding generation."""
usage: Usage | None = None
"""Usage counters associated with the embedding request."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class BackendCapabilities:
"""Capabilities supported by a backend."""
streaming: bool
"""Whether backend supports incremental streaming responses."""
tool_calling: ToolCallingMode
"""Tool-calling capability mode supported by backend."""
json_mode: JSONMode
"""Structured JSON output mode supported by backend."""
vision: bool
"""Whether backend accepts vision/image inputs."""
max_context_tokens: int | None
"""Maximum context window tokens, if known."""
[docs]
@dataclass(slots=True, frozen=True, kw_only=True)
class BackendStatus:
"""Healthcheck status returned by a backend."""
ok: bool
"""True when backend healthcheck succeeded."""
message: str | None = None
"""Optional human-readable healthcheck summary."""
details: Mapping[str, object] | None = None
"""Optional structured diagnostics from healthcheck."""
checked_at: str | None = None
"""ISO timestamp for when the healthcheck was performed."""
[docs]
class LLMClient(Protocol):
"""Protocol implemented by provider-agnostic LLM clients.
Implementations may support one or both call styles used in this package:
chat-style methods (``chat``/``stream_chat``) and request-object methods
(``generate``/``stream``).
"""
[docs]
def chat(
self,
messages: Sequence[LLMMessage],
*,
model: str,
params: LLMChatParams,
) -> LLMResponse:
"""Generate and return a full chat completion response.
Args:
messages: Ordered chat messages for this completion request.
model: Target model identifier.
params: Request controls such as temperature and max token limits.
Returns:
Normalized completion response payload.
"""
[docs]
def stream_chat(
self,
messages: Sequence[LLMMessage],
*,
model: str,
params: LLMChatParams,
) -> Iterator[LLMStreamEvent]:
"""Generate a streaming chat completion event sequence.
Args:
messages: Ordered chat messages for this completion request.
model: Target model identifier.
params: Request controls such as temperature and max token limits.
Returns:
Iterator of normalized streaming completion events.
"""
[docs]
def generate(self, request: LLMRequest) -> LLMResponse:
"""Generate and return a full response from a request object.
Args:
request: Provider-neutral request payload.
Returns:
Normalized completion response payload.
"""
[docs]
def stream(self, request: LLMRequest) -> Iterator[LLMDelta]:
"""Stream a response from a request object.
Args:
request: Provider-neutral request payload.
Returns:
Iterator of normalized response deltas.
"""
[docs]
def close(self) -> None:
"""Release any client-owned resources.
Implementations that do not own external resources may implement this
as a no-op so callers can use a uniform lifecycle pattern.
"""
def __enter__(self) -> Self:
"""Return this client for use in a ``with`` statement."""
def __exit__(
self,
exc_type: type[BaseException] | None,
exc: BaseException | None,
tb: TracebackType | None,
) -> bool | None:
"""Close the client when exiting a ``with`` block."""
[docs]
def default_model(self) -> str:
"""Return default model identifier for the configured backend.
Returns:
Model identifier used when no model is supplied on a request.
"""
[docs]
def capabilities(self) -> BackendCapabilities:
"""Return declared backend capabilities for this client.
Returns:
Backend capability payload.
"""
[docs]
def config_snapshot(self) -> Mapping[str, object]:
"""Return stable client/backend configuration metadata.
Returns:
Mapping safe for diagnostics and example output.
"""
[docs]
def server_snapshot(self) -> Mapping[str, object] | None:
"""Return managed-server metadata when this client owns a server.
Returns:
Server metadata mapping, or ``None`` when not managed.
"""
[docs]
def describe(self) -> Mapping[str, object]:
"""Return a composed client configuration and capability summary.
Returns:
JSON-serializable runtime description mapping.
"""
[docs]
class LLMProviderAdapter(Protocol):
    """Backend adapter contract consumed by ``LLMClient`` implementations."""

    provider_name: str
    """Stable provider identifier exposed by the adapter."""

    def chat(
        self,
        messages: Sequence[LLMMessage],
        *,
        model: str,
        params: LLMChatParams,
    ) -> LLMResponse:
        """Generate one provider-backed chat response in normalized format.

        Args:
            messages: Ordered chat messages for this completion request.
            model: Target model identifier.
            params: Request controls such as temperature and max token limits.

        Returns:
            Normalized completion response payload.
        """
        ...

    def stream_chat(
        self,
        messages: Sequence[LLMMessage],
        *,
        model: str,
        params: LLMChatParams,
    ) -> Iterator[LLMStreamEvent]:
        """Stream provider-backed chat events in normalized format.

        Args:
            messages: Ordered chat messages for this completion request.
            model: Target model identifier.
            params: Request controls such as temperature and max token limits.

        Returns:
            Iterator of normalized streaming completion events.
        """
        ...
[docs]
class LLMError(Exception):
    """Base exception for provider-independent LLM runtime failures.

    All package-specific failures derive from this class so callers can
    catch the whole taxonomy with a single ``except LLMError``.
    """


class LLMAuthError(LLMError):
    """Authentication or authorization failure raised by provider backends."""


class LLMRateLimitError(LLMError):
    """Provider rate-limit failure indicating callers should throttle or retry."""


class LLMInvalidRequestError(LLMError):
    """Invalid request payload or unsupported provider/backend configuration."""


class LLMProviderError(LLMError):
    """General provider runtime failure not covered by specialized subclasses."""


class LLMBadResponseError(LLMError):
    """Raised when a provider returns an invalid or empty response payload."""


class LLMCapabilityError(LLMError):
    """Raised when a backend cannot satisfy required capabilities."""