OpenAI Compatible HTTP Client

Source: examples/clients/openai_compatible_http_client.py

Introduction

OpenAI-compatible HTTP surfaces are valuable because they let one orchestration stack target multiple providers; vLLM and SGLang both expose this style of interface, while the OpenAI Responses API defines the baseline semantics. This example demonstrates that compatibility layer in the framework client runtime.

Technical Implementation

  1. Configure Tracer with JSONL + console output so each run emits machine-readable traces and lifecycle logs.

  2. Build the runtime surface (public APIs only) and execute OpenAICompatibleHTTPLLMClient.generate(...) with a fixed request_id.

  3. Construct LLMRequest inputs and call generate through the selected client implementation.

  4. Print a compact JSON payload including trace_info for deterministic tests and docs examples.

    flowchart LR
        A["Input prompt or scenario"] --> B["main(): runtime wiring"]
        B --> C["OpenAICompatibleHTTPLLMClient.generate(...)"]
        C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
        C --> E["Tracer JSONL + console events"]
        D --> F["ExecutionResult/payload"]
        E --> F
        F --> G["Printed JSON output"]
    
 1from __future__ import annotations
 2
 3import json
 4from pathlib import Path
 5
 6from design_research_agents import OpenAICompatibleHTTPLLMClient, Tracer
 7from design_research_agents.llm import LLMMessage, LLMRequest
 8
 9
def _build_payload() -> dict[str, object]:
    """Run one generate() call against the OpenAI-compatible client.

    Returns a JSON-serializable payload combining the client's self-description
    with a summary of the LLM call. The ``with`` statement guarantees the
    configured HTTP client is closed when the example finishes.
    """
    with OpenAICompatibleHTTPLLMClient(
        name="local-openai-compat",
        base_url="http://127.0.0.1:8011/v1",
        default_model="qwen2.5-1.5b-q4",
        api_key_env="OPENAI_API_KEY",
        api_key="example-key-for-config-demo",
        max_retries=3,
        model_patterns=("qwen2.5-*", "qwen2-*"),
    ) as client:
        info = client.describe()
        question = "Provide one sentence on balancing latency and quality in design review assistants."
        # Build the request explicitly before dispatching it to the client.
        conversation = (
            LLMMessage(role="system", content="You are a concise engineering design assistant."),
            LLMMessage(role="user", content=question),
        )
        request = LLMRequest(
            messages=conversation,
            model=client.default_model(),
            temperature=0.0,
            max_tokens=120,
        )
        reply = client.generate(request)
        call_summary = {
            "prompt": question,
            "response_text": reply.text,
            "response_model": reply.model,
            "response_provider": reply.provider,
            "response_has_text": bool(reply.text.strip()),
        }
        return {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": call_summary,
            "backend": info["backend"],
            "capabilities": info["capabilities"],
            "server": info["server"],
        }
50
51
def main() -> None:
    """Trace the example client call and print its payload as JSON."""
    # A fixed request id keeps trace filenames and documented output
    # deterministic across runs.
    run_id = "example-clients-openai-compatible-call-001"
    tracer = Tracer(
        enabled=True,
        trace_dir=Path("artifacts/examples/traces"),
        enable_jsonl=True,
        enable_console=True,
    )
    result = tracer.run_callable(
        agent_name="ExamplesOpenAICompatClientCall",
        request_id=run_id,
        input_payload={"scenario": "openai-compatible-client-call"},
        function=_build_payload,
    )
    assert isinstance(result, dict)
    result["example"] = "clients/openai_compatible_http_client.py"
    result["trace"] = tracer.trace_info(run_id)
    # Emit the payload with stable key order for docs and tests.
    print(json.dumps(result, ensure_ascii=True, indent=2, sort_keys=True))
73
74
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()

Expected Results

Run Command

PYTHONPATH=src python3 examples/clients/openai_compatible_http_client.py

Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic (timestamps, durations, and trace filenames vary by run):

{
  "backend": {
    "api_key_env": "OPENAI_API_KEY",
    "base_url": "http://127.0.0.1:8011/v1",
    "default_model": "qwen2.5-1.5b-q4",
    "kind": "openai_compatible_http",
    "max_retries": 3,
    "model_patterns": [
      "qwen2.5-*",
      "qwen2-*"
    ],
    "name": "local-openai-compat"
  },
  "capabilities": {
    "json_mode": "prompt+validate",
    "max_context_tokens": null,
    "streaming": false,
    "tool_calling": "best_effort",
    "vision": false
  },
  "client_class": "OpenAICompatibleHTTPLLMClient",
  "default_model": "qwen2.5-1.5b-q4",
  "example": "clients/openai_compatible_http_client.py",
  "llm_call": {
    "prompt": "Provide one sentence on balancing latency and quality in design review assistants.",
    "response_has_text": true,
    "response_model": "qwen2.5-1.5b-q4",
    "response_provider": "example-test-monkeypatch",
    "response_text": "Use fast drafts for iteration, then escalate critical decisions to higher-quality models."
  },
  "server": null,
  "trace": {
    "request_id": "example-clients-openai-compatible-call-001",
    "trace_dir": "artifacts/examples/traces",
    "trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-openai-compatible-call-001.jsonl"
  }
}

References