vLLM Server Client

Source: examples/clients/vllm_server_client.py

Introduction

vLLM is a common high-performance inference server. OpenAI-compatible response contracts enable drop-in reuse of orchestration code, and HELM provides context for why consistent serving interfaces help evaluation. This example exercises the vLLM server client integration with explicit trace reporting.

Technical Implementation

  1. Configure Tracer with JSONL + console output so each run emits machine-readable traces and lifecycle logs.

  2. Build the runtime surface (public APIs only) and run VLLMServerLLMClient.generate(...) inside Tracer.run_callable(...) with a fixed request_id.

  3. Construct LLMRequest inputs and call generate through the selected client implementation.

  4. Print a compact JSON payload including trace_info for deterministic tests and docs examples.

        flowchart LR
    A["Input prompt or scenario"] --> B["main(): runtime wiring"]
    B --> C["VLLMServerLLMClient.generate(...)"]
    C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
    C --> E["Tracer JSONL + console events"]
    D --> F["ExecutionResult/payload"]
    E --> F
    F --> G["Printed JSON output"]
    
 1from __future__ import annotations
 2
 3import json
 4import sys
 5from pathlib import Path
 6
 7from design_research_agents import Tracer, VLLMServerLLMClient
 8from design_research_agents.llm import LLMMessage, LLMRequest
 9
10
def _build_payload() -> dict[str, object]:
    """Issue one request through a managed vLLM client and summarize the exchange.

    Returns:
        A dict with the client description fields (``client_class``,
        ``default_model``, ``backend``, ``capabilities``, ``server``) and an
        ``llm_call`` entry holding the prompt and the response details.
    """
    user_prompt = "Provide one sentence on why local serving helps reproducible benchmarking."

    # Only public runtime APIs are used here. The client is entered as a context
    # manager below so the managed local server is shut down automatically once
    # the example is done.
    server_client = VLLMServerLLMClient(
        name="vllm-local-dev",
        model="Qwen/Qwen2.5-1.5B-Instruct",
        api_model="qwen2.5-1.5b-instruct",
        host="127.0.0.1",
        port=8002,
        manage_server=True,
        startup_timeout_seconds=90.0,
        poll_interval_seconds=0.5,
        python_executable=sys.executable,
        extra_server_args=("--dtype", "auto"),
        request_timeout_seconds=60.0,
        max_retries=3,
        model_patterns=("qwen2.5-*",),
    )
    with server_client as client:
        info = client.describe()

        conversation = (
            LLMMessage(role="system", content="You are a concise engineering design assistant."),
            LLMMessage(role="user", content=user_prompt),
        )
        request = LLMRequest(
            messages=conversation,
            model=client.default_model(),
            temperature=0.0,
            max_tokens=120,
        )
        reply = client.generate(request)

        # Summarize the single LLM exchange before assembling the full payload.
        call_summary = {
            "prompt": user_prompt,
            "response_text": reply.text,
            "response_model": reply.model,
            "response_provider": reply.provider,
            "response_has_text": bool(reply.text.strip()),
        }
        result: dict[str, object] = {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": call_summary,
            "backend": info["backend"],
            "capabilities": info["capabilities"],
            "server": info["server"],
        }
        return result
57
58
def main() -> None:
    """Run the vLLM client example under a tracer and print the result as JSON.

    The payload produced by ``_build_payload`` is extended with the example
    name and the tracer's trace info for the request, then printed as
    key-sorted, indented JSON.

    Raises:
        TypeError: If the traced callable does not return a dict.
    """
    # Fixed request id keeps traces and docs output deterministic across runs.
    request_id = "example-clients-vllm-server-call-001"
    tracer = Tracer(
        enabled=True,
        trace_dir=Path("artifacts/examples/traces"),
        enable_jsonl=True,
        enable_console=True,
    )
    payload = tracer.run_callable(
        agent_name="ExamplesVllmClientCall",
        request_id=request_id,
        input_payload={"scenario": "vllm-server-client-call"},
        function=_build_payload,
    )
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    if not isinstance(payload, dict):
        raise TypeError(
            f"expected a dict payload from the traced callable, got {type(payload).__name__}"
        )
    payload["example"] = "clients/vllm_server_client.py"
    payload["trace"] = tracer.trace_info(request_id)
    # Print the results
    print(json.dumps(payload, ensure_ascii=True, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()

Expected Results

Run Command

PYTHONPATH=src python3 examples/clients/vllm_server_client.py

Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic (timestamps, durations, and trace filenames vary by run):

{
  "backend": {
    "base_url": "http://127.0.0.1:8002/v1",
    "default_model": "qwen2.5-1.5b-instruct",
    "host": "127.0.0.1",
    "kind": "vllm_server",
    "max_retries": 3,
    "model_patterns": [
      "qwen2.5-*"
    ],
    "name": "vllm-local-dev",
    "port": 8002
  },
  "capabilities": {
    "json_mode": "prompt+validate",
    "max_context_tokens": null,
    "streaming": false,
    "tool_calling": "best_effort",
    "vision": false
  },
  "client_class": "VLLMServerLLMClient",
  "default_model": "qwen2.5-1.5b-instruct",
  "example": "clients/vllm_server_client.py",
  "llm_call": {
    "prompt": "Provide one sentence on why local serving helps reproducible benchmarking.",
    "response_has_text": true,
    "response_model": "qwen2.5-1.5b-instruct",
    "response_provider": "example-test-monkeypatch",
    "response_text": "Local serving reduces backend drift and improves benchmark reproducibility."
  },
  "server": {
    "host": "127.0.0.1",
    "kind": "vllm_server",
    "managed": true,
    "port": 8002
  },
  "trace": {
    "request_id": "example-clients-vllm-server-call-001",
    "trace_dir": "artifacts/examples/traces",
    "trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-vllm-server-call-001.jsonl"
  }
}

References