SGLang Server Client

Source: examples/clients/sglang_server_client.py

Introduction

SGLang focuses on high-throughput serving and exposes OpenAI-compatible APIs, which makes it a convenient drop-in backend behind common response contracts and HELM-style evaluations. This example wires the SGLang server client into the same traced run surface used by the other provider examples.

Technical Implementation

  1. Configure Tracer with JSONL + console output so each run emits machine-readable traces and lifecycle logs.

  2. Build the runtime surface (public APIs only) and run the payload builder through tracer.run_callable(...) with a fixed request_id; the builder invokes SGLangServerLLMClient.generate(...).

  3. Construct LLMRequest inputs and call generate through the selected client implementation.

  4. Print a compact JSON payload including trace_info for deterministic tests and docs examples.

    flowchart LR
        A["Input prompt or scenario"] --> B["main(): runtime wiring"]
        B --> C["SGLangServerLLMClient.generate(...)"]
        C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
        C --> E["Tracer JSONL + console events"]
        D --> F["ExecutionResult/payload"]
        E --> F
        F --> G["Printed JSON output"]
 1from __future__ import annotations
 2
 3import json
 4import sys
 5from pathlib import Path
 6
 7from design_research_agents import SGLangServerLLMClient, Tracer
 8from design_research_agents.llm import LLMMessage, LLMRequest
 9
10
def _build_payload() -> dict[str, object]:
    """Run one traced generate() call against a managed local SGLang server.

    Returns a JSON-serializable summary combining the client description
    with the prompt/response pair. The ``with`` statement shuts the locally
    managed server process down automatically when the example finishes,
    even if an error is raised.
    """
    with SGLangServerLLMClient(
        name="sglang-local-dev",
        model="Qwen/Qwen2.5-1.5B-Instruct",
        host="127.0.0.1",
        port=30000,
        manage_server=True,  # this process spawns and owns the server
        startup_timeout_seconds=90.0,
        poll_interval_seconds=0.5,
        python_executable=sys.executable,
        extra_server_args=("--tp-size", "1"),
        request_timeout_seconds=60.0,
        max_retries=3,
        model_patterns=("Qwen/*", "qwen2.5-*"),
    ) as client:
        info = client.describe()
        question = "Provide one sentence on when SGLang-style serving helps local benchmarking."
        request = LLMRequest(
            messages=(
                LLMMessage(role="system", content="You are a concise engineering design assistant."),
                LLMMessage(role="user", content=question),
            ),
            model=client.default_model(),
            temperature=0.0,  # greedy decoding keeps docs output stable
            max_tokens=120,
        )
        reply = client.generate(request)
        return {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": {
                "prompt": question,
                "response_text": reply.text,
                "response_model": reply.model,
                "response_provider": reply.provider,
                "response_has_text": bool(reply.text.strip()),
            },
            "backend": info["backend"],
            "capabilities": info["capabilities"],
            "server": info["server"],
        }
56
57
def main() -> None:
    """Run the traced SGLang client call and print the resulting payload.

    Wires a JSONL + console ``Tracer``, executes ``_build_payload`` under a
    fixed request id so traces and docs output stay deterministic across
    runs, then prints the enriched payload as sorted, ASCII-safe JSON.

    Raises:
        TypeError: if the traced callable does not return a dict payload.
    """
    # Fixed request id keeps traces and docs output deterministic across runs.
    request_id = "example-clients-sglang-server-call-001"
    tracer = Tracer(
        enabled=True,
        trace_dir=Path("artifacts/examples/traces"),
        enable_jsonl=True,
        enable_console=True,
    )
    payload = tracer.run_callable(
        agent_name="ExamplesSglangClientCall",
        request_id=request_id,
        input_payload={"scenario": "sglang-server-client-call"},
        function=_build_payload,
    )
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would silently skip this validation.
    if not isinstance(payload, dict):
        raise TypeError(f"expected dict payload, got {type(payload).__name__}")
    payload["example"] = "clients/sglang_server_client.py"
    payload["trace"] = tracer.trace_info(request_id)
    # Sorted keys + ASCII output keep the printed JSON stable for docs/tests.
    print(json.dumps(payload, ensure_ascii=True, indent=2, sort_keys=True))
79
80
# Script entry point: run the example only when executed directly.
if __name__ == "__main__":
    main()

Expected Results

Run Command

PYTHONPATH=src python3 examples/clients/sglang_server_client.py

Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic (timestamps, durations, and trace filenames vary by run):

{
  "backend": {
    "base_url": "http://127.0.0.1:30000/v1",
    "default_model": "Qwen/Qwen2.5-1.5B-Instruct",
    "host": "127.0.0.1",
    "kind": "sglang_server",
    "max_retries": 3,
    "model_patterns": [
      "Qwen/*",
      "qwen2.5-*"
    ],
    "name": "sglang-local-dev",
    "port": 30000
  },
  "capabilities": {
    "json_mode": "prompt+validate",
    "max_context_tokens": null,
    "streaming": false,
    "tool_calling": "best_effort",
    "vision": false
  },
  "client_class": "SGLangServerLLMClient",
  "default_model": "Qwen/Qwen2.5-1.5B-Instruct",
  "example": "clients/sglang_server_client.py",
  "llm_call": {
    "prompt": "Provide one sentence on when SGLang-style serving helps local benchmarking.",
    "response_has_text": true,
    "response_model": "Qwen/Qwen2.5-1.5B-Instruct",
    "response_provider": "example-test-monkeypatch",
    "response_text": "SGLang-style serving helps when you need stable local throughput for repeated tests."
  },
  "server": {
    "host": "127.0.0.1",
    "kind": "sglang_server",
    "managed": true,
    "port": 30000
  },
  "trace": {
    "request_id": "example-clients-sglang-server-call-001",
    "trace_dir": "artifacts/examples/traces",
    "trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-sglang-server-call-001.jsonl"
  }
}

References