SGLang Server Client
Source: examples/clients/sglang_server_client.py
Introduction
SGLang focuses on high-throughput serving and exposes OpenAI-compatible APIs, making it useful for controlled backend substitution against common response contracts and HELM-style evaluation framing. This example wires the SGLang server client into the same traced run surface used by other providers.
Technical Implementation
1. Configure `Tracer` with JSONL + console output so each run emits machine-readable traces and lifecycle logs.
2. Build the runtime surface (public APIs only) and execute `SGLangServerLLMClient.generate(...)` with a fixed `request_id`.
3. Construct `LLMRequest` inputs and call `generate` through the selected client implementation.
4. Print a compact JSON payload including `trace_info` for deterministic tests and docs examples.
flowchart LR
A["Input prompt or scenario"] --> B["main(): runtime wiring"]
B --> C["SGLangServerLLMClient.generate(...)"]
C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
C --> E["Tracer JSONL + console events"]
D --> F["ExecutionResult/payload"]
E --> F
F --> G["Printed JSON output"]
1from __future__ import annotations
2
3import json
4import sys
5from pathlib import Path
6
7from design_research_agents import SGLangServerLLMClient, Tracer
8from design_research_agents.llm import LLMMessage, LLMRequest
9
10
def _build_payload() -> dict[str, object]:
    """Issue one request through a managed SGLang client and summarize it.

    Returns:
        A dict with the client's ``describe()`` fields (``client_class``,
        ``default_model``, ``backend``, ``capabilities``, ``server``) and an
        ``llm_call`` entry recording the prompt and the response details.
    """
    user_prompt = "Provide one sentence on when SGLang-style serving helps local benchmarking."
    server_options = {
        "name": "sglang-local-dev",
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "host": "127.0.0.1",
        "port": 30000,
        "manage_server": True,
        "startup_timeout_seconds": 90.0,
        "poll_interval_seconds": 0.5,
        "python_executable": sys.executable,
        "extra_server_args": ("--tp-size", "1"),
        "request_timeout_seconds": 60.0,
        "max_retries": 3,
        "model_patterns": ("Qwen/*", "qwen2.5-*"),
    }

    # Only public runtime APIs are used here. The ``with`` statement shuts the
    # managed local server down automatically once the example is done.
    with SGLangServerLLMClient(**server_options) as client:
        info = client.describe()

        conversation = (
            LLMMessage(role="system", content="You are a concise engineering design assistant."),
            LLMMessage(role="user", content=user_prompt),
        )
        request = LLMRequest(
            messages=conversation,
            model=client.default_model(),
            temperature=0.0,
            max_tokens=120,
        )
        reply = client.generate(request)

        call_summary = {
            "prompt": user_prompt,
            "response_text": reply.text,
            "response_model": reply.model,
            "response_provider": reply.provider,
            "response_has_text": bool(reply.text.strip()),
        }

        # Keys are inserted in the same order as the original literal.
        summary: dict[str, object] = {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": call_summary,
        }
        for key in ("backend", "capabilities", "server"):
            summary[key] = info[key]
        return summary
56
57
def main() -> None:
    """Execute the SGLang client example under a tracer and print the result as JSON."""
    # A constant request id keeps traces and docs output deterministic across runs.
    request_id = "example-clients-sglang-server-call-001"
    trace_dir = Path("artifacts/examples/traces")

    tracer = Tracer(
        enabled=True,
        trace_dir=trace_dir,
        enable_jsonl=True,
        enable_console=True,
    )

    result = tracer.run_callable(
        agent_name="ExamplesSglangClientCall",
        request_id=request_id,
        input_payload={"scenario": "sglang-server-client-call"},
        function=_build_payload,
    )
    # Narrow the type before mutating the payload.
    assert isinstance(result, dict)

    # Attach the example identifier and the trace metadata for this request id.
    result["example"] = "clients/sglang_server_client.py"
    result["trace"] = tracer.trace_info(request_id)

    # Emit the final payload with sorted keys and ASCII-only escaping.
    rendered = json.dumps(result, ensure_ascii=True, indent=2, sort_keys=True)
    print(rendered)
79
80
# Script entry point: run the example only when executed directly, not on import.
if __name__ == "__main__":
    main()
Expected Results
Run Command
PYTHONPATH=src python3 examples/clients/sglang_server_client.py
Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic
(timestamps, durations, and trace filenames vary by run):
{
"backend": {
"base_url": "http://127.0.0.1:30000/v1",
"default_model": "Qwen/Qwen2.5-1.5B-Instruct",
"host": "127.0.0.1",
"kind": "sglang_server",
"max_retries": 3,
"model_patterns": [
"Qwen/*",
"qwen2.5-*"
],
"name": "sglang-local-dev",
"port": 30000
},
"capabilities": {
"json_mode": "prompt+validate",
"max_context_tokens": null,
"streaming": false,
"tool_calling": "best_effort",
"vision": false
},
"client_class": "SGLangServerLLMClient",
"default_model": "Qwen/Qwen2.5-1.5B-Instruct",
"example": "clients/sglang_server_client.py",
"llm_call": {
"prompt": "Provide one sentence on when SGLang-style serving helps local benchmarking.",
"response_has_text": true,
"response_model": "Qwen/Qwen2.5-1.5B-Instruct",
"response_provider": "example-test-monkeypatch",
"response_text": "SGLang-style serving helps when you need stable local throughput for repeated tests."
},
"server": {
"host": "127.0.0.1",
"kind": "sglang_server",
"managed": true,
"port": 30000
},
"trace": {
"request_id": "example-clients-sglang-server-call-001",
"trace_dir": "artifacts/examples/traces",
"trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-sglang-server-call-001.jsonl"
}
}