Llama CPP Server Client
Source: examples/clients/llama_cpp_server_client.py
Introduction
Local serving with llama.cpp is a practical path for controllable offline experimentation, OpenAI-style response contracts improve interchangeability, and HELM motivates standardized evaluation conditions. This example validates the llama.cpp server client path with tracing and deterministic output framing.
Technical Implementation
1. Configure a Tracer with JSONL + console output so each run emits machine-readable traces and lifecycle logs.
2. Build the runtime surface (public APIs only) and execute LlamaCppServerLLMClient.generate(...) with a fixed request_id.
3. Construct LLMRequest inputs and call generate through the selected client implementation.
4. Print a compact JSON payload including trace_info for deterministic tests and docs examples.
flowchart LR
A["Input prompt or scenario"] --> B["main(): runtime wiring"]
B --> C["LlamaCppServerLLMClient.generate(...)"]
C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
C --> E["Tracer JSONL + console events"]
D --> F["ExecutionResult/payload"]
E --> F
F --> G["Printed JSON output"]
1from __future__ import annotations
2
3import json
4import sys
5from pathlib import Path
6
7from design_research_agents import Tracer
8from design_research_agents.llm import LLMMessage, LLMRequest
9from design_research_agents.llm.clients import LlamaCppServerLLMClient
10
11
def _build_payload() -> dict[str, object]:
    """Exercise the managed llama.cpp client and return a JSON-friendly summary.

    The ``with`` statement owns the local server lifecycle: the managed
    llama.cpp server is started on entry and shut down automatically when
    the block exits, even on error.
    """
    with LlamaCppServerLLMClient(
        name="llama-local-dev",
        model="Qwen2.5-1.5B-Instruct-Q4_K_M.gguf",
        hf_model_repo_id="bartowski/Qwen2.5-1.5B-Instruct-GGUF",
        api_model="qwen2.5-1.5b-q4",
        host="127.0.0.1",
        port=8011,
        context_window=8192,
        startup_timeout_seconds=90.0,
        poll_interval_seconds=0.5,
        python_executable=sys.executable,
        extra_server_args=("--n_threads", "4", "--flash_attn", "1"),
        max_retries=3,
        model_patterns=("qwen2.5-*", "qwen2-*"),
    ) as client:
        info = client.describe()
        prompt = "In one sentence, explain a key tradeoff in engineering design reviews."
        # Build the request first, then issue a single generate() call.
        request = LLMRequest(
            messages=(
                LLMMessage(role="system", content="You are a concise engineering design assistant."),
                LLMMessage(role="user", content=prompt),
            ),
            model=client.default_model(),
            temperature=0.0,
            max_tokens=120,
        )
        result = client.generate(request)
        # Assemble the payload inline; keys mirror the documented example output.
        return {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": {
                "prompt": prompt,
                "response_text": result.text,
                "response_model": result.model,
                "response_provider": result.provider,
                "response_has_text": bool(result.text.strip()),
            },
            "backend": info["backend"],
            "capabilities": info["capabilities"],
            "server": info["server"],
        }
58
59
def main() -> None:
    """Run a traced llama.cpp client call and print the resulting JSON payload.

    Wires a Tracer with JSONL + console sinks, executes ``_build_payload``
    under a fixed request id (so trace files and docs output stay
    deterministic across runs), enriches the payload with example/trace
    metadata, and prints it as sorted, indented JSON.

    Raises:
        TypeError: If the traced callable does not return a dict payload.
    """
    # Fixed request id keeps traces and docs output deterministic across runs.
    request_id = "example-clients-llama-cpp-call-001"
    tracer = Tracer(
        enabled=True,
        trace_dir=Path("artifacts/examples/traces"),
        enable_jsonl=True,
        enable_console=True,
    )
    payload = tracer.run_callable(
        agent_name="ExamplesLlamaCppClientCall",
        request_id=request_id,
        input_payload={"scenario": "llama-cpp-client-call"},
        function=_build_payload,
    )
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let a malformed payload slip through silently.
    if not isinstance(payload, dict):
        raise TypeError(f"expected dict payload, got {type(payload).__name__}")
    payload["example"] = "clients/llama_cpp_server_client.py"
    payload["trace"] = tracer.trace_info(request_id)
    # Print the results as stable, machine-readable JSON (sorted keys).
    print(json.dumps(payload, ensure_ascii=True, indent=2, sort_keys=True))
81
82
# Entry-point guard: run the example only when executed as a script, so the
# module stays importable (e.g. by docs tooling or tests) without side effects.
if __name__ == "__main__":
    main()
Expected Results
Run Command
PYTHONPATH=src python3 examples/clients/llama_cpp_server_client.py
Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic
(timestamps, durations, and trace filenames vary by run):
{
"backend": {
"api_model": "qwen2.5-1.5b-q4",
"base_url": null,
"default_model": "qwen2.5-1.5b-q4",
"host": "127.0.0.1",
"kind": "llama_cpp_server",
"max_retries": 3,
"model_patterns": [
"qwen2.5-*",
"qwen2-*"
],
"name": "llama-local-dev",
"port": 8011
},
"capabilities": {
"json_mode": "prompt+validate",
"max_context_tokens": null,
"streaming": false,
"tool_calling": "best_effort",
"vision": false
},
"client_class": "LlamaCppServerLLMClient",
"default_model": "qwen2.5-1.5b-q4",
"example": "clients/llama_cpp_server_client.py",
"llm_call": {
"prompt": "In one sentence, explain a key tradeoff in engineering design reviews.",
"response_has_text": true,
"response_model": "qwen2.5-1.5b-q4",
"response_provider": "example-test-monkeypatch",
"response_text": "Tradeoff: strict review gates improve reliability but can slow delivery speed."
},
"server": {
"host": "127.0.0.1",
"kind": "llama_cpp_server",
"managed": true,
"port": 8011
},
"trace": {
"request_id": "example-clients-llama-cpp-call-001",
"trace_dir": "artifacts/examples/traces",
"trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-llama-cpp-call-001.jsonl"
}
}