Llama CPP Server Client

Source: examples/clients/llama_cpp_server_client.py

Introduction

Serving models locally with llama.cpp is a practical path to controllable offline experimentation; OpenAI-style response contracts improve interchangeability between providers; and HELM motivates standardized evaluation conditions. This example validates the llama.cpp server client path with tracing and deterministic output framing.

Technical Implementation

  1. Configure Tracer with JSONL + console output so each run emits machine-readable traces and lifecycle logs.

  2. Build the runtime surface (public APIs only) and execute LlamaCppServerLLMClient.generate(...) with a fixed request_id.

  3. Construct LLMRequest inputs and call generate through the selected client implementation.

  4. Print a compact JSON payload including trace_info for deterministic tests and docs examples.

```mermaid
flowchart LR
    A["Input prompt or scenario"] --> B["main(): runtime wiring"]
    B --> C["LlamaCppServerLLMClient.generate(...)"]
    C --> D["LLMRequest/LLMResponse contracts wrap provider behavior"]
    C --> E["Tracer JSONL + console events"]
    D --> F["ExecutionResult/payload"]
    E --> F
    F --> G["Printed JSON output"]
```
from __future__ import annotations

import json
import sys
from pathlib import Path

from design_research_agents import Tracer
from design_research_agents.llm import LLMMessage, LLMRequest
from design_research_agents.llm.clients import LlamaCppServerLLMClient
11
def _build_payload() -> dict[str, object]:
    """Exercise the managed llama.cpp client through public runtime APIs.

    Entering the client as a context manager starts the locally managed
    llama.cpp server and guarantees it is shut down when this function
    returns, even on error.

    Returns:
        A JSON-serializable dict combining the client description with the
        prompt/response details of a single ``generate`` call.
    """
    question = "In one sentence, explain a key tradeoff in engineering design reviews."
    client = LlamaCppServerLLMClient(
        name="llama-local-dev",
        model="Qwen2.5-1.5B-Instruct-Q4_K_M.gguf",
        hf_model_repo_id="bartowski/Qwen2.5-1.5B-Instruct-GGUF",
        api_model="qwen2.5-1.5b-q4",
        host="127.0.0.1",
        port=8011,
        context_window=8192,
        startup_timeout_seconds=90.0,
        poll_interval_seconds=0.5,
        python_executable=sys.executable,
        extra_server_args=("--n_threads", "4", "--flash_attn", "1"),
        max_retries=3,
        model_patterns=("qwen2.5-*", "qwen2-*"),
    )
    with client:
        info = client.describe()
        request = LLMRequest(
            messages=(
                LLMMessage(role="system", content="You are a concise engineering design assistant."),
                LLMMessage(role="user", content=question),
            ),
            model=client.default_model(),
            temperature=0.0,
            max_tokens=120,
        )
        response = client.generate(request)
        return {
            "client_class": info["client_class"],
            "default_model": info["default_model"],
            "llm_call": {
                "prompt": question,
                "response_text": response.text,
                "response_model": response.model,
                "response_provider": response.provider,
                # Flags whether the model produced any non-whitespace text.
                "response_has_text": bool(response.text.strip()),
            },
            "backend": info["backend"],
            "capabilities": info["capabilities"],
            "server": info["server"],
        }
59
def main() -> None:
    """Run a traced llama.cpp client call and print the resulting JSON payload.

    A fixed request id keeps the trace files and the documented example
    output deterministic across runs.
    """
    request_id = "example-clients-llama-cpp-call-001"
    tracer = Tracer(
        enabled=True,
        trace_dir=Path("artifacts/examples/traces"),
        enable_jsonl=True,
        enable_console=True,
    )
    payload = tracer.run_callable(
        agent_name="ExamplesLlamaCppClientCall",
        request_id=request_id,
        input_payload={"scenario": "llama-cpp-client-call"},
        function=_build_payload,
    )
    # Explicit runtime check instead of `assert`, which is silently stripped
    # under `python -O` and would let a non-dict result slip through.
    if not isinstance(payload, dict):
        raise TypeError(f"expected dict payload, got {type(payload).__name__}")
    payload["example"] = "clients/llama_cpp_server_client.py"
    payload["trace"] = tracer.trace_info(request_id)
    # ASCII-only, sorted-key JSON keeps the printed output stable for docs/tests.
    print(json.dumps(payload, ensure_ascii=True, indent=2, sort_keys=True))
82
# Script entry point: run the traced example only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()

Expected Results

Run Command

PYTHONPATH=src python3 examples/clients/llama_cpp_server_client.py

Example output captured with DRA_EXAMPLE_LLM_MODE=deterministic (timestamps, durations, and trace filenames vary by run):

{
  "backend": {
    "api_model": "qwen2.5-1.5b-q4",
    "base_url": null,
    "default_model": "qwen2.5-1.5b-q4",
    "host": "127.0.0.1",
    "kind": "llama_cpp_server",
    "max_retries": 3,
    "model_patterns": [
      "qwen2.5-*",
      "qwen2-*"
    ],
    "name": "llama-local-dev",
    "port": 8011
  },
  "capabilities": {
    "json_mode": "prompt+validate",
    "max_context_tokens": null,
    "streaming": false,
    "tool_calling": "best_effort",
    "vision": false
  },
  "client_class": "LlamaCppServerLLMClient",
  "default_model": "qwen2.5-1.5b-q4",
  "example": "clients/llama_cpp_server_client.py",
  "llm_call": {
    "prompt": "In one sentence, explain a key tradeoff in engineering design reviews.",
    "response_has_text": true,
    "response_model": "qwen2.5-1.5b-q4",
    "response_provider": "example-test-monkeypatch",
    "response_text": "Tradeoff: strict review gates improve reliability but can slow delivery speed."
  },
  "server": {
    "host": "127.0.0.1",
    "kind": "llama_cpp_server",
    "managed": true,
    "port": 8011
  },
  "trace": {
    "request_id": "example-clients-llama-cpp-call-001",
    "trace_dir": "artifacts/examples/traces",
    "trace_path": "artifacts/examples/traces/run_20260222T162206Z_example-clients-llama-cpp-call-001.jsonl"
  }
}

References