Recipe Optimization Benchmark Run
Source: examples/recipe_optimization_benchmark_run.py
Introduction
Execute a non-default optimization benchmark recipe with deterministic mocks.
Technical Implementation
Build `OptimizationBenchmarkConfig` overrides for factors and design. Create deterministic problem packets and per-agent factories.
Run the study and write methods/significance/codebook markdown output.
1from __future__ import annotations
2
3from pathlib import Path
4
5import design_research_experiments as drex
6
7
def _build_problem_registry(problem_ids: tuple[str, ...]) -> dict[str, drex.ProblemPacket]:
    """Construct a deterministic registry of optimization problem packets.

    Every packet in the registry shares one evaluator that scores an output
    purely by the length of its ``"text"`` field, so repeated runs produce
    identical metric rows.
    """

    def _score_output(output: dict[str, object]) -> list[dict[str, object]]:
        """Return one deterministic objective-score row for *output*."""
        rendered = str(output.get("text", ""))
        return [{"metric_name": "objective_score", "metric_value": len(rendered) / 120.0}]

    return {
        pid: drex.ProblemPacket(
            problem_id=pid,
            family="optimization",
            brief=f"Optimization benchmark brief for {pid}",
            evaluator=_score_output,
        )
        for pid in problem_ids
    }
25
26
27def _agent_factory(agent_name: str):
28 """Create a deterministic optimization agent callable."""
29
30 def _agent(
31 *,
32 problem_packet: drex.ProblemPacket,
33 run_spec: drex.RunSpec,
34 condition: drex.Condition,
35 ) -> dict[str, object]:
36 """Generate one deterministic mock run result for optimization conditions."""
37 run_seed = run_spec.seed
38 factor_assignments = condition.factor_assignments
39 learning_strategy = str(
40 factor_assignments.get("learning_strategy", "deterministic-baseline")
41 )
42 tuning_regime = str(factor_assignments.get("tuning_regime", "conservative"))
43
44 strategy_bonus = 0.08 if learning_strategy == "self-learning-agent" else 0.0
45 tuning_bonus = 0.04 if tuning_regime == "aggressive" else 0.0
46 agent_bonus = 0.03 if agent_name == "self-learning-agent" else 0.0
47 primary_outcome = round(0.56 + strategy_bonus + tuning_bonus + agent_bonus, 4)
48
49 text = (
50 f"{agent_name} optimized {problem_packet.problem_id} "
51 f"with strategy={learning_strategy} tuning={tuning_regime} seed={run_seed}"
52 )
53
54 return {
55 "output": {"text": text},
56 "metrics": {
57 "primary_outcome": primary_outcome,
58 "input_tokens": 150,
59 "output_tokens": 240,
60 "cost_usd": 0.022,
61 },
62 "events": [
63 {
64 "event_type": "assistant_output",
65 "text": text,
66 "actor_id": agent_name,
67 }
68 ],
69 "metadata": {"model_name": "example-model"},
70 }
71
72 return _agent
73
74
def main() -> None:
    """Run the optimization benchmark study and export a markdown report.

    Builds the study configuration, executes all runs with deterministic
    mock agents, renders the methods/significance/codebook sections, and
    writes them into a single markdown file under the study output dir.
    """
    learning_factor = drex.Factor(
        name="learning_strategy",
        description="Agent learning approach.",
        kind=drex.FactorKind.MANIPULATED,
        levels=(
            drex.Level(name="deterministic", value="deterministic-baseline"),
            drex.Level(name="self_learning", value="self-learning-agent"),
        ),
    )
    tuning_factor = drex.Factor(
        name="tuning_regime",
        description="Hyperparameter regime.",
        kind=drex.FactorKind.MANIPULATED,
        levels=(
            drex.Level(name="conservative", value="conservative"),
            drex.Level(name="aggressive", value="aggressive"),
            drex.Level(name="exploratory", value="exploratory"),
        ),
    )
    config = drex.OptimizationBenchmarkConfig(
        study_id="optimization-benchmark-custom",
        factors=(learning_factor, tuning_factor),
        design_spec={"kind": "randomized_block", "randomize": True},
        run_budget=drex.RunBudget(replicates=1, parallelism=1, max_runs=8),
        output_dir=Path("artifacts") / "example-optimization-benchmark",
        problem_ids=("optimization-small", "optimization-medium"),
        agent_specs=("deterministic-baseline", "self-learning-agent"),
    )
    study = drex.build_optimization_benchmark_study(config)

    # One factory per agent spec; bind the name as a default argument so each
    # lambda captures its own value rather than the loop variable.
    agent_factories = {
        name: (lambda _condition, _name=name: _agent_factory(_name))
        for name in ("deterministic-baseline", "self-learning-agent")
    }

    run_results = drex.run_study(
        study,
        agent_factories=agent_factories,
        problem_registry=_build_problem_registry(study.problem_ids),
    )

    # Restrict the codebook to conditions that actually produced runs.
    observed_conditions = {
        result.run_spec.condition_id
        for result in run_results
        if result.run_spec is not None
    }
    significance = drex.render_significance_brief(
        [
            {
                "test": "mixed_effects",
                "outcome": "primary_outcome",
                "p_value": 0.04,
                "effect_size": 0.37,
            }
        ]
    )
    methods = drex.render_methods_scaffold(study)
    codebook = drex.render_codebook(
        study,
        [
            condition
            for condition in drex.build_design(study)
            if condition.condition_id in observed_conditions
        ],
    )

    report_path = drex.write_markdown_report(
        study.output_dir,
        "optimization_benchmark_report.md",
        "\n\n".join((methods, significance, codebook)),
    )

    print(f"Completed {len(run_results)} runs")
    print(f"Report written to {report_path}")
150
151
# Entry point guard: run the example only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Expected Results
Run Command
PYTHONPATH=src python examples/recipe_optimization_benchmark_run.py
The script prints the completed run count and writes the report to
artifacts/example-optimization-benchmark/artifacts/optimization_benchmark_report.md.