Recipe Strategy Comparison Run
Source: examples/recipe_strategy_comparison_run.py
Introduction
Execute a packaged-problem strategy comparison study with deterministic mocks.
Technical Implementation
Build `StrategyComparisonConfig` overrides for bundle selection, run budget, and output path. Create deterministic problem packets and one factory per compared agent strategy. Run the study and write a markdown summary artifact.
1from __future__ import annotations
2
3from pathlib import Path
4
5import design_research_experiments as drex
6
7
def _build_problem_registry(problem_ids: tuple[str, ...]) -> dict[str, drex.ProblemPacket]:
    """Build a deterministic optimization-style problem registry.

    Args:
        problem_ids: Identifiers of the packaged benchmark problems to register.

    Returns:
        Mapping from problem id to a ``drex.ProblemPacket`` in the
        ``"optimization"`` family whose shared evaluator emits a single
        synthetic ``objective_score`` metric row.
    """

    def evaluator(output: dict[str, object]) -> list[dict[str, object]]:
        """Emit one synthetic benchmark metric row derived from output length."""
        text = str(output.get("text", ""))
        # 110.0 scales typical mock-output lengths into a small score band;
        # the evaluator is stateless, so sharing one closure across packets is safe.
        return [{"metric_name": "objective_score", "metric_value": len(text) / 110.0}]

    return {
        problem_id: drex.ProblemPacket(
            problem_id=problem_id,
            family="optimization",
            brief=f"Packaged benchmark brief for {problem_id}",
            evaluator=evaluator,
        )
        for problem_id in problem_ids
    }
25
26
27def _agent_factory(agent_name: str):
28 """Create a deterministic strategy-specific agent callable."""
29
30 def _agent(
31 *,
32 problem_packet: drex.ProblemPacket,
33 run_spec: drex.RunSpec,
34 condition: drex.Condition,
35 ) -> dict[str, object]:
36 """Generate one deterministic mock run result for strategy comparisons."""
37 compared_agent = str(condition.factor_assignments.get("agent_id", agent_name))
38 run_seed = run_spec.seed
39 strategy_bonus = 0.09 if compared_agent == "self-learning-agent" else 0.0
40 baseline_bonus = 0.03 if "baseline" in compared_agent else 0.0
41 family_bonus = 0.02 if problem_packet.problem_id.endswith("medium") else 0.0
42 primary_outcome = round(0.55 + strategy_bonus + baseline_bonus + family_bonus, 4)
43
44 text = (
45 f"{compared_agent} solved {problem_packet.problem_id} "
46 f"with seed={run_seed} condition={condition.condition_id}"
47 )
48
49 return {
50 "output": {"text": text},
51 "metrics": {
52 "primary_outcome": primary_outcome,
53 "input_tokens": 130,
54 "output_tokens": 210,
55 "cost_usd": 0.019,
56 },
57 "events": [
58 {
59 "event_type": "assistant_output",
60 "text": text,
61 "actor_id": compared_agent,
62 }
63 ],
64 "metadata": {"model_name": "example-model"},
65 }
66
67 return _agent
68
69
def main() -> None:
    """Run a packaged-problem strategy comparison study and write a summary artifact."""
    config = drex.StrategyComparisonConfig(
        bundle=drex.optimization_bundle(),
        run_budget=drex.RunBudget(replicates=1, parallelism=1, max_runs=4),
        output_dir=Path("artifacts") / "example-strategy-comparison",
        problem_ids=("optimization-small", "optimization-medium"),
    )
    study = drex.build_strategy_comparison_study(config)

    # The study's first factor enumerates the compared agent strategies.
    strategy_ids = tuple(str(level.value) for level in study.factors[0].levels)
    problem_registry = _build_problem_registry(study.problem_ids)
    # Bind strategy_id as a default argument so each lambda captures its own
    # value instead of the loop variable (late-binding closure pitfall).
    agent_bindings = {
        strategy_id: (lambda _condition, strategy_id=strategy_id: _agent_factory(strategy_id))
        for strategy_id in strategy_ids
    }

    run_results = drex.run_study(
        study,
        agent_bindings=agent_bindings,
        problem_registry=problem_registry,
    )

    summary = drex.render_markdown_summary(study, run_results)
    summary_path = drex.write_markdown_report(
        study.output_dir,
        "strategy_comparison_summary.md",
        summary,
    )

    print(f"Completed {len(run_results)} runs")
    print(f"Summary written to {summary_path}")


if __name__ == "__main__":
    main()
Expected Results
Run Command
PYTHONPATH=src python examples/recipe_strategy_comparison_run.py
The script prints the completed run count and writes
artifacts/example-strategy-comparison/artifacts/strategy_comparison_summary.md.