Recipe Optimization Benchmark Run

Source: examples/recipe_optimization_benchmark_run.py

Introduction

Execute a customized (non-default) optimization benchmark recipe using deterministic mock agents, so results are fully reproducible.

Technical Implementation

  1. Build OptimizationBenchmarkConfig overrides for factors and design.

  2. Create deterministic problem packets and per-agent factories.

  3. Run the study and write methods/significance/codebook markdown output.

  1from __future__ import annotations
  2
  3from pathlib import Path
  4
  5import design_research_experiments as drex
  6
  7
  8def _build_problem_registry(problem_ids: tuple[str, ...]) -> dict[str, drex.ProblemPacket]:
  9    """Build a deterministic optimization-oriented problem registry."""
 10
 11    def evaluator(output: dict[str, object]) -> list[dict[str, object]]:
 12        """Emit one deterministic optimization score row."""
 13        text = str(output.get("text", ""))
 14        return [{"metric_name": "objective_score", "metric_value": len(text) / 120.0}]
 15
 16    registry: dict[str, drex.ProblemPacket] = {}
 17    for problem_id in problem_ids:
 18        registry[problem_id] = drex.ProblemPacket(
 19            problem_id=problem_id,
 20            family="optimization",
 21            brief=f"Optimization benchmark brief for {problem_id}",
 22            evaluator=evaluator,
 23        )
 24    return registry
 25
 26
 27def _agent_factory(agent_name: str):
 28    """Create a deterministic optimization agent callable."""
 29
 30    def _agent(
 31        *,
 32        problem_packet: drex.ProblemPacket,
 33        run_spec: drex.RunSpec,
 34        condition: drex.Condition,
 35    ) -> dict[str, object]:
 36        """Generate one deterministic mock run result for optimization conditions."""
 37        run_seed = run_spec.seed
 38        factor_assignments = condition.factor_assignments
 39        learning_strategy = str(
 40            factor_assignments.get("learning_strategy", "deterministic-baseline")
 41        )
 42        tuning_regime = str(factor_assignments.get("tuning_regime", "conservative"))
 43
 44        strategy_bonus = 0.08 if learning_strategy == "self-learning-agent" else 0.0
 45        tuning_bonus = 0.04 if tuning_regime == "aggressive" else 0.0
 46        agent_bonus = 0.03 if agent_name == "self-learning-agent" else 0.0
 47        primary_outcome = round(0.56 + strategy_bonus + tuning_bonus + agent_bonus, 4)
 48
 49        text = (
 50            f"{agent_name} optimized {problem_packet.problem_id} "
 51            f"with strategy={learning_strategy} tuning={tuning_regime} seed={run_seed}"
 52        )
 53
 54        return {
 55            "output": {"text": text},
 56            "metrics": {
 57                "primary_outcome": primary_outcome,
 58                "input_tokens": 150,
 59                "output_tokens": 240,
 60                "cost_usd": 0.022,
 61            },
 62            "events": [
 63                {
 64                    "event_type": "assistant_output",
 65                    "text": text,
 66                    "actor_id": agent_name,
 67                }
 68            ],
 69            "metadata": {"model_name": "example-model"},
 70        }
 71
 72    return _agent
 73
 74
 75def main() -> None:
 76    """Run an optimization benchmark study and export markdown artifacts."""
 77    config = drex.OptimizationBenchmarkConfig(
 78        study_id="optimization-benchmark-custom",
 79        factors=(
 80            drex.Factor(
 81                name="learning_strategy",
 82                description="Agent learning approach.",
 83                kind=drex.FactorKind.MANIPULATED,
 84                levels=(
 85                    drex.Level(name="deterministic", value="deterministic-baseline"),
 86                    drex.Level(name="self_learning", value="self-learning-agent"),
 87                ),
 88            ),
 89            drex.Factor(
 90                name="tuning_regime",
 91                description="Hyperparameter regime.",
 92                kind=drex.FactorKind.MANIPULATED,
 93                levels=(
 94                    drex.Level(name="conservative", value="conservative"),
 95                    drex.Level(name="aggressive", value="aggressive"),
 96                    drex.Level(name="exploratory", value="exploratory"),
 97                ),
 98            ),
 99        ),
100        design_spec={"kind": "randomized_block", "randomize": True},
101        run_budget=drex.RunBudget(replicates=1, parallelism=1, max_runs=8),
102        output_dir=Path("artifacts") / "example-optimization-benchmark",
103        problem_ids=("optimization-small", "optimization-medium"),
104        agent_specs=("deterministic-baseline", "self-learning-agent"),
105    )
106    study = drex.build_optimization_benchmark_study(config)
107
108    problem_registry = _build_problem_registry(study.problem_ids)
109    agent_factories = {
110        "deterministic-baseline": lambda _condition: _agent_factory("deterministic-baseline"),
111        "self-learning-agent": lambda _condition: _agent_factory("self-learning-agent"),
112    }
113
114    run_results = drex.run_study(
115        study,
116        agent_factories=agent_factories,
117        problem_registry=problem_registry,
118    )
119
120    condition_ids = {
121        result.run_spec.condition_id for result in run_results if result.run_spec is not None
122    }
123    significance = drex.render_significance_brief(
124        [
125            {
126                "test": "mixed_effects",
127                "outcome": "primary_outcome",
128                "p_value": 0.04,
129                "effect_size": 0.37,
130            }
131        ]
132    )
133    methods = drex.render_methods_scaffold(study)
134    codebook = drex.render_codebook(
135        study,
136        [
137            condition
138            for condition in drex.build_design(study)
139            if condition.condition_id in condition_ids
140        ],
141    )
142
143    report = "\n\n".join((methods, significance, codebook))
144    report_path = drex.write_markdown_report(
145        study.output_dir, "optimization_benchmark_report.md", report
146    )
147
148    print(f"Completed {len(run_results)} runs")
149    print(f"Report written to {report_path}")
150
151
# Run as a script while keeping the module importable without side effects.
if __name__ == "__main__":
    main()

Expected Results

Run Command

PYTHONPATH=src python examples/recipe_optimization_benchmark_run.py

The script prints completed run count and writes artifacts/example-optimization-benchmark/artifacts/optimization_benchmark_report.md.