# Recipe Strategy Comparison Run

Source: examples/recipe_strategy_comparison_run.py

## Introduction

Execute a packaged-problem strategy comparison study with deterministic mocks.

## Technical Implementation

  1. Build StrategyComparisonConfig overrides for bundle selection, run budget, and output path.

  2. Create deterministic problem packets and one factory per compared agent strategy.

  3. Run the study and write a markdown summary artifact.

  1from __future__ import annotations
  2
  3from pathlib import Path
  4
  5import design_research_experiments as drex
  6
  7
  8def _build_problem_registry(problem_ids: tuple[str, ...]) -> dict[str, drex.ProblemPacket]:
  9    """Build a deterministic optimization-style problem registry."""
 10
 11    def evaluator(output: dict[str, object]) -> list[dict[str, object]]:
 12        """Emit one synthetic benchmark metric row."""
 13        text = str(output.get("text", ""))
 14        return [{"metric_name": "objective_score", "metric_value": len(text) / 110.0}]
 15
 16    registry: dict[str, drex.ProblemPacket] = {}
 17    for problem_id in problem_ids:
 18        registry[problem_id] = drex.ProblemPacket(
 19            problem_id=problem_id,
 20            family="optimization",
 21            brief=f"Packaged benchmark brief for {problem_id}",
 22            evaluator=evaluator,
 23        )
 24    return registry
 25
 26
 27def _agent_factory(agent_name: str):
 28    """Create a deterministic strategy-specific agent callable."""
 29
 30    def _agent(
 31        *,
 32        problem_packet: drex.ProblemPacket,
 33        run_spec: drex.RunSpec,
 34        condition: drex.Condition,
 35    ) -> dict[str, object]:
 36        """Generate one deterministic mock run result for strategy comparisons."""
 37        compared_agent = str(condition.factor_assignments.get("agent_id", agent_name))
 38        run_seed = run_spec.seed
 39        strategy_bonus = 0.09 if compared_agent == "self-learning-agent" else 0.0
 40        baseline_bonus = 0.03 if "baseline" in compared_agent else 0.0
 41        family_bonus = 0.02 if problem_packet.problem_id.endswith("medium") else 0.0
 42        primary_outcome = round(0.55 + strategy_bonus + baseline_bonus + family_bonus, 4)
 43
 44        text = (
 45            f"{compared_agent} solved {problem_packet.problem_id} "
 46            f"with seed={run_seed} condition={condition.condition_id}"
 47        )
 48
 49        return {
 50            "output": {"text": text},
 51            "metrics": {
 52                "primary_outcome": primary_outcome,
 53                "input_tokens": 130,
 54                "output_tokens": 210,
 55                "cost_usd": 0.019,
 56            },
 57            "events": [
 58                {
 59                    "event_type": "assistant_output",
 60                    "text": text,
 61                    "actor_id": compared_agent,
 62                }
 63            ],
 64            "metadata": {"model_name": "example-model"},
 65        }
 66
 67    return _agent
 68
 69
 70def main() -> None:
 71    """Run a packaged-problem strategy comparison study and write a summary artifact."""
 72    config = drex.StrategyComparisonConfig(
 73        bundle=drex.optimization_bundle(),
 74        run_budget=drex.RunBudget(replicates=1, parallelism=1, max_runs=4),
 75        output_dir=Path("artifacts") / "example-strategy-comparison",
 76        problem_ids=("optimization-small", "optimization-medium"),
 77    )
 78    study = drex.build_strategy_comparison_study(config)
 79
 80    strategy_ids = tuple(str(level.value) for level in study.factors[0].levels)
 81    problem_registry = _build_problem_registry(study.problem_ids)
 82    agent_bindings = {
 83        strategy_id: (lambda _condition, strategy_id=strategy_id: _agent_factory(strategy_id))
 84        for strategy_id in strategy_ids
 85    }
 86
 87    run_results = drex.run_study(
 88        study,
 89        agent_bindings=agent_bindings,
 90        problem_registry=problem_registry,
 91    )
 92
 93    summary = drex.render_markdown_summary(study, run_results)
 94    summary_path = drex.write_markdown_report(
 95        study.output_dir,
 96        "strategy_comparison_summary.md",
 97        summary,
 98    )
 99
100    print(f"Completed {len(run_results)} runs")
101    print(f"Summary written to {summary_path}")
102
103
# Script entry point: PYTHONPATH=src python examples/recipe_strategy_comparison_run.py
if __name__ == "__main__":
    main()

## Expected Results

Run Command

PYTHONPATH=src python examples/recipe_strategy_comparison_run.py

The script prints completed run count and writes artifacts/example-strategy-comparison/artifacts/strategy_comparison_summary.md.