# Module: design_research_experiments.recipes

"""Reusable, typed recipe builders for common lab workflows."""

from __future__ import annotations

import re
from dataclasses import dataclass, replace
from pathlib import Path
from typing import Any

from .bundles import BenchmarkBundle, ideation_bundle, optimization_bundle
from .conditions import Constraint, Factor, FactorKind, Level
from .hypotheses import AnalysisPlan, Hypothesis, HypothesisDirection, HypothesisKind, OutcomeSpec
from .study import Block, RunBudget, SeedPolicy, Study

# Factor names that bind each condition to a specific agent. When a study's
# factors include one of these, _finalize_comparison_bindings clears the
# study-level agent bindings so the runner does not Cartesian-expand them.
_AGENT_FACTOR_NAMES = frozenset({"agent_id", "agent", "agent_spec"})
# Same rule for factors that bind each condition to a specific problem.
_PROBLEM_FACTOR_NAMES = frozenset({"problem_id", "problem"})


@dataclass(slots=True)
class RecipeStudyConfig:
    """Shared typed overrides for recipe study construction.

    Any field set to ``None`` keeps the recipe default. Any field set to a
    non-``None`` value replaces that section of the study definition wholesale.
    """

    # Identity and descriptive metadata overrides.
    study_id: str | None = None
    title: str | None = None
    description: str | None = None
    authors: tuple[str, ...] | None = None
    rationale: str | None = None
    tags: tuple[str, ...] | None = None
    # Design-section overrides (each replaces the whole tuple/dict when set).
    hypotheses: tuple[Hypothesis, ...] | None = None
    factors: tuple[Factor, ...] | None = None
    blocks: tuple[Block, ...] | None = None
    constraints: tuple[Constraint, ...] | None = None
    design_spec: dict[str, Any] | None = None
    outcomes: tuple[OutcomeSpec, ...] | None = None
    analysis_plans: tuple[AnalysisPlan, ...] | None = None
    # Execution overrides.
    run_budget: RunBudget | None = None
    seed_policy: SeedPolicy | None = None
    output_dir: Path | None = None
    provenance_metadata: dict[str, Any] | None = None
    notes: str | None = None
    # Binding overrides; explicit ids/specs win over a bundle (see
    # _apply_recipe_config for the precedence).
    problem_ids: tuple[str, ...] | None = None
    agent_specs: tuple[str, ...] | None = None
    primary_outcomes: tuple[str, ...] | None = None
    secondary_outcomes: tuple[str, ...] | None = None
    bundle: BenchmarkBundle | None = None
# Per-recipe config marker classes: each inherits every override field from
# RecipeStudyConfig and adds none, existing only to give each builder a
# distinctly typed parameter.
@dataclass(slots=True)
class AgentArchitectureComparisonConfig(RecipeStudyConfig):
    """Overrides for the agent architecture comparison recipe."""


@dataclass(slots=True)
class PromptFramingConfig(RecipeStudyConfig):
    """Overrides for the prompt framing recipe."""


@dataclass(slots=True)
class GrammarScaffoldConfig(RecipeStudyConfig):
    """Overrides for the grammar scaffold recipe."""


@dataclass(slots=True)
class HumanVsAgentProcessConfig(RecipeStudyConfig):
    """Overrides for the human-vs-agent process recipe."""


@dataclass(slots=True)
class DiversityAndExplorationConfig(RecipeStudyConfig):
    """Overrides for the diversity and exploration recipe."""


@dataclass(slots=True)
class OptimizationBenchmarkConfig(RecipeStudyConfig):
    """Overrides for the optimization benchmark recipe."""
@dataclass(slots=True)
class ComparisonStudyConfig(RecipeStudyConfig):
    """Overrides for comparison-study recipe scaffolds."""

    # Replaces the first (primary) comparison factor when set; ignored if
    # ``factors`` is set, which wins wholesale (see _resolve_comparison_factors).
    comparison_factor: Factor | None = None
    # Replaces or appends the second comparison axis when set.
    secondary_factor: Factor | None = None
# Typed marker subclasses for the three comparison recipes; no extra fields.
@dataclass(slots=True)
class UnivariateComparisonConfig(ComparisonStudyConfig):
    """Overrides for the univariate comparison recipe."""


@dataclass(slots=True)
class BivariateComparisonConfig(ComparisonStudyConfig):
    """Overrides for the bivariate comparison recipe."""


@dataclass(slots=True)
class StrategyComparisonConfig(ComparisonStudyConfig):
    """Overrides for the packaged-problem strategy comparison recipe."""
def _default_outcomes() -> tuple[OutcomeSpec, ...]:
    """Return baseline outcomes used by recipe scaffolds.

    Both outcomes read from the ``runs`` table; only ``primary_outcome`` is
    marked primary.
    """
    return (
        OutcomeSpec(
            name="primary_outcome",
            source_table="runs",
            column="primary_outcome",
            aggregation="mean",
            primary=True,
            expected_type="float",
            description="Primary study objective metric.",
        ),
        OutcomeSpec(
            name="latency_s",
            source_table="runs",
            column="latency_s",
            aggregation="mean",
            primary=False,
            expected_type="float",
            description="Run latency in seconds.",
        ),
    )


def _default_analysis_plan(
    hypothesis_id: str,
    *,
    tests: tuple[str, ...] = ("difference_in_means", "regression"),
    plots: tuple[str, ...] = ("condition_means",),
    export_tables: tuple[str, ...] = ("summary_by_condition",),
    random_effects: tuple[str, ...] = (),
) -> AnalysisPlan:
    """Return a compact default analysis plan.

    Args:
        hypothesis_id: Hypothesis this plan is linked to (becomes the only
            entry of ``hypothesis_ids``).
        tests / plots / export_tables / random_effects: Plan sections,
            overridable per recipe.
    """
    return AnalysisPlan(
        analysis_plan_id="ap1",
        hypothesis_ids=(hypothesis_id,),
        tests=tests,
        outcomes=("primary_outcome",),
        random_effects=random_effects,
        plots=plots,
        export_tables=export_tables,
        multiple_comparison_policy="holm",
    )


def _apply_recipe_config(study: Study, config: RecipeStudyConfig | None) -> Study:
    """Apply typed config overrides to a default recipe study.

    Each non-``None`` config field replaces the corresponding study section
    wholesale; ``None`` keeps the recipe default. Returns ``study`` unchanged
    when ``config`` is ``None``.
    """
    if config is None:
        return study
    resolved_study_id = config.study_id if config.study_id is not None else study.study_id
    # Fallback output dir derived from the ORIGINAL study id.
    study_output_dir = (
        study.output_dir if study.output_dir is not None else Path("artifacts") / study.study_id
    )
    # Explicit output_dir wins; otherwise a new study_id re-derives the
    # artifacts directory from the resolved id.
    if config.output_dir is not None:
        resolved_output_dir = config.output_dir
    elif config.study_id is not None:
        resolved_output_dir = Path("artifacts") / resolved_study_id
    else:
        resolved_output_dir = study_output_dir
    # A config bundle supplies problem/agent bindings unless the explicit
    # problem_ids/agent_specs fields override them below.
    bundle_problem_ids = (
        config.bundle.problem_ids if config.bundle is not None else study.problem_ids
    )
    bundle_agent_specs = (
        config.bundle.agent_specs if config.bundle is not None else study.agent_specs
    )
    return Study(
        study_id=resolved_study_id,
        title=config.title if config.title is not None else study.title,
        description=config.description if config.description is not None else study.description,
        authors=config.authors if config.authors is not None else study.authors,
        rationale=config.rationale if config.rationale is not None else study.rationale,
        tags=config.tags if config.tags is not None else study.tags,
        hypotheses=config.hypotheses if config.hypotheses is not None else study.hypotheses,
        factors=config.factors if config.factors is not None else study.factors,
        blocks=config.blocks if config.blocks is not None else study.blocks,
        constraints=config.constraints if config.constraints is not None else study.constraints,
        # Dicts are copied so the new Study never aliases caller/default state.
        design_spec=dict(config.design_spec)
        if config.design_spec is not None
        else dict(study.design_spec),
        outcomes=config.outcomes if config.outcomes is not None else study.outcomes,
        analysis_plans=(
            config.analysis_plans if config.analysis_plans is not None else study.analysis_plans
        ),
        run_budget=config.run_budget if config.run_budget is not None else study.run_budget,
        seed_policy=config.seed_policy if config.seed_policy is not None else study.seed_policy,
        output_dir=resolved_output_dir,
        provenance_metadata=(
            dict(config.provenance_metadata)
            if config.provenance_metadata is not None
            else dict(study.provenance_metadata)
        ),
        notes=config.notes if config.notes is not None else study.notes,
        problem_ids=config.problem_ids if config.problem_ids is not None else bundle_problem_ids,
        agent_specs=config.agent_specs if config.agent_specs is not None else bundle_agent_specs,
        primary_outcomes=(
            config.primary_outcomes
            if config.primary_outcomes is not None
            else study.primary_outcomes
        ),
        secondary_outcomes=(
            config.secondary_outcomes
            if config.secondary_outcomes is not None
            else study.secondary_outcomes
        ),
    )


def _comparison_level(
    *,
    name: str,
    value: str,
    label: str | None = None,
    is_baseline: bool = False,
    role: str | None = None,
    metadata: dict[str, Any] | None = None,
) -> Level:
    """Build one comparison-aware level with baseline/treatment metadata."""
    resolved_metadata = dict(metadata or {})
    # setdefault keeps any caller-supplied role/is_baseline entries authoritative.
    resolved_metadata.setdefault("role", role or ("baseline" if is_baseline else "treatment"))
    resolved_metadata.setdefault("is_baseline", is_baseline)
    return Level(name=name, value=value, label=label, metadata=resolved_metadata)


def _comparison_factor(
    *,
    name: str,
    description: str,
    levels: tuple[Level, ...],
) -> Factor:
    """Build one manipulated factor annotated as a comparison axis."""
    return Factor(
        name=name,
        description=description,
        kind=FactorKind.MANIPULATED,
        levels=levels,
        metadata={"comparison_axis": True},
    )


def _slug_level_name(value: str, *, index: int, seen: set[str]) -> str:
    """Normalize a stable level name and keep duplicates unique.

    Lowercases ``value`` and collapses non-alphanumeric runs to ``_``; falls
    back to ``level_{index}`` when nothing survives. Appends ``_2``, ``_3``, …
    while the candidate collides with ``seen`` (which is mutated in place).
    """
    base = re.sub(r"[^a-z0-9]+", "_", value.lower()).strip("_") or f"level_{index}"
    candidate = base
    suffix = 2
    while candidate in seen:
        candidate = f"{base}_{suffix}"
        suffix += 1
    seen.add(candidate)
    return candidate


def _strategy_factor_from_bundle(bundle: BenchmarkBundle) -> Factor:
    """Build a default agent-strategy comparison factor from one benchmark bundle."""
    # The first spec whose name mentions "baseline" or "random" becomes the
    # baseline level; otherwise the first spec (index 0) does.
    baseline_index = 0
    for index, agent_spec in enumerate(bundle.agent_specs):
        lowered = agent_spec.lower()
        if "baseline" in lowered or "random" in lowered:
            baseline_index = index
            break
    seen: set[str] = set()
    levels = tuple(
        _comparison_level(
            name=_slug_level_name(agent_spec, index=index, seen=seen),
            value=agent_spec,
            label=agent_spec.replace("-", " ").replace("_", " ").title(),
            is_baseline=index == baseline_index,
        )
        for index, agent_spec in enumerate(bundle.agent_specs)
    )
    return _comparison_factor(
        name="agent_id",
        description="Agent strategy or runtime binding under comparison.",
        levels=levels,
    )


def _resolve_comparison_factors(
    *,
    config: ComparisonStudyConfig | None,
    default_factors: tuple[Factor, ...],
    max_factors: int | None = None,
) -> tuple[Factor, ...]:
    """Resolve comparison factors with explicit precedence rules.

    Precedence: ``config.factors`` wins wholesale; otherwise
    ``comparison_factor`` replaces/becomes slot 0 and ``secondary_factor``
    replaces/becomes slot 1 of the defaults.
    """
    if config is not None and config.factors is not None:
        return config.factors
    resolved = list(default_factors)
    if config is not None:
        if config.comparison_factor is not None:
            if resolved:
                resolved[0] = config.comparison_factor
            else:
                resolved.append(config.comparison_factor)
        if config.secondary_factor is not None:
            if len(resolved) >= 2:
                resolved[1] = config.secondary_factor
            elif len(resolved) == 1:
                resolved.append(config.secondary_factor)
            else:
                # No defaults and no comparison_factor handled above: take
                # whichever configured factors exist, in order.
                resolved.extend(
                    factor
                    for factor in (config.comparison_factor, config.secondary_factor)
                    if factor is not None
                )
    if max_factors is not None:
        return tuple(resolved[:max_factors])
    return tuple(resolved)


def _comparison_hypothesis(
    *,
    factor_names: tuple[str, ...],
    label: str,
    statement: str,
    kind: HypothesisKind = HypothesisKind.EFFECT,
) -> Hypothesis:
    """Build a default comparison hypothesis (id ``h1``, linked to plan ``ap1``)."""
    return Hypothesis(
        hypothesis_id="h1",
        label=label,
        statement=statement,
        kind=kind,
        independent_vars=factor_names,
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.DIFFERENT,
        linked_analysis_plan_id="ap1",
    )


def _finalize_comparison_bindings(
    study: Study,
    *,
    config: ComparisonStudyConfig | None,
    selected_bundle: BenchmarkBundle,
    default_problem_ids: tuple[str, ...],
    default_agent_specs: tuple[str, ...],
) -> Study:
    """Normalize bundle bindings so factor-bound comparisons do not Cartesian-expand."""
    factor_names = {factor.name for factor in study.factors}
    problem_ids = study.problem_ids
    agent_specs = study.agent_specs
    # A problem-bound factor supersedes study-level problem bindings entirely.
    if factor_names & _PROBLEM_FACTOR_NAMES:
        problem_ids = ()
    elif config is not None and config.problem_ids is None and config.bundle is not None:
        problem_ids = selected_bundle.problem_ids
    elif not problem_ids:
        problem_ids = default_problem_ids
    # Same rule for agent-bound factors.
    if factor_names & _AGENT_FACTOR_NAMES:
        agent_specs = ()
    elif not agent_specs or (
        config is not None and config.agent_specs is None and config.bundle is not None
    ):
        agent_specs = default_agent_specs
    # Avoid allocating a new Study when nothing actually changed.
    if problem_ids == study.problem_ids and agent_specs == study.agent_specs:
        return study
    return replace(study, problem_ids=problem_ids, agent_specs=agent_specs)


def _build_comparison_study(
    *,
    config: ComparisonStudyConfig | None,
    study_id: str,
    title: str,
    description: str,
    rationale: str,
    tags: tuple[str, ...],
    default_bundle: BenchmarkBundle,
    default_factors: tuple[Factor, ...],
    default_problem_ids: tuple[str, ...] | None = None,
    default_agent_specs: tuple[str, ...] | None = None,
    hypothesis_label: str,
    hypothesis_statement: str,
    hypothesis_kind: HypothesisKind = HypothesisKind.EFFECT,
    analysis_plots: tuple[str, ...] = ("condition_means",),
    analysis_export_tables: tuple[str, ...] = ("summary_by_condition",),
) -> Study:
    """Assemble a comparison-study scaffold and apply typed overrides.

    Builds the default ``Study``, layers ``config`` on top via
    ``_apply_recipe_config``, then normalizes problem/agent bindings via
    ``_finalize_comparison_bindings``.
    """
    selected_bundle = (
        config.bundle if config is not None and config.bundle is not None else default_bundle
    )
    resolved_factors = _resolve_comparison_factors(
        config=config,
        default_factors=default_factors,
    )
    factor_names = tuple(factor.name for factor in resolved_factors)
    resolved_problem_ids = default_problem_ids or selected_bundle.problem_ids
    resolved_agent_specs: tuple[str, ...]
    if default_agent_specs is None:
        # Default to the bundle's first agent spec, or a placeholder when the
        # bundle has none.
        resolved_agent_specs = (
            (selected_bundle.agent_specs[0],)
            if selected_bundle.agent_specs
            else ("default-agent",)
        )
    else:
        resolved_agent_specs = default_agent_specs
    defaults = Study(
        study_id=study_id,
        title=title,
        description=description,
        authors=("Design Research Collective",),
        rationale=rationale,
        tags=tags,
        hypotheses=(
            _comparison_hypothesis(
                factor_names=factor_names,
                label=hypothesis_label,
                statement=hypothesis_statement,
                kind=hypothesis_kind,
            ),
        ),
        factors=resolved_factors,
        design_spec={"kind": "constrained_factorial", "randomize": True},
        outcomes=_default_outcomes(),
        analysis_plans=(
            _default_analysis_plan(
                "h1",
                plots=analysis_plots,
                export_tables=analysis_export_tables,
            ),
        ),
        run_budget=RunBudget(replicates=2, parallelism=1),
        seed_policy=SeedPolicy(base_seed=37),
        output_dir=Path("artifacts") / study_id,
        problem_ids=resolved_problem_ids,
        agent_specs=resolved_agent_specs,
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    configured = _apply_recipe_config(defaults, config)
    return _finalize_comparison_bindings(
        configured,
        config=config,
        selected_bundle=selected_bundle,
        default_problem_ids=resolved_problem_ids,
        default_agent_specs=resolved_agent_specs,
    )
def build_univariate_comparison_study(
    config: UnivariateComparisonConfig | None = None,
) -> Study:
    """Build a one-factor comparison study scaffold over packaged problems.

    The single default factor is a baseline/treatment arm; ``config`` can
    override any section of the resulting study.
    """
    arm_factor = _comparison_factor(
        name="comparison_arm",
        description="Primary comparison arm.",
        levels=(
            _comparison_level(
                name="baseline",
                value="baseline",
                label="Baseline",
                is_baseline=True,
            ),
            _comparison_level(name="treatment", value="treatment", label="Treatment"),
        ),
    )
    return _build_comparison_study(
        config=config,
        study_id="univariate-comparison",
        title="Univariate Comparison Study",
        description="Compare one manipulated condition across a packaged problem bundle.",
        rationale="Provide a compact scaffold for one-axis benchmark comparisons.",
        tags=("comparison", "univariate"),
        default_bundle=ideation_bundle(),
        default_factors=(arm_factor,),
        hypothesis_label="Univariate Comparison Effect",
        hypothesis_statement="The comparison arm changes the primary outcome.",
    )
def build_bivariate_comparison_study(
    config: BivariateComparisonConfig | None = None,
) -> Study:
    """Build a two-factor comparison study scaffold over packaged problems.

    Defaults to a baseline/treatment arm crossed with a prompt-regime axis;
    ``config`` can override any section of the resulting study.
    """
    primary_axis = _comparison_factor(
        name="comparison_arm",
        description="Primary comparison arm.",
        levels=(
            _comparison_level(
                name="baseline",
                value="baseline",
                label="Baseline",
                is_baseline=True,
            ),
            _comparison_level(name="treatment", value="treatment", label="Treatment"),
        ),
    )
    secondary_axis = _comparison_factor(
        name="prompt_regime",
        description="Secondary comparison axis.",
        levels=(
            _comparison_level(
                name="standard",
                value="standard",
                label="Standard",
                is_baseline=True,
            ),
            _comparison_level(name="structured", value="structured", label="Structured"),
        ),
    )
    return _build_comparison_study(
        config=config,
        study_id="bivariate-comparison",
        title="Bivariate Comparison Study",
        description="Compare two manipulated axes across a packaged problem bundle.",
        rationale="Provide a reusable scaffold for pairwise comparison designs and interactions.",
        tags=("comparison", "bivariate"),
        default_bundle=ideation_bundle(),
        default_factors=(primary_axis, secondary_axis),
        hypothesis_label="Bivariate Comparison Effect",
        hypothesis_statement="The comparison axes jointly change the primary outcome.",
        analysis_plots=("condition_means", "interaction_means"),
        analysis_export_tables=("summary_by_condition", "summary_by_interaction"),
    )
def build_strategy_comparison_study(
    config: StrategyComparisonConfig | None = None,
) -> Study:
    """Build a packaged-problem strategy comparison study scaffold.

    A configured bundle/comparison factor takes precedence; otherwise the
    optimization bundle supplies both, with the factor derived from the
    bundle's agent specs.
    """
    # Only fall back to the packaged optimization bundle when no bundle is
    # configured, so optimization_bundle() is not built needlessly.
    if config is not None and config.bundle is not None:
        bundle = config.bundle
    else:
        bundle = optimization_bundle()
    if config is not None and config.comparison_factor is not None:
        strategy_factor = config.comparison_factor
    else:
        strategy_factor = _strategy_factor_from_bundle(bundle)
    return _build_comparison_study(
        config=config,
        study_id="strategy-comparison",
        title="Strategy Comparison Study",
        description="Compare named agent strategies on packaged benchmark problems.",
        rationale="Centralize canonical strategy-comparison wiring for packaged benchmarks.",
        tags=("comparison", "strategy", "benchmark"),
        default_bundle=bundle,
        default_factors=(strategy_factor,),
        default_problem_ids=bundle.problem_ids,
        default_agent_specs=bundle.agent_specs,
        hypothesis_label="Strategy Comparison Effect",
        hypothesis_statement=(
            "Agent strategy changes the primary outcome on the packaged benchmark."
        ),
        hypothesis_kind=HypothesisKind.ROBUSTNESS,
    )
def build_agent_architecture_comparison_study(
    config: AgentArchitectureComparisonConfig | None = None,
) -> Study:
    """Build a study comparing agent architecture choices across prompt difficulty.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Architecture Effect",
        statement="Agent architecture changes primary outcome.",
        kind=HypothesisKind.EFFECT,
        independent_vars=("agent_architecture", "prompt_difficulty"),
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.DIFFERENT,
        linked_analysis_plan_id="ap1",
    )
    defaults = Study(
        study_id="agent-architecture-comparison",
        title="Agent Architecture Comparison",
        description="Compare architecture variants across prompt difficulty manipulations.",
        authors=("Design Research Collective",),
        rationale="Benchmark architecture and workflow pattern differences.",
        tags=("ideation", "architecture"),
        hypotheses=(hypothesis,),
        # 3x3 manipulated design: architecture family x prompt difficulty.
        factors=(
            Factor(
                name="agent_architecture",
                description="Agent architecture family.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="direct", value="direct-llm"),
                    Level(name="multistep", value="multi-step"),
                    Level(name="reflective", value="reflective"),
                ),
            ),
            Factor(
                name="prompt_difficulty",
                description="Prompt complexity level.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="easy", value="easy"),
                    Level(name="medium", value="medium"),
                    Level(name="hard", value="hard"),
                ),
            ),
        ),
        blocks=(Block(name="problem_family", levels=("ideation", "optimization")),),
        design_spec={"kind": "constrained_factorial", "randomize": True},
        outcomes=_default_outcomes(),
        analysis_plans=(_default_analysis_plan("h1"),),
        run_budget=RunBudget(replicates=2, parallelism=1),
        seed_policy=SeedPolicy(base_seed=7),
        output_dir=Path("artifacts") / "agent-architecture-comparison",
        problem_ids=("problem-a", "problem-b"),
        agent_specs=("direct-llm", "multi-step", "reflective"),
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
def build_prompt_framing_study(config: PromptFramingConfig | None = None) -> Study:
    """Build an ideation study with framing and prompt manipulation.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Prompt Framing Effect",
        statement="Prompt framing changes novelty and diversity outcomes.",
        kind=HypothesisKind.EFFECT,
        independent_vars=("prompt_frame", "prompt_difficulty"),
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.DIFFERENT,
        linked_analysis_plan_id="ap1",
    )
    defaults = Study(
        study_id="prompt-framing-study",
        title="Prompt Framing Study",
        description="Measure framing effects in ideation tasks.",
        authors=("Design Research Collective",),
        rationale="Quantify framing manipulations in creative generation.",
        tags=("ideation", "framing"),
        hypotheses=(hypothesis,),
        # 3x2 manipulated design: framing style x difficulty, blocked by domain.
        factors=(
            Factor(
                name="prompt_frame",
                description="Prompt framing style.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="neutral", value="neutral"),
                    Level(name="challenge", value="challenge"),
                    Level(name="analogy", value="analogy"),
                ),
            ),
            Factor(
                name="prompt_difficulty",
                description="Prompt difficulty.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="low", value="low"),
                    Level(name="high", value="high"),
                ),
            ),
        ),
        blocks=(Block(name="domain", levels=("mobility", "health")),),
        design_spec={"kind": "randomized_block", "randomize": True},
        outcomes=_default_outcomes(),
        analysis_plans=(_default_analysis_plan("h1"),),
        run_budget=RunBudget(replicates=3, parallelism=1),
        seed_policy=SeedPolicy(base_seed=11),
        output_dir=Path("artifacts") / "prompt-framing-study",
        problem_ids=("ideation-1", "ideation-2", "ideation-3"),
        agent_specs=("baseline-agent", "creative-agent"),
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
def build_grammar_scaffold_study(config: GrammarScaffoldConfig | None = None) -> Study:
    """Build a study comparing unconstrained and grammar-guided generation.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Grammar Guidance Effect",
        statement="Grammar-guided generation improves primary outcome.",
        kind=HypothesisKind.EFFECT,
        independent_vars=("generation_mode",),
        dependent_vars=("primary_outcome",),
        # Directional hypothesis: guided generation is expected to score higher.
        direction=HypothesisDirection.GREATER,
        linked_analysis_plan_id="ap1",
    )
    defaults = Study(
        study_id="grammar-scaffold-study",
        title="Grammar Scaffold Study",
        description="Benchmark constrained vs unconstrained generation modes.",
        authors=("Design Research Collective",),
        rationale="Assess structural scaffolds for design-generation quality.",
        tags=("grammar", "constrained-generation"),
        hypotheses=(hypothesis,),
        factors=(
            Factor(
                name="generation_mode",
                description="Generation scaffold type.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="free", value="unconstrained"),
                    Level(name="grammar", value="grammar-guided"),
                    Level(name="tool", value="tool-guided"),
                ),
            ),
        ),
        blocks=(Block(name="problem_family", levels=("grammar", "text")),),
        design_spec={"kind": "full_factorial", "randomize": True},
        outcomes=_default_outcomes(),
        analysis_plans=(_default_analysis_plan("h1"),),
        run_budget=RunBudget(replicates=2, parallelism=1),
        seed_policy=SeedPolicy(base_seed=19),
        output_dir=Path("artifacts") / "grammar-scaffold-study",
        problem_ids=("grammar-1", "grammar-2"),
        agent_specs=("direct-llm", "workflow-agent"),
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
def build_human_vs_agent_process_study(config: HumanVsAgentProcessConfig | None = None) -> Study:
    """Build a study comparing human-only, AI-assisted, and hybrid teams.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Teaming Configuration Effect",
        statement="Hybrid teams alter process traces and outcomes.",
        kind=HypothesisKind.MODERATION,
        independent_vars=("team_mode",),
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.DIFFERENT,
        linked_analysis_plan_id="ap1",
    )
    defaults = Study(
        study_id="human-vs-agent-process",
        title="Human vs Agent Process Study",
        description="Capture communication and action traces across teaming modes.",
        authors=("Design Research Collective",),
        rationale="Compare human-only, AI-assisted, and hybrid workflows.",
        tags=("teaming", "process-trace"),
        hypotheses=(hypothesis,),
        factors=(
            Factor(
                name="team_mode",
                description="Team configuration.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="human", value="human-only"),
                    Level(name="assist", value="ai-assisted"),
                    Level(name="hybrid", value="hybrid"),
                ),
            ),
        ),
        blocks=(Block(name="cohort", levels=("novice", "expert")),),
        # Within-subject design, hence counterbalancing and single replicate.
        design_spec={"kind": "repeated_measures", "counterbalance": True},
        outcomes=_default_outcomes(),
        analysis_plans=(_default_analysis_plan("h1"),),
        run_budget=RunBudget(replicates=1, parallelism=1),
        seed_policy=SeedPolicy(base_seed=23),
        output_dir=Path("artifacts") / "human-vs-agent-process",
        problem_ids=("teaming-1", "teaming-2"),
        agent_specs=("human-only", "ai-assisted", "hybrid"),
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
def build_diversity_and_exploration_study(
    config: DiversityAndExplorationConfig | None = None,
) -> Study:
    """Build a study evaluating diversity and exploration outcomes.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Exploration Strategy Robustness",
        statement="Exploration-heavy strategies increase diversity metrics.",
        kind=HypothesisKind.ROBUSTNESS,
        independent_vars=("search_strategy",),
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.GREATER,
        linked_analysis_plan_id="ap1",
    )
    defaults = Study(
        study_id="diversity-exploration",
        title="Diversity and Exploration Study",
        description="Assess exploration strategies across benchmark families.",
        authors=("Design Research Collective",),
        rationale="Characterize diversity-quality tradeoffs.",
        tags=("optimization", "diversity"),
        hypotheses=(hypothesis,),
        factors=(
            Factor(
                name="search_strategy",
                description="Exploration strategy.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="greedy", value="greedy"),
                    Level(name="epsilon", value="epsilon-greedy"),
                    Level(name="ucb", value="ucb"),
                ),
            ),
        ),
        blocks=(Block(name="problem_family", levels=("bench-a", "bench-b", "bench-c")),),
        design_spec={"kind": "randomized_block", "randomize": True},
        outcomes=_default_outcomes(),
        analysis_plans=(_default_analysis_plan("h1"),),
        run_budget=RunBudget(replicates=2, parallelism=1),
        seed_policy=SeedPolicy(base_seed=29),
        output_dir=Path("artifacts") / "diversity-exploration",
        problem_ids=("opt-a", "opt-b", "opt-c"),
        agent_specs=("deterministic", "self-learning"),
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
def build_optimization_benchmark_study(
    config: OptimizationBenchmarkConfig | None = None,
) -> Study:
    """Build a benchmark study for optimization generalization and learning effects.

    Args:
        config: Optional typed overrides; any non-``None`` field replaces the
            matching section of the default study wholesale.

    Returns:
        A fully wired ``Study`` scaffold bound to the packaged optimization bundle.
    """
    hypothesis = Hypothesis(
        hypothesis_id="h1",
        label="Learning Strategy Generalization",
        statement="Self-learning agents outperform deterministic baselines across families.",
        kind=HypothesisKind.ROBUSTNESS,
        independent_vars=("learning_strategy", "tuning_regime"),
        dependent_vars=("primary_outcome",),
        direction=HypothesisDirection.GREATER,
        linked_analysis_plan_id="ap1",
    )
    # Problem/agent bindings come from the packaged optimization bundle.
    bundle = optimization_bundle()
    defaults = Study(
        study_id="optimization-benchmark",
        title="Optimization Benchmark Study",
        description=(
            "Compare self-learning and deterministic baselines across optimization "
            "families and tuning regimes."
        ),
        authors=("Design Research Collective",),
        rationale="Measure optimization quality, robustness, and cross-family generalization.",
        tags=("optimization", "benchmark", "generalization"),
        hypotheses=(hypothesis,),
        factors=(
            Factor(
                name="learning_strategy",
                description="Agent learning approach.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="deterministic", value="deterministic-baseline"),
                    Level(name="self_learning", value="self-learning-agent"),
                ),
            ),
            Factor(
                name="tuning_regime",
                description="Hyperparameter tuning regime.",
                kind=FactorKind.MANIPULATED,
                levels=(
                    Level(name="conservative", value="conservative"),
                    Level(name="aggressive", value="aggressive"),
                ),
            ),
        ),
        blocks=(Block(name="problem_family", levels=("small", "medium", "large")),),
        design_spec={"kind": "randomized_block", "randomize": True},
        outcomes=_default_outcomes(),
        # Custom plan (not _default_analysis_plan): mixed-effects model with
        # problem_family as a random effect.
        analysis_plans=(
            AnalysisPlan(
                analysis_plan_id="ap1",
                hypothesis_ids=("h1",),
                tests=("difference_in_means", "mixed_effects"),
                outcomes=("primary_outcome",),
                random_effects=("problem_family",),
                plots=("family_condition_means",),
                export_tables=("optimization_summary",),
                multiple_comparison_policy="holm",
            ),
        ),
        run_budget=RunBudget(replicates=2, parallelism=1),
        seed_policy=SeedPolicy(base_seed=31),
        output_dir=Path("artifacts") / "optimization-benchmark",
        problem_ids=bundle.problem_ids,
        agent_specs=bundle.agent_specs,
        primary_outcomes=("primary_outcome",),
        secondary_outcomes=("latency_s",),
    )
    return _apply_recipe_config(defaults, config)
# Public API: the typed override configs plus one builder per packaged recipe.
__all__ = [
    "AgentArchitectureComparisonConfig",
    "BivariateComparisonConfig",
    "ComparisonStudyConfig",
    "DiversityAndExplorationConfig",
    "GrammarScaffoldConfig",
    "HumanVsAgentProcessConfig",
    "OptimizationBenchmarkConfig",
    "PromptFramingConfig",
    "RecipeStudyConfig",
    "StrategyComparisonConfig",
    "UnivariateComparisonConfig",
    "build_agent_architecture_comparison_study",
    "build_bivariate_comparison_study",
    "build_diversity_and_exploration_study",
    "build_grammar_scaffold_study",
    "build_human_vs_agent_process_study",
    "build_optimization_benchmark_study",
    "build_prompt_framing_study",
    "build_strategy_comparison_study",
    "build_univariate_comparison_study",
]