Lab Study Pipeline#
Source: examples/lab_study_pipeline.py
Introduction#
Run a realistic small-sample lab workflow that compares a control condition against a reframed condition, producing sequence, language, dataset, embedding-map, statistical, and provenance outputs.
Technical Implementation#
Build an in-memory unified event table with condition labels and outcome fields.
Validate table quality and compute sequence/language summaries.
Profile and validate a dataframe schema; generate a codebook.
Run group comparison, regression, bootstrap, permutation, and power helpers.
Run PCA embedding-map clustering and write a reproducibility manifest with attached provenance payload.
1from __future__ import annotations
2
3from pathlib import Path
4
5import numpy as np
6import pandas as pd
7
8import design_research_analysis as dran
9
10
11def main() -> None:
12 """Run a compact, lab-authentic analysis workflow with reproducibility metadata."""
13 rows = [
14 {
15 "timestamp": "2026-02-03T09:00:00Z",
16 "session_id": "ctrl-01",
17 "condition": "control",
18 "actor_id": "designer-a",
19 "event_type": "propose",
20 "text": "good first concept from prior baseline",
21 "novelty_score": 4.2,
22 "cycle_time_min": 15.0,
23 },
24 {
25 "timestamp": "2026-02-03T09:01:00Z",
26 "session_id": "ctrl-01",
27 "condition": "control",
28 "actor_id": "designer-b",
29 "event_type": "evaluate",
30 "text": "difficult tradeoff discussion with weak evidence",
31 "novelty_score": 4.0,
32 "cycle_time_min": 17.0,
33 },
34 {
35 "timestamp": "2026-02-03T09:02:00Z",
36 "session_id": "ctrl-01",
37 "condition": "control",
38 "actor_id": "designer-a",
39 "event_type": "refine",
40 "text": "small improvement but still unclear mechanism",
41 "novelty_score": 4.3,
42 "cycle_time_min": 16.0,
43 },
44 {
45 "timestamp": "2026-02-03T09:03:00Z",
46 "session_id": "ctrl-01",
47 "condition": "control",
48 "actor_id": "designer-b",
49 "event_type": "evaluate",
50 "text": "better but risky integration path",
51 "novelty_score": 4.1,
52 "cycle_time_min": 16.5,
53 },
54 {
55 "timestamp": "2026-02-03T10:00:00Z",
56 "session_id": "reframe-01",
57 "condition": "reframed",
58 "actor_id": "designer-c",
59 "event_type": "propose",
60 "text": "clear reframed concept with strong rationale",
61 "novelty_score": 6.3,
62 "cycle_time_min": 12.0,
63 },
64 {
65 "timestamp": "2026-02-03T10:01:00Z",
66 "session_id": "reframe-01",
67 "condition": "reframed",
68 "actor_id": "designer-d",
69 "event_type": "evaluate",
70 "text": "helpful critique and collaborative option merge",
71 "novelty_score": 6.6,
72 "cycle_time_min": 11.5,
73 },
74 {
75 "timestamp": "2026-02-03T10:02:00Z",
76 "session_id": "reframe-01",
77 "condition": "reframed",
78 "actor_id": "designer-c",
79 "event_type": "refine",
80 "text": "effective refinement with successful constraint closure",
81 "novelty_score": 6.8,
82 "cycle_time_min": 10.5,
83 },
84 {
85 "timestamp": "2026-02-03T10:03:00Z",
86 "session_id": "reframe-01",
87 "condition": "reframed",
88 "actor_id": "designer-d",
89 "event_type": "evaluate",
90 "text": "excellent final concept and clear evidence trail",
91 "novelty_score": 6.7,
92 "cycle_time_min": 11.0,
93 },
94 ]
95
96 table = dran.coerce_unified_table(rows, config=dran.UnifiedTableConfig())
97 table = dran.derive_columns(table)
98 report = dran.validate_unified_table(table)
99 if not report.is_valid:
100 raise RuntimeError(f"Unified table validation failed: {report.errors}")
101
102 markov = dran.fit_markov_chain_from_table(table, order=1, smoothing=1.0)
103
104 embedding_lookup = {
105 row["text"]: [float(index), float(len(row["text"].split()))]
106 for index, row in enumerate(table, start=1)
107 }
108 trajectory = dran.compute_semantic_distance_trajectory(
109 table,
110 window_size=2,
111 embedder=lambda texts: [embedding_lookup[text] for text in texts],
112 )
113 convergence = dran.compute_language_convergence(
114 table,
115 window_size=2,
116 embedder=lambda texts: [embedding_lookup[text] for text in texts],
117 )
118 sentiment = dran.score_sentiment(table)
119
120 frame = pd.DataFrame(table)
121 profile = dran.profile_dataframe(frame)
122 schema_check = dran.validate_dataframe(
123 frame,
124 {
125 "session_id": {"dtype": "string", "required": True, "nullable": False},
126 "condition": {
127 "dtype": "string",
128 "required": True,
129 "allowed": ["control", "reframed"],
130 },
131 "novelty_score": {"dtype": "numeric", "required": True, "min": 0.0, "max": 10.0},
132 "cycle_time_min": {"dtype": "numeric", "required": True, "min": 0.0},
133 },
134 )
135 codebook = dran.generate_codebook(frame)
136
137 novelty = frame["novelty_score"].astype(float).tolist()
138 cycle_time = frame["cycle_time_min"].astype(float).tolist()
139 conditions = frame["condition"].astype(str).tolist()
140 control = [score for score, cond in zip(novelty, conditions, strict=True) if cond == "control"]
141 reframed = [
142 score for score, cond in zip(novelty, conditions, strict=True) if cond == "reframed"
143 ]
144
145 group_test = dran.compare_groups(values=novelty, groups=conditions, method="ttest")
146 regression = dran.fit_regression(
147 [[minutes] for minutes in cycle_time],
148 novelty,
149 feature_names=["cycle_time_min"],
150 )
151 bootstrap = dran.bootstrap_ci(novelty, n_resamples=500, seed=11)
152 permutation = dran.permutation_test(control, reframed, n_permutations=500, seed=11)
153 sample_size = dran.estimate_sample_size(
154 effect_size=0.8,
155 test="two_sample_t",
156 alpha=0.05,
157 power=0.8,
158 )
159 curve = dran.power_curve([0.2, 0.5, 0.8], n=24, test="two_sample_t")
160 mde = dran.minimum_detectable_effect(n=24, test="two_sample_t", alpha=0.05, power=0.8)
161
162 vectors = np.asarray(
163 [
164 [4.2, 15.0, 1.0],
165 [4.1, 16.8, 1.2],
166 [6.6, 11.2, 2.2],
167 [6.7, 10.8, 2.4],
168 ],
169 dtype=float,
170 )
171 embedding_map = dran.build_embedding_map(vectors, method="pca", n_components=2)
172 clusters = dran.cluster_embedding_map(embedding_map, method="kmeans", n_clusters=2)
173
174 context = dran.capture_run_context(seed=11)
175 manifest_path = Path("artifacts/runtime/lab_study_manifest.json")
176 dran.write_run_manifest(context, manifest_path)
177 payload = dran.attach_provenance(
178 {
179 "table_ok": report.is_valid,
180 "schema_ok": bool(schema_check["ok"]),
181 "codebook_columns": len(codebook),
182 },
183 context,
184 )
185
186 print(f"Markov states: {len(markov.states)}")
187 print(f"Trajectory groups: {sorted(trajectory)}")
188 print(f"Convergence labels: {convergence.direction_by_group}")
189 print(f"Sentiment docs: {sentiment['n_documents']}")
190 print(f"Profiled rows: {profile['n_rows']}")
191 print(f"Group p-value: {group_test.p_value:.3f}")
192 print(f"Regression R2: {regression.r2:.3f}")
193 print(f"Bootstrap estimate: {bootstrap['estimate']:.3f}")
194 print(f"Permutation p-value: {permutation['p_value']:.3f}")
195 print(f"Recommended n: {sample_size['recommended_n']}")
196 print(f"Power curve points: {len(curve)}")
197 print(f"MDE at n=24: {mde['minimum_detectable_effect']:.3f}")
198 print(f"Cluster labels: {clusters['labels']}")
199 print(f"Manifest written: {manifest_path}")
200 print(f"Payload keys: {sorted(payload)}")
201
202
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
Expected Results#
Run Command
PYTHONPATH=src python examples/lab_study_pipeline.py
Prints concise summaries for state count, convergence labels, sentiment totals,
schema/profile diagnostics, key statistical metrics, clustering labels, and the
manifest path under artifacts/runtime.
References#
docs/workflows.rst
docs/analysis_recipes.rst