"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings. Argv-driven measurement helper for the Stage 2 §u8 binding contract: re-derive a realistic savings target instead of mirroring the unverified issue-body 50–70% / 10–20s → 3–8s claim. Per-iteration measurement protocol (mirrors the u7a equivalence harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``): (A) baseline full run, no overrides — reuse seed (B) full rerun full run + one --override-frame pin — control path (C) reuse --reuse-from + same pin — reuse path Wall-clock = ``time.perf_counter()`` around the subprocess.run call. The (A) seed run time is reported separately and NOT included in the B-vs-C comparison (the reuse path's whole point is that the seed already exists from a prior interactive run). For each iteration the frame pin is self-discovered from the seed run's ``step06_composition_plan.json``: the first unit's ``frame_template_id`` is re-pinned to itself, exercising the ``--override-frame`` CLI surface end-to-end without changing the semantic frame assignment (same approach the u7a/u7b equivalence tests already lock). Output: a JSON document to stdout with per-iteration timings, B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess stdout/stderr tails on non-zero exits. Guardrails (Stage 2): * argv-driven, no hardcoded mdx — caller picks the sample * no hardcoded savings target — TBD until measured * value + path + upstream provenance lives in the printed JSON * does NOT mutate prev_run_dir; new runs land under fresh run_ids """ from __future__ import annotations import argparse import json import statistics import subprocess import sys import time import uuid from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] RUNS_DIR = REPO_ROOT / "data" / "runs" def _unique_run_id(prefix: str) -> str: return f"{prefix}_imp43_u8_{uuid.uuid4().hex[:8]}" def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]: start = time.perf_counter() cp = subprocess.run( [sys.executable, "-m", "src.phase_z2_pipeline", *extra_args], capture_output=True, text=True, timeout=timeout, cwd=str(REPO_ROOT), ) return cp, time.perf_counter() - start def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None: if cp.returncode != 0: sys.stderr.write( f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n" f"--- stderr tail ---\n{cp.stderr[-2000:]}\n" f"--- stdout tail ---\n{cp.stdout[-2000:]}\n" ) raise SystemExit(2) def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]: p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json" payload = json.loads(p.read_text(encoding="utf-8")) for u in payload.get("data", {}).get("selected_units") or []: sids = u.get("source_section_ids") or [] tpl = u.get("frame_template_id") if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl: return ("+".join(str(s) for s in sids), tpl) raise SystemExit( f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable " f"(unit_id, frame_template_id); path={p}" ) def _percentile(values: list[float], pct: float) -> float: if not values: return float("nan") if len(values) == 1: return values[0] s = sorted(values) k = (len(s) - 1) * pct lo = int(k) hi = min(lo + 1, len(s) - 1) return s[lo] + (s[hi] - s[lo]) * (k - lo) def main() -> int: ap = argparse.ArgumentParser( prog="python -m scripts.measure_reuse_savings", description="Measure IMP-43 --reuse-from wall-clock savings.", ) ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against") ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)") ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds") args = ap.parse_args() if not args.mdx_path.is_file(): sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n") return 2 iterations: list[dict] = [] for i in range(args.iterations): seed_id = _unique_run_id(f"seed{i}") cp_a, t_a = _spawn([str(args.mdx_path), seed_id], args.timeout) _assert_ok(f"(A) seed iter={i}", cp_a) unit_id, tpl_id = _discover_first_frame_pin(seed_id) override = ["--override-frame", f"{unit_id}={tpl_id}"] full_id = _unique_run_id(f"full{i}") cp_b, t_b = _spawn([str(args.mdx_path), full_id, *override], args.timeout) _assert_ok(f"(B) full rerun iter={i}", cp_b) reuse_id = _unique_run_id(f"reuse{i}") cp_c, t_c = _spawn( [str(args.mdx_path), reuse_id, "--reuse-from", seed_id, *override], args.timeout, ) _assert_ok(f"(C) reuse iter={i}", cp_c) iterations.append({ "iter": i, "seed_run_id": seed_id, "full_run_id": full_id, "reuse_run_id": reuse_id, "override_frame": f"{unit_id}={tpl_id}", "seed_seconds": t_a, "full_rerun_seconds": t_b, "reuse_seconds": t_c, }) full_times = [it["full_rerun_seconds"] for it in iterations] reuse_times = [it["reuse_seconds"] for it in iterations] summary = { "mdx_path": str(args.mdx_path), "iterations_count": len(iterations), "full_rerun_seconds_p50": _percentile(full_times, 0.50), "full_rerun_seconds_p95": _percentile(full_times, 0.95), "reuse_seconds_p50": _percentile(reuse_times, 0.50), "reuse_seconds_p95": _percentile(reuse_times, 0.95), "reuse_over_full_ratio_p50": ( _percentile(reuse_times, 0.50) / _percentile(full_times, 0.50) if full_times and statistics.median(full_times) > 0 else float("nan") ), "iterations": iterations, "note": ( "IMP-43 (#72) u8 measurement. Issue-body 50–70% / 10–20s → 3–8s " "claim is NOT honored here — actual numbers depend on host, " "Selenium cold-start, and AI cache state. Update " "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the " "p50/p95 reported here when run on the project's reference host." ), } sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2)) sys.stdout.write("\n") return 0 if __name__ == "__main__": raise SystemExit(main())