feat(#72): IMP-43 u1~u8 --reuse-from incremental rerun (Step 0/1/2/5/6 reuse + Step 7+ re-execute)

u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects layout/zone_geometry/zone_section/image override axes by name; only --override-frame is preserved). u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256 integrity key and {value, source_path, upstream_step} provenance per axis (pickle forbidden per Stage 2 guardrail). u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without aborting the seed run. u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into new run_dir, state rehydration, reuse marker, frame-override application on restored units, Step 7+ resume. u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot / mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream diagnostics per axis. u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior. u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts + designAgentApi.ts + run_pipeline_reuse_from.test.ts). u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist = run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI must use -m 'not sweep'). u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame pin self-discovery + seed-time exclusion; status board §8 TBD anchor (issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 22:44:27 +09:00
parent 8648a468d9
commit b4be6c1cd0
15 changed files with 5128 additions and 656 deletions
--- a/scripts/measure_reuse_savings.py
+++ b/scripts/measure_reuse_savings.py
@@ -0,0 +1,178 @@
+"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.
+
+Argv-driven measurement helper for the Stage 2 §u8 binding contract:
+re-derive a realistic savings target instead of mirroring the
+unverified issue-body 50–70% / 10–20s → 3–8s claim.
+
+Per-iteration measurement protocol (mirrors the u7a equivalence
+harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):
+
+  (A) baseline   full run, no overrides              — reuse seed
+  (B) full rerun full run + one --override-frame pin — control path
+  (C) reuse      --reuse-from <seed> + same pin      — reuse path
+
+Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
+The (A) seed run time is reported separately and NOT included in the
+B-vs-C comparison (the reuse path's whole point is that the seed
+already exists from a prior interactive run).
+
+For each iteration the frame pin is self-discovered from the seed
+run's ``step06_composition_plan.json``: the first unit's
+``frame_template_id`` is re-pinned to itself, exercising the
+``--override-frame`` CLI surface end-to-end without changing the
+semantic frame assignment (same approach the u7a/u7b equivalence
+tests already lock).
+
+Output: a JSON document to stdout with per-iteration timings,
+B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
+stdout/stderr tails on non-zero exits.
+
+Guardrails (Stage 2):
+  * argv-driven, no hardcoded mdx — caller picks the sample
+  * no hardcoded savings target — TBD until measured
+  * value + path + upstream provenance lives in the printed JSON
+  * does NOT mutate prev_run_dir; new runs land under fresh run_ids
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import subprocess
+import sys
+import time
+import uuid
+from pathlib import Path
+
+# Repository root: the directory one level above the directory holding this script.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+# Base directory under which per-run artifact directories (keyed by run_id) are looked up.
+RUNS_DIR = REPO_ROOT / "data" / "runs"
+
+
+def _unique_run_id(prefix: str) -> str:
+    """Build a fresh run id: ``<prefix>_imp43_u8_<8 random hex chars>``.
+
+    The random suffix comes from ``uuid.uuid4()`` so that repeated
+    invocations produce distinct run ids.
+    """
+    random_suffix = uuid.uuid4().hex[:8]
+    return f"{prefix}_imp43_u8_{random_suffix}"
+
+
+def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
+    """Run ``python -m src.phase_z2_pipeline`` with ``extra_args`` and time it.
+
+    The child is started with the current interpreter, with its working
+    directory set to ``REPO_ROOT`` and stdout/stderr captured as text.
+
+    Returns:
+        ``(completed_process, elapsed_seconds)`` where the elapsed time is
+        the ``time.perf_counter()`` delta around the ``subprocess.run`` call.
+
+    Raises:
+        subprocess.TimeoutExpired: propagated from ``subprocess.run`` when
+            the child runs longer than ``timeout`` seconds.
+    """
+    began = time.perf_counter()
+    command = [sys.executable, "-m", "src.phase_z2_pipeline"]
+    command.extend(extra_args)
+    completed = subprocess.run(
+        command,
+        cwd=str(REPO_ROOT),
+        timeout=timeout,
+        text=True,
+        capture_output=True,
+    )
+    elapsed = time.perf_counter() - began
+    return completed, elapsed
+
+
+def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
+    """Abort the measurement when a child run exited with a non-zero code.
+
+    On failure, writes ``label``, the return code and the last 2000
+    characters of the child's captured stderr and stdout to this
+    process's stderr, then raises ``SystemExit(2)``. Returns ``None``
+    when ``cp.returncode`` is 0.
+    """
+    if cp.returncode == 0:
+        return
+    report = (
+        f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n"
+        f"--- stderr tail ---\n{cp.stderr[-2000:]}\n"
+        f"--- stdout tail ---\n{cp.stdout[-2000:]}\n"
+    )
+    sys.stderr.write(report)
+    raise SystemExit(2)
+
+
+def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
+    """Find the first pinnable ``(unit_id, frame_template_id)`` of a seed run.
+
+    Reads ``step06_composition_plan.json`` from the seed run's
+    ``phase_z2/steps`` directory under ``RUNS_DIR`` and scans
+    ``data.selected_units`` in order. The first entry that has a non-empty
+    ``source_section_ids`` list and a non-empty string ``frame_template_id``
+    wins; its unit id is the section ids joined with ``"+"``.
+
+    Raises:
+        SystemExit: when the step06 file is missing, unreadable or not
+            valid JSON, or when no entry is pinnable. The message always
+            names the seed run id and the file path so the failure can be
+            traced without a Python traceback.
+    """
+    p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
+    try:
+        payload = json.loads(p.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        # OSError: missing/unreadable file. ValueError: covers both
+        # json.JSONDecodeError and UnicodeDecodeError.
+        raise SystemExit(
+            f"[measure_reuse_savings] seed {seed_run_id} step06 unreadable "
+            f"({type(exc).__name__}: {exc}); path={p}"
+        ) from exc
+
+    # Tolerate unexpected shapes (null "data", non-object top level,
+    # non-dict units) and fall through to the "no pinnable" diagnostic
+    # instead of failing with an AttributeError.
+    data = payload.get("data") if isinstance(payload, dict) else None
+    units = data.get("selected_units") if isinstance(data, dict) else None
+    for u in units if isinstance(units, list) else []:
+        if not isinstance(u, dict):
+            continue
+        sids = u.get("source_section_ids") or []
+        tpl = u.get("frame_template_id")
+        if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl:
+            return ("+".join(str(s) for s in sids), tpl)
+    raise SystemExit(
+        f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
+        f"(unit_id, frame_template_id); path={p}"
+    )
+
+
+def _percentile(values: list[float], pct: float) -> float:
+    """Return the ``pct`` quantile of ``values`` using linear interpolation.
+
+    ``pct`` is a fraction (callers in this file pass 0.50 and 0.95). An
+    empty list yields NaN and a single-element list yields that element.
+    Otherwise the fractional rank ``(n - 1) * pct`` is interpolated between
+    the two neighbouring values of the sorted data.
+    """
+    count = len(values)
+    if count == 0:
+        return float("nan")
+    if count == 1:
+        return values[0]
+    ordered = sorted(values)
+    last = count - 1
+    rank = last * pct
+    below = int(rank)
+    above = min(below + 1, last)
+    weight = rank - below
+    return ordered[below] + (ordered[above] - ordered[below]) * weight
+
+
+def main() -> int:
+    """Run the A/B/C measurement loop and print a JSON summary to stdout.
+
+    For each iteration: (A) a seed run, (B) a full rerun with one
+    ``--override-frame`` pin discovered from the seed's step06 plan, and
+    (C) a ``--reuse-from <seed>`` run with the same pin. Wall-clock times
+    for all three are recorded per iteration; only B and C feed the
+    p50/p95 summary and the reuse/full ratio.
+
+    Returns:
+        0 on success, 2 when the mdx path is not an existing file.
+
+    Raises:
+        SystemExit: via ``argparse`` for invalid arguments (including
+            ``--iterations`` below 1), and via the helper functions when a
+            child run fails or the seed has no pinnable frame.
+    """
+    ap = argparse.ArgumentParser(
+        prog="python -m scripts.measure_reuse_savings",
+        description="Measure IMP-43 --reuse-from wall-clock savings.",
+    )
+    ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
+    ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
+    ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
+    args = ap.parse_args()
+
+    # Zero or negative trials would leave the timing lists empty, turning
+    # every percentile into NaN, which json.dumps renders as a non-JSON token.
+    if args.iterations < 1:
+        ap.error("--iterations must be >= 1")
+
+    if not args.mdx_path.is_file():
+        sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
+        return 2
+
+    # Child runs execute with cwd=REPO_ROOT, so a path relative to the
+    # caller's working directory must be made absolute before it is passed on.
+    mdx_arg = str(args.mdx_path.resolve())
+
+    iterations: list[dict] = []
+    for i in range(args.iterations):
+        seed_id = _unique_run_id(f"seed{i}")
+        cp_a, t_a = _spawn([mdx_arg, seed_id], args.timeout)
+        _assert_ok(f"(A) seed iter={i}", cp_a)
+
+        # Re-pin the first unit to the frame it already has, so B and C
+        # exercise the override path with the same pin.
+        unit_id, tpl_id = _discover_first_frame_pin(seed_id)
+        pin = f"{unit_id}={tpl_id}"
+        override = ["--override-frame", pin]
+
+        full_id = _unique_run_id(f"full{i}")
+        cp_b, t_b = _spawn([mdx_arg, full_id, *override], args.timeout)
+        _assert_ok(f"(B) full rerun iter={i}", cp_b)
+
+        reuse_id = _unique_run_id(f"reuse{i}")
+        cp_c, t_c = _spawn(
+            [mdx_arg, reuse_id, "--reuse-from", seed_id, *override],
+            args.timeout,
+        )
+        _assert_ok(f"(C) reuse iter={i}", cp_c)
+
+        iterations.append({
+            "iter": i,
+            "seed_run_id": seed_id,
+            "full_run_id": full_id,
+            "reuse_run_id": reuse_id,
+            "override_frame": pin,
+            "seed_seconds": t_a,
+            "full_rerun_seconds": t_b,
+            "reuse_seconds": t_c,
+        })
+
+    full_times = [it["full_rerun_seconds"] for it in iterations]
+    reuse_times = [it["reuse_seconds"] for it in iterations]
+
+    # Compute each p50 once so the zero-guard and the division use the
+    # same value; an undefined ratio is reported as JSON null, not NaN.
+    full_p50 = _percentile(full_times, 0.50)
+    reuse_p50 = _percentile(reuse_times, 0.50)
+    ratio_p50 = reuse_p50 / full_p50 if full_p50 > 0 else None
+
+    summary = {
+        "mdx_path": mdx_arg,
+        "iterations_count": len(iterations),
+        "full_rerun_seconds_p50": full_p50,
+        "full_rerun_seconds_p95": _percentile(full_times, 0.95),
+        "reuse_seconds_p50": reuse_p50,
+        "reuse_seconds_p95": _percentile(reuse_times, 0.95),
+        "reuse_over_full_ratio_p50": ratio_p50,
+        "iterations": iterations,
+        "note": (
+            "IMP-43 (#72) u8 measurement. Issue-body 50–70% / 10–20s → 3–8s "
+            "claim is NOT honored here — actual numbers depend on host, "
+            "Selenium cold-start, and AI cache state. Update "
+            "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
+            "p50/p95 reported here when run on the project's reference host."
+        ),
+    }
+    sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
+    sys.stdout.write("\n")
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())