feat(#72): IMP-43 u1~u8 --reuse-from incremental rerun (Step 0/1/2/5/6 reuse + Step 7+ re-execute)
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 25s

u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects
layout/zone_geometry/zone_section/image override axes by name; only
--override-frame is preserved).
u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256
integrity key and {value, source_path, upstream_step} provenance per axis
(pickle forbidden per Stage 2 guardrail).
u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without
aborting the seed run.
u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into
new run_dir, state rehydration, reuse marker, frame-override application on
restored units, Step 7+ resume.
u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot /
mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream
diagnostics per axis.
u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and
CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior.
u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts +
designAgentApi.ts + run_pipeline_reuse_from.test.ts).
u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist =
run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames
sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI
must use -m 'not sweep').
u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame
pin self-discovery + seed-time exclusion; status board §8 TBD anchor
(issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-24 22:44:27 +09:00
parent 8648a468d9
commit b4be6c1cd0
15 changed files with 5128 additions and 656 deletions

View File

@@ -0,0 +1,178 @@
"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.
Argv-driven measurement helper for the Stage 2 §u8 binding contract:
re-derive a realistic savings target instead of mirroring the
unverified issue-body 50-70% / 10-20s → 3-8s claim.
Per-iteration measurement protocol (mirrors the u7a equivalence
harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):
(A) baseline full run, no overrides — reuse seed
(B) full rerun full run + one --override-frame pin — control path
(C) reuse --reuse-from <seed> + same pin — reuse path
Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
The (A) seed run time is reported separately and NOT included in the
B-vs-C comparison (the reuse path's whole point is that the seed
already exists from a prior interactive run).
For each iteration the frame pin is self-discovered from the seed
run's ``step06_composition_plan.json``: the first unit's
``frame_template_id`` is re-pinned to itself, exercising the
``--override-frame`` CLI surface end-to-end without changing the
semantic frame assignment (same approach the u7a/u7b equivalence
tests already lock).
Output: a JSON document to stdout with per-iteration timings,
B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
stdout/stderr tails on non-zero exits.
Guardrails (Stage 2):
* argv-driven, no hardcoded mdx — caller picks the sample
* no hardcoded savings target — TBD until measured
* value + path + upstream provenance lives in the printed JSON
* does NOT mutate prev_run_dir; new runs land under fresh run_ids
"""
from __future__ import annotations
import argparse
import json
import statistics
import subprocess
import sys
import time
import uuid
from pathlib import Path
# Directory one level above the folder that contains this script; child
# pipeline runs are launched with this directory as their working directory.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Per-run artifact root; a seed run's step06 composition plan is read from
# ``RUNS_DIR / <run_id> / phase_z2 / steps``.
RUNS_DIR = REPO_ROOT / "data" / "runs"
def _unique_run_id(prefix: str) -> str:
return f"{prefix}_imp43_u8_{uuid.uuid4().hex[:8]}"
def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
    """Run ``src.phase_z2_pipeline`` in a child interpreter and time the call.

    Args:
        extra_args: Arguments appended after ``python -m src.phase_z2_pipeline``.
        timeout: Seconds passed to ``subprocess.run`` as its ``timeout``.

    Returns:
        ``(completed_process, elapsed_seconds)``. stdout/stderr are captured
        as text; elapsed time is measured with ``time.perf_counter()``.

    Raises:
        subprocess.TimeoutExpired: Propagated from ``subprocess.run`` when the
            child does not finish within *timeout* seconds.
    """
    # The clock starts before the command list is assembled, so the measured
    # interval covers exactly the same work as a single inline run() call.
    began = time.perf_counter()
    command = [sys.executable, "-m", "src.phase_z2_pipeline"]
    command.extend(extra_args)
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=str(REPO_ROOT),
    )
    elapsed = time.perf_counter() - began
    return completed, elapsed
def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
if cp.returncode != 0:
sys.stderr.write(
f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n"
f"--- stderr tail ---\n{cp.stderr[-2000:]}\n"
f"--- stdout tail ---\n{cp.stdout[-2000:]}\n"
)
raise SystemExit(2)
def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
    """Pick the first pinnable unit from a seed run's step06 composition plan.

    Reads ``step06_composition_plan.json`` under the seed run's
    ``phase_z2/steps`` directory and scans ``data.selected_units`` in order.
    A unit is pinnable when ``source_section_ids`` is a non-empty list and
    ``frame_template_id`` is a non-empty string.

    Args:
        seed_run_id: Run id whose artifacts live under ``RUNS_DIR``.

    Returns:
        ``(unit_id, frame_template_id)`` where ``unit_id`` is the unit's
        section ids joined with ``"+"``.

    Raises:
        SystemExit: When no unit in the plan is pinnable.
    """
    plan_path = (
        RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
    )
    plan = json.loads(plan_path.read_text(encoding="utf-8"))
    units = plan.get("data", {}).get("selected_units") or []

    for unit in units:
        section_ids = unit.get("source_section_ids") or []
        template_id = unit.get("frame_template_id")
        if not (isinstance(section_ids, list) and section_ids):
            continue
        if not (isinstance(template_id, str) and template_id):
            continue
        return "+".join(map(str, section_ids)), template_id

    raise SystemExit(
        f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
        f"(unit_id, frame_template_id); path={plan_path}"
    )
def _percentile(values: list[float], pct: float) -> float:
if not values:
return float("nan")
if len(values) == 1:
return values[0]
s = sorted(values)
k = (len(s) - 1) * pct
lo = int(k)
hi = min(lo + 1, len(s) - 1)
return s[lo] + (s[hi] - s[lo]) * (k - lo)
def main() -> int:
    """Run the A/B/C timing protocol and print a JSON summary to stdout.

    For each iteration three pipeline runs are launched, each under a fresh
    run id:

    * (A) seed: full run, no overrides.
    * (B) full rerun: full run plus one ``--override-frame`` pin.
    * (C) reuse: ``--reuse-from <seed>`` plus the same pin.

    The pin is discovered from the seed run's step06 plan. Seed time is
    reported per iteration but is not part of the B-vs-C summary.

    Returns:
        ``0`` after the summary is printed; ``2`` when the MDX path is not a
        file or ``--iterations`` is smaller than 1.

    Raises:
        SystemExit: With code 2 (via ``_assert_ok`` /
            ``_discover_first_frame_pin``) when a child run fails or the seed
            plan has no pinnable unit.
    """
    ap = argparse.ArgumentParser(
        prog="python -m scripts.measure_reuse_savings",
        description="Measure IMP-43 --reuse-from wall-clock savings.",
    )
    ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
    ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
    ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
    args = ap.parse_args()

    if not args.mdx_path.is_file():
        sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
        return 2
    # Fail closed: with zero trials every percentile would be NaN, which
    # json.dumps renders as the non-standard token ``NaN`` while the script
    # still exits 0 — a "successful" measurement with no data.
    if args.iterations < 1:
        sys.stderr.write(
            f"[measure_reuse_savings] --iterations must be >= 1, "
            f"got {args.iterations}\n"
        )
        return 2

    iterations: list[dict] = []
    for i in range(args.iterations):
        # (A) seed run: provides both the reuse source and the frame pin.
        seed_id = _unique_run_id(f"seed{i}")
        cp_a, t_a = _spawn([str(args.mdx_path), seed_id], args.timeout)
        _assert_ok(f"(A) seed iter={i}", cp_a)

        unit_id, tpl_id = _discover_first_frame_pin(seed_id)
        pin = f"{unit_id}={tpl_id}"
        override = ["--override-frame", pin]

        # (B) control: full pipeline with the pin applied.
        full_id = _unique_run_id(f"full{i}")
        cp_b, t_b = _spawn([str(args.mdx_path), full_id, *override], args.timeout)
        _assert_ok(f"(B) full rerun iter={i}", cp_b)

        # (C) reuse path: same pin, resuming from the seed run.
        reuse_id = _unique_run_id(f"reuse{i}")
        cp_c, t_c = _spawn(
            [str(args.mdx_path), reuse_id, "--reuse-from", seed_id, *override],
            args.timeout,
        )
        _assert_ok(f"(C) reuse iter={i}", cp_c)

        iterations.append({
            "iter": i,
            "seed_run_id": seed_id,
            "full_run_id": full_id,
            "reuse_run_id": reuse_id,
            "override_frame": pin,
            "seed_seconds": t_a,
            "full_rerun_seconds": t_b,
            "reuse_seconds": t_c,
        })

    full_times = [it["full_rerun_seconds"] for it in iterations]
    reuse_times = [it["reuse_seconds"] for it in iterations]

    # Compute each p50 once and reuse it for both the report and the ratio,
    # so the guard and the divisor are guaranteed to be the same number.
    full_p50 = _percentile(full_times, 0.50)
    reuse_p50 = _percentile(reuse_times, 0.50)
    # ``None`` serialises to JSON ``null``; NaN would produce invalid JSON.
    ratio_p50 = reuse_p50 / full_p50 if full_p50 > 0 else None

    summary = {
        "mdx_path": str(args.mdx_path),
        "iterations_count": len(iterations),
        "full_rerun_seconds_p50": full_p50,
        "full_rerun_seconds_p95": _percentile(full_times, 0.95),
        "reuse_seconds_p50": reuse_p50,
        "reuse_seconds_p95": _percentile(reuse_times, 0.95),
        "reuse_over_full_ratio_p50": ratio_p50,
        "iterations": iterations,
        "note": (
            "IMP-43 (#72) u8 measurement. Issue-body 50-70% / 10-20s → 3-8s "
            "claim is NOT honored here — actual numbers depend on host, "
            "Selenium cold-start, and AI cache state. Update "
            "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
            "p50/p95 reported here when run on the project's reference host."
        ),
    }
    sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
    sys.stdout.write("\n")
    return 0
# Script entry point: propagate main()'s integer return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())