feat(#72): IMP-43 u1~u8 --reuse-from incremental rerun (Step 0/1/2/5/6 reuse + Step 7+ re-execute)
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 25s
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 25s
u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects
layout/zone_geometry/zone_section/image override axes by name; only
--override-frame is preserved).
u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256
integrity key and {value, source_path, upstream_step} provenance per axis
(pickle forbidden per Stage 2 guardrail).
u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without
aborting the seed run.
u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into
new run_dir, state rehydration, reuse marker, frame-override application on
restored units, Step 7+ resume.
u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot /
mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream
diagnostics per axis.
u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and
CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior.
u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts +
designAgentApi.ts + run_pipeline_reuse_from.test.ts).
u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist =
run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames
sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI
must use -m 'not sweep').
u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame
pin self-discovery + seed-time exclusion; status board §8 TBD anchor
(issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
178
scripts/measure_reuse_savings.py
Normal file
178
scripts/measure_reuse_savings.py
Normal file
@@ -0,0 +1,178 @@
"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.

Argv-driven measurement helper for the Stage 2 §u8 binding contract:
re-derive a realistic savings target instead of mirroring the
unverified issue-body 50–70% / 10–20s → 3–8s claim.

Per-iteration measurement protocol (mirrors the u7a equivalence
harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):

  (A) baseline     full run, no overrides                 — reuse seed
  (B) full rerun   full run + one --override-frame pin    — control path
  (C) reuse        --reuse-from <seed> + same pin         — reuse path

Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
The (A) seed run time is reported separately and NOT included in the
B-vs-C comparison (the reuse path's whole point is that the seed
already exists from a prior interactive run).

For each iteration the frame pin is self-discovered from the seed
run's ``step06_composition_plan.json``: the first pinnable unit's
``frame_template_id`` is re-pinned to itself, exercising the
``--override-frame`` CLI surface end-to-end without changing the
semantic frame assignment (same approach the u7a/u7b equivalence
tests already lock).

Output: a JSON document to stdout with per-iteration timings,
B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
stdout/stderr tails on non-zero exits.

Guardrails (Stage 2):
  * argv-driven, no hardcoded mdx — caller picks the sample
  * no hardcoded savings target — TBD until measured
  * value + path + upstream provenance lives in the printed JSON
  * does NOT mutate prev_run_dir; new runs land under fresh run_ids
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
# Repository root: the parent of the directory that contains this script
# (second ancestor of the resolved file path).
REPO_ROOT = Path(__file__).resolve().parents[1]

# Directory under which every pipeline run gets its own ``<run_id>`` folder.
RUNS_DIR = REPO_ROOT.joinpath("data", "runs")
|
||||
|
||||
|
||||
def _unique_run_id(prefix: str) -> str:
    """Return a fresh run id shaped ``<prefix>_imp43_u8_<8 hex chars>``.

    The trailing token is the first eight hex digits of a random UUID4.
    """
    token = uuid.uuid4().hex[:8]
    return f"{prefix}_imp43_u8_{token}"
|
||||
|
||||
|
||||
def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
    """Run ``python -m src.phase_z2_pipeline`` with *extra_args* and time it.

    The child is started with the current interpreter, with ``REPO_ROOT`` as
    its working directory, and its stdout/stderr are captured as text.

    Args:
        extra_args: arguments appended after the module name.
        timeout: seconds handed to ``subprocess.run`` as its timeout.

    Returns:
        ``(completed_process, elapsed_seconds)``; the elapsed value is a
        ``time.perf_counter()`` delta taken around the ``subprocess.run`` call.

    Raises:
        subprocess.TimeoutExpired: propagated unchanged from
            ``subprocess.run`` when the timeout elapses.
    """
    t0 = time.perf_counter()
    command = [sys.executable, "-m", "src.phase_z2_pipeline"]
    command.extend(extra_args)
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=str(REPO_ROOT),
    )
    elapsed = time.perf_counter() - t0
    return completed, elapsed
|
||||
|
||||
|
||||
def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
    """Abort the measurement if a pipeline subprocess exited non-zero.

    A zero return code is a no-op. Otherwise a report naming *label* and the
    return code, followed by the last 2000 characters of the captured stderr
    and stdout, is written to this process's stderr.

    Raises:
        SystemExit: with code 2 when ``cp.returncode`` is non-zero.
    """
    if cp.returncode == 0:
        return

    report = "".join(
        (
            f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n",
            f"--- stderr tail ---\n{cp.stderr[-2000:]}\n",
            f"--- stdout tail ---\n{cp.stdout[-2000:]}\n",
        )
    )
    sys.stderr.write(report)
    raise SystemExit(2)
|
||||
|
||||
|
||||
def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
|
||||
p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
|
||||
payload = json.loads(p.read_text(encoding="utf-8"))
|
||||
for u in payload.get("data", {}).get("selected_units") or []:
|
||||
sids = u.get("source_section_ids") or []
|
||||
tpl = u.get("frame_template_id")
|
||||
if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl:
|
||||
return ("+".join(str(s) for s in sids), tpl)
|
||||
raise SystemExit(
|
||||
f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
|
||||
f"(unit_id, frame_template_id); path={p}"
|
||||
)
|
||||
|
||||
|
||||
def _percentile(values: list[float], pct: float) -> float:
    """Return the *pct* quantile of *values* using linear interpolation.

    The sample is sorted and the fractional rank ``(n - 1) * pct`` is
    interpolated between its two neighbouring order statistics.

    Args:
        values: sample to summarise; not modified.
        pct: quantile as a fraction in ``[0, 1]`` (``0.95`` for p95).

    Returns:
        The interpolated quantile; ``nan`` for an empty sample; the sole
        element for a one-element sample.

    Raises:
        ValueError: if *pct* is NaN or outside ``[0, 1]``. Without this check
            a value above 1 indexes past the end of the sorted sample and a
            negative value silently extrapolates below the minimum.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0.0 <= pct <= 1.0:
        raise ValueError(f"pct must be within [0, 1], got {pct!r}")
    if not values:
        return float("nan")
    if len(values) == 1:
        return values[0]

    s = sorted(values)
    k = (len(s) - 1) * pct
    lo = int(k)
    # Clamp the upper neighbour so pct == 1.0 stays in range.
    hi = min(lo + 1, len(s) - 1)
    return s[lo] + (s[hi] - s[lo]) * (k - lo)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser(
|
||||
prog="python -m scripts.measure_reuse_savings",
|
||||
description="Measure IMP-43 --reuse-from wall-clock savings.",
|
||||
)
|
||||
ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
|
||||
ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
|
||||
ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
|
||||
args = ap.parse_args()
|
||||
|
||||
if not args.mdx_path.is_file():
|
||||
sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
|
||||
return 2
|
||||
|
||||
iterations: list[dict] = []
|
||||
for i in range(args.iterations):
|
||||
seed_id = _unique_run_id(f"seed{i}")
|
||||
cp_a, t_a = _spawn([str(args.mdx_path), seed_id], args.timeout)
|
||||
_assert_ok(f"(A) seed iter={i}", cp_a)
|
||||
|
||||
unit_id, tpl_id = _discover_first_frame_pin(seed_id)
|
||||
override = ["--override-frame", f"{unit_id}={tpl_id}"]
|
||||
|
||||
full_id = _unique_run_id(f"full{i}")
|
||||
cp_b, t_b = _spawn([str(args.mdx_path), full_id, *override], args.timeout)
|
||||
_assert_ok(f"(B) full rerun iter={i}", cp_b)
|
||||
|
||||
reuse_id = _unique_run_id(f"reuse{i}")
|
||||
cp_c, t_c = _spawn(
|
||||
[str(args.mdx_path), reuse_id, "--reuse-from", seed_id, *override],
|
||||
args.timeout,
|
||||
)
|
||||
_assert_ok(f"(C) reuse iter={i}", cp_c)
|
||||
|
||||
iterations.append({
|
||||
"iter": i,
|
||||
"seed_run_id": seed_id,
|
||||
"full_run_id": full_id,
|
||||
"reuse_run_id": reuse_id,
|
||||
"override_frame": f"{unit_id}={tpl_id}",
|
||||
"seed_seconds": t_a,
|
||||
"full_rerun_seconds": t_b,
|
||||
"reuse_seconds": t_c,
|
||||
})
|
||||
|
||||
full_times = [it["full_rerun_seconds"] for it in iterations]
|
||||
reuse_times = [it["reuse_seconds"] for it in iterations]
|
||||
|
||||
summary = {
|
||||
"mdx_path": str(args.mdx_path),
|
||||
"iterations_count": len(iterations),
|
||||
"full_rerun_seconds_p50": _percentile(full_times, 0.50),
|
||||
"full_rerun_seconds_p95": _percentile(full_times, 0.95),
|
||||
"reuse_seconds_p50": _percentile(reuse_times, 0.50),
|
||||
"reuse_seconds_p95": _percentile(reuse_times, 0.95),
|
||||
"reuse_over_full_ratio_p50": (
|
||||
_percentile(reuse_times, 0.50) / _percentile(full_times, 0.50)
|
||||
if full_times and statistics.median(full_times) > 0
|
||||
else float("nan")
|
||||
),
|
||||
"iterations": iterations,
|
||||
"note": (
|
||||
"IMP-43 (#72) u8 measurement. Issue-body 50–70% / 10–20s → 3–8s "
|
||||
"claim is NOT honored here — actual numbers depend on host, "
|
||||
"Selenium cold-start, and AI cache state. Update "
|
||||
"docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
|
||||
"p50/p95 reported here when run on the project's reference host."
|
||||
),
|
||||
}
|
||||
sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
|
||||
sys.stdout.write("\n")
|
||||
return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user