Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 25s
u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects
layout/zone_geometry/zone_section/image override axes by name; only
--override-frame is preserved).
u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256
integrity key and {value, source_path, upstream_step} provenance per axis
(pickle forbidden per Stage 2 guardrail).
u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without
aborting the seed run.
u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into
new run_dir, state rehydration, reuse marker, frame-override application on
restored units, Step 7+ resume.
u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot /
mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream
diagnostics per axis.
u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and
CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior.
u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts +
designAgentApi.ts + run_pipeline_reuse_from.test.ts).
u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist =
run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames
sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI
must use -m 'not sweep').
u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame
pin self-discovery + seed-time exclusion; status board §8 TBD anchor
(issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
179 lines
6.5 KiB
Python
179 lines
6.5 KiB
Python
"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.

Argv-driven measurement helper for the Stage 2 §u8 binding contract:
re-derive a realistic savings target instead of mirroring the
unverified issue-body 50–70% / 10–20s → 3–8s claim.

Per-iteration measurement protocol (mirrors the u7a equivalence
harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):

  (A) baseline     full run, no overrides                — reuse seed
  (B) full rerun   full run + one --override-frame pin   — control path
  (C) reuse        --reuse-from <seed> + same pin        — reuse path

Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
The (A) seed run time is reported separately and NOT included in the
B-vs-C comparison (the reuse path's whole point is that the seed
already exists from a prior interactive run).

For each iteration the frame pin is self-discovered from the seed
run's ``step06_composition_plan.json``: the first unit's
``frame_template_id`` is re-pinned to itself, exercising the
``--override-frame`` CLI surface end-to-end without changing the
semantic frame assignment (same approach the u7a/u7b equivalence
tests already lock).

Output: a JSON document to stdout with per-iteration timings,
B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
stdout/stderr tails on non-zero exits.

Guardrails (Stage 2):
  * argv-driven, no hardcoded mdx — caller picks the sample
  * no hardcoded savings target — TBD until measured
  * value + path + upstream provenance lives in the printed JSON
  * does NOT mutate prev_run_dir; new runs land under fresh run_ids
"""

from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import statistics
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
import uuid
|
||
from pathlib import Path
|
||
|
||
# Repository root, derived from this file's own location (two levels up from
# the resolved script path) so it does not depend on the caller's cwd.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Directory holding one sub-directory per pipeline run, keyed by run_id.
RUNS_DIR = REPO_ROOT / "data" / "runs"


def _unique_run_id(prefix: str) -> str:
|
||
return f"{prefix}_imp43_u8_{uuid.uuid4().hex[:8]}"
|
||
|
||
|
||
def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
    """Run ``python -m src.phase_z2_pipeline`` once and time it.

    Args:
        extra_args: Arguments appended after the module name on the child's
            command line.
        timeout: Maximum number of seconds the child may run.

    Returns:
        ``(completed_process, elapsed_seconds)`` where the elapsed time is the
        ``time.perf_counter()`` delta around the ``subprocess.run`` call.
        stdout/stderr are captured as text on the returned process object.

    Raises:
        subprocess.TimeoutExpired: If the child exceeds *timeout* seconds.
    """
    start = time.perf_counter()
    cp = subprocess.run(
        [sys.executable, "-m", "src.phase_z2_pipeline", *extra_args],
        capture_output=True,
        text=True,
        timeout=timeout,
        cwd=str(REPO_ROOT),
        # Non-zero exits are reported by the caller, which inspects
        # ``returncode`` itself; do not raise CalledProcessError here.
        check=False,
    )
    return cp, time.perf_counter() - start


def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
|
||
if cp.returncode != 0:
|
||
sys.stderr.write(
|
||
f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n"
|
||
f"--- stderr tail ---\n{cp.stderr[-2000:]}\n"
|
||
f"--- stdout tail ---\n{cp.stdout[-2000:]}\n"
|
||
)
|
||
raise SystemExit(2)
|
||
|
||
|
||
def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
    """Pick the first pinnable ``(unit_id, frame_template_id)`` of a seed run.

    Reads ``step06_composition_plan.json`` from the seed run's
    ``phase_z2/steps`` directory under ``RUNS_DIR`` and scans
    ``data.selected_units`` in order. A unit is pinnable when it has a
    non-empty ``source_section_ids`` list and a non-empty string
    ``frame_template_id``.

    Args:
        seed_run_id: Run id of the already-completed seed run.

    Returns:
        ``(unit_id, frame_template_id)`` where ``unit_id`` is the unit's
        section ids joined with ``"+"``.

    Raises:
        SystemExit: If no unit in the plan is pinnable (including when the
            plan does not have the expected object/list shape).
        OSError, json.JSONDecodeError: If the plan file cannot be read or
            parsed.
    """
    p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
    payload = json.loads(p.read_text(encoding="utf-8"))

    # Tolerate a missing/null "data" or "selected_units" and non-dict entries:
    # all of them mean "nothing pinnable" and should end in the diagnostic
    # SystemExit below rather than an AttributeError traceback.
    data = payload.get("data") if isinstance(payload, dict) else None
    units = data.get("selected_units") if isinstance(data, dict) else None
    for unit in units or []:
        if not isinstance(unit, dict):
            continue
        sids = unit.get("source_section_ids") or []
        tpl = unit.get("frame_template_id")
        if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl:
            return ("+".join(str(s) for s in sids), tpl)

    raise SystemExit(
        f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
        f"(unit_id, frame_template_id); path={p}"
    )


def _percentile(values: list[float], pct: float) -> float:
|
||
if not values:
|
||
return float("nan")
|
||
if len(values) == 1:
|
||
return values[0]
|
||
s = sorted(values)
|
||
k = (len(s) - 1) * pct
|
||
lo = int(k)
|
||
hi = min(lo + 1, len(s) - 1)
|
||
return s[lo] + (s[hi] - s[lo]) * (k - lo)
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Measure full-rerun vs ``--reuse-from`` wall-clock time and print JSON.

    For each iteration three pipeline subprocesses are run in order: (A) a
    seed run without overrides, (B) a full run with one ``--override-frame``
    pin discovered from the seed's step06 plan, and (C) a ``--reuse-from``
    run against the seed with the same pin. Per-iteration timings plus the
    B/C p50, p95 and the C/B p50 ratio are written to stdout as one JSON
    document. The seed time is reported but excluded from the B/C summary.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` on success; ``2`` when the mdx file does not exist or
        ``--iterations`` is smaller than 1.

    Raises:
        SystemExit: From argparse on bad arguments, or with code 2 when any
            pipeline subprocess fails or the seed has no pinnable unit.
    """
    ap = argparse.ArgumentParser(
        prog="python -m scripts.measure_reuse_savings",
        description="Measure IMP-43 --reuse-from wall-clock savings.",
    )
    ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
    ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
    ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
    args = ap.parse_args(argv)

    if not args.mdx_path.is_file():
        sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
        return 2
    if args.iterations < 1:
        # Zero trials would yield a summary made only of NaN values, which
        # json.dumps emits as the non-standard token ``NaN``.
        sys.stderr.write(
            f"[measure_reuse_savings] --iterations must be >= 1, got {args.iterations}\n"
        )
        return 2

    # The pipeline subprocess is started with the repository root as its
    # working directory, while the existence check above used the caller's
    # cwd. Pass an absolute path so both agree on which file is meant.
    mdx_arg = str(args.mdx_path.resolve())

    iterations: list[dict] = []
    for i in range(args.iterations):
        # (A) seed: full run without overrides; source for the reuse path.
        seed_id = _unique_run_id(f"seed{i}")
        cp_a, t_a = _spawn([mdx_arg, seed_id], args.timeout)
        _assert_ok(f"(A) seed iter={i}", cp_a)

        # Re-pin the first unit to the frame it already has, so the override
        # surface is exercised without changing the frame assignment.
        unit_id, tpl_id = _discover_first_frame_pin(seed_id)
        override = ["--override-frame", f"{unit_id}={tpl_id}"]

        # (B) control: full run with the pin.
        full_id = _unique_run_id(f"full{i}")
        cp_b, t_b = _spawn([mdx_arg, full_id, *override], args.timeout)
        _assert_ok(f"(B) full rerun iter={i}", cp_b)

        # (C) reuse: same pin, resumed from the seed run.
        reuse_id = _unique_run_id(f"reuse{i}")
        cp_c, t_c = _spawn(
            [mdx_arg, reuse_id, "--reuse-from", seed_id, *override],
            args.timeout,
        )
        _assert_ok(f"(C) reuse iter={i}", cp_c)

        iterations.append({
            "iter": i,
            "seed_run_id": seed_id,
            "full_run_id": full_id,
            "reuse_run_id": reuse_id,
            "override_frame": f"{unit_id}={tpl_id}",
            "seed_seconds": t_a,
            "full_rerun_seconds": t_b,
            "reuse_seconds": t_c,
        })

    full_times = [it["full_rerun_seconds"] for it in iterations]
    reuse_times = [it["reuse_seconds"] for it in iterations]
    full_p50 = _percentile(full_times, 0.50)
    reuse_p50 = _percentile(reuse_times, 0.50)

    summary = {
        "mdx_path": str(args.mdx_path),
        "iterations_count": len(iterations),
        "full_rerun_seconds_p50": full_p50,
        "full_rerun_seconds_p95": _percentile(full_times, 0.95),
        "reuse_seconds_p50": reuse_p50,
        "reuse_seconds_p95": _percentile(reuse_times, 0.95),
        # Guard the value that is actually divided by; ``null`` keeps the
        # document valid JSON if the ratio is undefined.
        "reuse_over_full_ratio_p50": (
            reuse_p50 / full_p50 if full_p50 > 0 else None
        ),
        "iterations": iterations,
        "note": (
            "IMP-43 (#72) u8 measurement. Issue-body 50–70% / 10–20s → 3–8s "
            "claim is NOT honored here — actual numbers depend on host, "
            "Selenium cold-start, and AI cache state. Update "
            "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
            "p50/p95 reported here when run on the project's reference host."
        ),
    }
    sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
    sys.stdout.write("\n")
    return 0


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())