Files
C.E.L_Slide_test2/scripts/measure_reuse_savings.py
kyeongmin b4be6c1cd0
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 25s
feat(#72): IMP-43 u1~u8 --reuse-from incremental rerun (Step 0/1/2/5/6 reuse + Step 7+ re-execute)
u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects
layout/zone_geometry/zone_section/image override axes by name; only
--override-frame is preserved).
u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256
integrity key and {value, source_path, upstream_step} provenance per axis
(pickle forbidden per Stage 2 guardrail).
u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without
aborting the seed run.
u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into
new run_dir, state rehydration, reuse marker, frame-override application on
restored units, Step 7+ resume.
u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot /
mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream
diagnostics per axis.
u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and
CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior.
u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts +
designAgentApi.ts + run_pipeline_reuse_from.test.ts).
u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist =
run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames
sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI
must use -m 'not sweep').
u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame
pin self-discovery + seed-time exclusion; status board §8 TBD anchor
(issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 22:44:27 +09:00

179 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.
Argv-driven measurement helper for the Stage 2 §u8 binding contract:
re-derive a realistic savings target instead of mirroring the
unverified issue-body 50-70% / 10-20s → 3-8s claim.
Per-iteration measurement protocol (mirrors the u7a equivalence
harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):
    (A) baseline     full run, no overrides                — reuse seed
    (B) full rerun   full run + one --override-frame pin   — control path
    (C) reuse        --reuse-from <seed> + same pin        — reuse path
Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
The (A) seed run time is reported separately and NOT included in the
B-vs-C comparison (the reuse path's whole point is that the seed
already exists from a prior interactive run).
For each iteration the frame pin is self-discovered from the seed
run's ``step06_composition_plan.json``: the first unit's
``frame_template_id`` is re-pinned to itself, exercising the
``--override-frame`` CLI surface end-to-end without changing the
semantic frame assignment (same approach the u7a/u7b equivalence
tests already lock).
Output: a JSON document to stdout with per-iteration timings,
B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
stdout/stderr tails on non-zero exits.
Guardrails (Stage 2):
* argv-driven, no hardcoded mdx — caller picks the sample
* no hardcoded savings target — TBD until measured
* value + path + upstream provenance lives in the printed JSON
* does NOT mutate prev_run_dir; new runs land under fresh run_ids
"""
from __future__ import annotations
import argparse
import json
import statistics
import subprocess
import sys
import time
import uuid
from pathlib import Path
# Repository root: the directory one level above the one holding this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Per-run output directories are looked up under this path, keyed by run_id.
RUNS_DIR = REPO_ROOT.joinpath("data", "runs")
def _unique_run_id(prefix: str) -> str:
return f"{prefix}_imp43_u8_{uuid.uuid4().hex[:8]}"
def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
    """Run ``python -m src.phase_z2_pipeline`` in a child process and time it.

    Args:
        extra_args: Arguments appended after the module name.
        timeout: Seconds forwarded to ``subprocess.run`` as its ``timeout``.

    Returns:
        ``(completed_process, elapsed_seconds)``. The elapsed time is the
        ``time.perf_counter()`` delta around the ``subprocess.run`` call;
        the child's stdout/stderr are captured as text.

    Raises:
        subprocess.TimeoutExpired: propagated from ``subprocess.run`` when
            the child does not finish within ``timeout`` seconds.
    """
    began = time.perf_counter()
    # The child runs with the repository root as cwd and the same interpreter
    # that is executing this script.
    completed = subprocess.run(
        [sys.executable, "-m", "src.phase_z2_pipeline"] + list(extra_args),
        cwd=str(REPO_ROOT),
        timeout=timeout,
        text=True,
        capture_output=True,
    )
    elapsed = time.perf_counter() - began
    return completed, elapsed
def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
if cp.returncode != 0:
sys.stderr.write(
f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n"
f"--- stderr tail ---\n{cp.stderr[-2000:]}\n"
f"--- stdout tail ---\n{cp.stdout[-2000:]}\n"
)
raise SystemExit(2)
def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
    """Find the first pinnable ``(unit_id, frame_template_id)`` of a seed run.

    Reads ``step06_composition_plan.json`` from the seed run's
    ``phase_z2/steps`` directory under ``RUNS_DIR`` and scans
    ``data.selected_units`` in order. The first unit that has a non-empty
    ``source_section_ids`` list and a non-empty string ``frame_template_id``
    wins; its unit id is the section ids joined with ``"+"``.

    Args:
        seed_run_id: Run id of the already-completed seed run.

    Returns:
        ``(unit_id, frame_template_id)`` for the first qualifying unit.

    Raises:
        SystemExit: if the step06 file cannot be read or parsed, or if no
            unit qualifies. The message carries the run id and file path.
    """
    p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
    try:
        payload = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError. Fail with a diagnostic
        # instead of a raw traceback.
        raise SystemExit(
            f"[measure_reuse_savings] seed {seed_run_id} step06 unreadable "
            f"({type(exc).__name__}: {exc}); path={p}"
        ) from exc

    # Tolerate unexpected shapes (e.g. "data": null) by treating them as
    # "no units", which falls through to the diagnostic below.
    data = payload.get("data") if isinstance(payload, dict) else None
    units = data.get("selected_units") if isinstance(data, dict) else None
    if not isinstance(units, list):
        units = []

    for u in units:
        if not isinstance(u, dict):
            continue
        sids = u.get("source_section_ids") or []
        tpl = u.get("frame_template_id")
        if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl:
            return ("+".join(str(s) for s in sids), tpl)
    raise SystemExit(
        f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
        f"(unit_id, frame_template_id); path={p}"
    )
def _percentile(values: list[float], pct: float) -> float:
if not values:
return float("nan")
if len(values) == 1:
return values[0]
s = sorted(values)
k = (len(s) - 1) * pct
lo = int(k)
hi = min(lo + 1, len(s) - 1)
return s[lo] + (s[hi] - s[lo]) * (k - lo)
def _timed_step(label: str, extra_args: list[str], timeout: int) -> float:
    """Run one pipeline invocation via ``_spawn`` and return its wall-clock seconds.

    A timeout or a non-zero exit aborts the measurement with
    ``SystemExit(2)`` after a diagnostic has been written to stderr.
    """
    try:
        cp, seconds = _spawn(extra_args, timeout)
    except subprocess.TimeoutExpired:
        sys.stderr.write(
            f"[measure_reuse_savings] {label} timed out after {timeout}s\n"
        )
        raise SystemExit(2) from None
    _assert_ok(label, cp)
    return seconds


def main() -> int:
    """CLI entry point: time full reruns against ``--reuse-from`` reruns.

    For each iteration three pipeline runs are spawned against the same MDX:
    (A) a seed run with no overrides, (B) a full run with one
    ``--override-frame`` pin, and (C) a ``--reuse-from <seed>`` run with the
    same pin. The pin is discovered from the seed run's step06 output.

    A JSON summary (per-iteration timings, B/C p50 and p95, and the C/B p50
    ratio) is written to stdout. ``reuse_over_full_ratio_p50`` is ``null``
    when the full-rerun p50 is not positive, so the output stays valid JSON.

    Returns:
        ``0`` on success, ``2`` when the MDX path is not a file. Invalid
        ``--iterations``, a failed run, or a timed-out run exit via
        ``SystemExit(2)``.
    """
    ap = argparse.ArgumentParser(
        prog="python -m scripts.measure_reuse_savings",
        description="Measure IMP-43 --reuse-from wall-clock savings.",
    )
    ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
    ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
    ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
    args = ap.parse_args()

    # Zero trials would yield NaN percentiles, which json.dumps emits as the
    # non-standard token ``NaN``; reject up front (argparse exits with 2).
    if args.iterations < 1:
        ap.error("--iterations must be >= 1")
    if not args.mdx_path.is_file():
        sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
        return 2

    # The child process runs with cwd=REPO_ROOT, so a path relative to the
    # caller's cwd must be made absolute or it may resolve to another file.
    mdx = str(args.mdx_path.resolve())

    iterations: list[dict] = []
    for i in range(args.iterations):
        # (A) seed run: timed, but reported separately from the B-vs-C comparison.
        seed_id = _unique_run_id(f"seed{i}")
        t_a = _timed_step(f"(A) seed iter={i}", [mdx, seed_id], args.timeout)

        # Re-pin the first unit to the template the seed run already chose.
        unit_id, tpl_id = _discover_first_frame_pin(seed_id)
        pin = f"{unit_id}={tpl_id}"
        override = ["--override-frame", pin]

        # (B) control path: full rerun with the pin.
        full_id = _unique_run_id(f"full{i}")
        t_b = _timed_step(
            f"(B) full rerun iter={i}", [mdx, full_id, *override], args.timeout
        )

        # (C) reuse path: same pin, resuming from the seed run.
        reuse_id = _unique_run_id(f"reuse{i}")
        t_c = _timed_step(
            f"(C) reuse iter={i}",
            [mdx, reuse_id, "--reuse-from", seed_id, *override],
            args.timeout,
        )

        iterations.append({
            "iter": i,
            "seed_run_id": seed_id,
            "full_run_id": full_id,
            "reuse_run_id": reuse_id,
            "override_frame": pin,
            "seed_seconds": t_a,
            "full_rerun_seconds": t_b,
            "reuse_seconds": t_c,
        })

    full_times = [it["full_rerun_seconds"] for it in iterations]
    reuse_times = [it["reuse_seconds"] for it in iterations]
    # Compute each p50 once so the guard and the division use the same value.
    full_p50 = _percentile(full_times, 0.50)
    reuse_p50 = _percentile(reuse_times, 0.50)
    summary = {
        "mdx_path": mdx,
        "iterations_count": len(iterations),
        "full_rerun_seconds_p50": full_p50,
        "full_rerun_seconds_p95": _percentile(full_times, 0.95),
        "reuse_seconds_p50": reuse_p50,
        "reuse_seconds_p95": _percentile(reuse_times, 0.95),
        # None serializes as JSON null; NaN would not be valid JSON.
        "reuse_over_full_ratio_p50": (
            reuse_p50 / full_p50 if full_p50 > 0 else None
        ),
        "iterations": iterations,
        "note": (
            "IMP-43 (#72) u8 measurement. Issue-body 50-70% / 10-20s → 3-8s "
            "claim is NOT honored here — actual numbers depend on host, "
            "Selenium cold-start, and AI cache state. Update "
            "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
            "p50/p95 reported here when run on the project's reference host."
        ),
    }
    sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
    sys.stdout.write("\n")
    return 0
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())