# C.E.L_Slide_test2/scripts/measure_reuse_savings.py

"""IMP-43 (#72) u8 — measure ``--reuse-from`` wall-clock savings.

Argv-driven measurement helper for the Stage 2 §u8 binding contract:
re-derive a realistic savings target instead of mirroring the
unverified issue-body 50–70% / 10–20s → 3–8s claim.

Per-iteration measurement protocol (mirrors the u7a equivalence
harness, ``tests/test_phase_z2_reuse_from_equivalence_unit.py``):

  (A) baseline   full run, no overrides              — reuse seed
  (B) full rerun full run + one --override-frame pin — control path
  (C) reuse      --reuse-from <seed> + same pin      — reuse path

Wall-clock = ``time.perf_counter()`` around the subprocess.run call.
The (A) seed run time is reported separately and NOT included in the
B-vs-C comparison (the reuse path's whole point is that the seed
already exists from a prior interactive run).

For each iteration the frame pin is self-discovered from the seed
run's ``step06_composition_plan.json``: the first selected unit that
has both non-empty ``source_section_ids`` and a ``frame_template_id``
is re-pinned to its own template, exercising the
``--override-frame`` CLI surface end-to-end without changing the
semantic frame assignment (same approach the u7a/u7b equivalence
tests already lock).

Output: a JSON document to stdout with per-iteration timings,
B/C p50 + p95, and the ratio C/B. Stderr carries the subprocess
stdout/stderr tails on non-zero exits.

Guardrails (Stage 2):
  * argv-driven, no hardcoded mdx — caller picks the sample
  * no hardcoded savings target — TBD until measured
  * value + path + upstream provenance lives in the printed JSON
  * does NOT mutate prev_run_dir; new runs land under fresh run_ids
"""
from __future__ import annotations

import argparse
import json
import statistics
import subprocess
import sys
import time
import uuid
from pathlib import Path

# Parent of the directory that contains this script; used as the working
# directory for every spawned pipeline subprocess.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Directory under which each run's artifacts are looked up by run id
# (the seed run's step06 composition plan is read from here).
RUNS_DIR = REPO_ROOT / "data" / "runs"


def _unique_run_id(prefix: str) -> str:
    return f"{prefix}_imp43_u8_{uuid.uuid4().hex[:8]}"


def _spawn(extra_args: list[str], timeout: int) -> tuple[subprocess.CompletedProcess, float]:
    """Run ``src.phase_z2_pipeline`` as a child process and time it.

    Args:
        extra_args: Arguments appended after ``-m src.phase_z2_pipeline``.
        timeout: Seconds passed to ``subprocess.run`` as its timeout.

    Returns:
        ``(completed_process, elapsed_seconds)``; the elapsed time is the
        ``time.perf_counter()`` delta around the ``subprocess.run`` call.
        stdout/stderr are captured as text on the completed process.
    """
    command = [sys.executable, "-m", "src.phase_z2_pipeline"]
    command.extend(extra_args)

    started_at = time.perf_counter()
    completed = subprocess.run(
        command,
        cwd=str(REPO_ROOT),  # the child always runs from the repository root
        timeout=timeout,
        text=True,
        capture_output=True,
    )
    elapsed = time.perf_counter() - started_at
    return completed, elapsed


def _assert_ok(label: str, cp: subprocess.CompletedProcess) -> None:
    if cp.returncode != 0:
        sys.stderr.write(
            f"[measure_reuse_savings] {label} failed rc={cp.returncode}\n"
            f"--- stderr tail ---\n{cp.stderr[-2000:]}\n"
            f"--- stdout tail ---\n{cp.stdout[-2000:]}\n"
        )
        raise SystemExit(2)


def _discover_first_frame_pin(seed_run_id: str) -> tuple[str, str]:
    """Find a ``(unit_id, frame_template_id)`` pair to re-pin from a seed run.

    Reads ``phase_z2/steps/step06_composition_plan.json`` under the seed
    run's directory and scans ``data.selected_units`` in order. The first
    unit with a non-empty ``source_section_ids`` list and a non-empty string
    ``frame_template_id`` wins; its unit id is the section ids joined
    with ``"+"``.

    Malformed shapes (``data`` missing/null/not an object, ``selected_units``
    not a list, non-object unit entries) are treated as "nothing pinnable"
    and reported through the labeled ``SystemExit`` instead of surfacing as
    an ``AttributeError``.

    Args:
        seed_run_id: Run id of the already-completed seed run.

    Returns:
        ``(unit_id, frame_template_id)`` for the first pinnable unit.

    Raises:
        SystemExit: When no pinnable unit exists in the plan.
        OSError: When the plan file cannot be read.
        json.JSONDecodeError: When the plan file is not valid JSON.
    """
    p = RUNS_DIR / seed_run_id / "phase_z2" / "steps" / "step06_composition_plan.json"
    payload = json.loads(p.read_text(encoding="utf-8"))

    # Walk down defensively: any level may be absent, null, or mistyped.
    data = payload.get("data") if isinstance(payload, dict) else None
    units = data.get("selected_units") if isinstance(data, dict) else None
    if not isinstance(units, list):
        units = []

    for u in units:
        if not isinstance(u, dict):
            continue
        sids = u.get("source_section_ids")
        tpl = u.get("frame_template_id")
        if isinstance(sids, list) and sids and isinstance(tpl, str) and tpl:
            return ("+".join(str(s) for s in sids), tpl)

    raise SystemExit(
        f"[measure_reuse_savings] seed {seed_run_id} step06 has no pinnable "
        f"(unit_id, frame_template_id); path={p}"
    )


def _percentile(values: list[float], pct: float) -> float:
    if not values:
        return float("nan")
    if len(values) == 1:
        return values[0]
    s = sorted(values)
    k = (len(s) - 1) * pct
    lo = int(k)
    hi = min(lo + 1, len(s) - 1)
    return s[lo] + (s[hi] - s[lo]) * (k - lo)


def _measure_iteration(mdx_arg: str, index: int, timeout: int) -> dict:
    """Run one (A) seed / (B) full rerun / (C) reuse triple and time each run.

    Args:
        mdx_arg: MDX path string handed to every pipeline invocation.
        index: Zero-based iteration number, used in run ids and labels.
        timeout: Per-run timeout in seconds.

    Returns:
        A dict with the three run ids, the ``--override-frame`` pin that was
        used, and the wall-clock seconds of each of the three runs.

    Raises:
        SystemExit: When any of the three runs exits non-zero, or when the
            seed run exposes no pinnable unit.
    """
    # (A) baseline: full run without overrides; becomes the reuse seed.
    seed_id = _unique_run_id(f"seed{index}")
    cp_seed, seed_seconds = _spawn([mdx_arg, seed_id], timeout)
    _assert_ok(f"(A) seed iter={index}", cp_seed)

    # Re-pin a unit to the template the seed run already assigned to it.
    unit_id, tpl_id = _discover_first_frame_pin(seed_id)
    pin = f"{unit_id}={tpl_id}"
    override = ["--override-frame", pin]

    # (B) control path: full run with the pin, no reuse.
    full_id = _unique_run_id(f"full{index}")
    cp_full, full_seconds = _spawn([mdx_arg, full_id, *override], timeout)
    _assert_ok(f"(B) full rerun iter={index}", cp_full)

    # (C) reuse path: same pin, but seeded from run (A).
    reuse_id = _unique_run_id(f"reuse{index}")
    cp_reuse, reuse_seconds = _spawn(
        [mdx_arg, reuse_id, "--reuse-from", seed_id, *override],
        timeout,
    )
    _assert_ok(f"(C) reuse iter={index}", cp_reuse)

    return {
        "iter": index,
        "seed_run_id": seed_id,
        "full_run_id": full_id,
        "reuse_run_id": reuse_id,
        "override_frame": pin,
        "seed_seconds": seed_seconds,
        "full_rerun_seconds": full_seconds,
        "reuse_seconds": reuse_seconds,
    }


def main() -> int:
    """Parse argv, run the measurement iterations, and print a JSON summary.

    Command line: ``mdx_path`` (required), ``--iterations`` (default 3) and
    ``--timeout`` (default 900 seconds per run).

    Returns:
        0 after the summary is written to stdout; 2 when ``mdx_path`` is not
        an existing file or ``--iterations`` is smaller than 1.

    Raises:
        SystemExit: Propagated from the helpers when a pipeline run fails.
    """
    ap = argparse.ArgumentParser(
        prog="python -m scripts.measure_reuse_savings",
        description="Measure IMP-43 --reuse-from wall-clock savings.",
    )
    ap.add_argument("mdx_path", type=Path, help="MDX sample to measure against")
    ap.add_argument("--iterations", type=int, default=3, help="trials (default 3)")
    ap.add_argument("--timeout", type=int, default=900, help="per-run timeout seconds")
    args = ap.parse_args()

    if not args.mdx_path.is_file():
        sys.stderr.write(f"[measure_reuse_savings] mdx not found: {args.mdx_path}\n")
        return 2

    # With zero trials every statistic would be NaN, which json.dumps emits
    # as the non-standard token ``NaN``; refuse instead of reporting success.
    if args.iterations < 1:
        sys.stderr.write(
            f"[measure_reuse_savings] --iterations must be >= 1 (got {args.iterations})\n"
        )
        return 2

    # The children run with cwd=REPO_ROOT, so a path that is relative to the
    # caller's cwd must be made absolute to keep pointing at the checked file.
    mdx_arg = str(args.mdx_path.resolve())

    iterations: list[dict] = [
        _measure_iteration(mdx_arg, i, args.timeout) for i in range(args.iterations)
    ]

    full_times = [it["full_rerun_seconds"] for it in iterations]
    reuse_times = [it["reuse_seconds"] for it in iterations]
    full_p50 = _percentile(full_times, 0.50)
    reuse_p50 = _percentile(reuse_times, 0.50)

    summary = {
        "mdx_path": str(args.mdx_path),
        "iterations_count": len(iterations),
        "full_rerun_seconds_p50": full_p50,
        "full_rerun_seconds_p95": _percentile(full_times, 0.95),
        "reuse_seconds_p50": reuse_p50,
        "reuse_seconds_p95": _percentile(reuse_times, 0.95),
        # null (not NaN) when the ratio is undefined, so the output stays
        # parseable by strict JSON consumers.
        "reuse_over_full_ratio_p50": reuse_p50 / full_p50 if full_p50 > 0 else None,
        "iterations": iterations,
        "note": (
            "IMP-43 (#72) u8 measurement. Issue-body 50–70% / 10–20s → 3–8s "
            "claim is NOT honored here — actual numbers depend on host, "
            "Selenium cold-start, and AI cache state. Update "
            "docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md §8 with the "
            "p50/p95 reported here when run on the project's reference host."
        ),
    }
    sys.stdout.write(json.dumps(summary, ensure_ascii=False, indent=2))
    sys.stdout.write("\n")
    return 0


# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())