"""IMP-89 89-a u4 — capture final.html SHA baseline via the FULL Phase Z pipeline. Runs ``src.phase_z2_pipeline.run_phase_z2_mvp1`` end-to-end for every mdx file in ``samples/mdx_batch/`` (01-05) under PHASE_Z_B4_MAPPER_SOURCE=OFF (default). Each run writes a real ``final.html`` to disk at ``//phase_z2/final.html`` — exactly the production write site at ``src/phase_z2_pipeline.py:5994-5996``. The bytes of that on-disk artifact are normalized (IMP-94 marker strip — see below) and SHA-256 hashed, then stored in ``tests/regression/fixtures/89a_pre_baseline_sha.json``. The u4 regression test in ``tests/regression/test_b4_mapper_source_sha_parity.py`` runs the same pipeline shape under flag OFF, reads the on-disk ``final.html``, applies the same IMP-94 normalization, hashes the result, and asserts SHA equality with each frozen value. The mathematical chain that makes this a genuine "pre-89-a baseline" guard: * Under flag OFF, ``_select_mapper_template_id(plan, T) == T`` for every ``(plan, T)`` pair (locked by u2 + u4 algebraic precondition tests). * Therefore the mapper input is byte-identical to the legacy pre-89-a call shape ``map_mdx_to_slots(section, unit.frame_template_id)``. * Therefore the rendered HTML is byte-identical to pre-89-a output. * Therefore the on-disk ``final.html`` is byte-identical → SHA matches. Any future drift — in the selector, mapper, render_slide, slide_base.html, or any upstream code path — produces a divergent SHA and breaks the test. IMP-94 Layer A marker normalization (additive-only delta) ========================================================= IMP-94 (issue #94) injected ``data-region-id`` + ``data-content-unit-id`` attributes on family-partial root divs via ``src/region_marker_stamper.py``. Per the issue body guardrail (``byte-equivalent except for new data-* attrs``) and to keep the captured baseline stable across deterministic stamps of evolving region/content IDs, both the capture script and the regression test strip those two attributes (with their leading space, matching the exact emission shape at ``src/region_marker_stamper.py:131-135``) before SHA-256 hashing. The strip is disjoint from the #96 ``data-frame-slot-id`` axis by attribute name. Run from repo root:: python tests/regression/scripts/capture_89a_pre_baseline.py The capture script is idempotent and meant to be re-run only when an upstream mapper/render/template delta is reviewed and accepted. It refuses to run with PHASE_Z_B4_MAPPER_SOURCE enabled (the post-89-a flag-ON state is NOT the baseline axis). """ from __future__ import annotations import hashlib import json import os import re import sys import tempfile from datetime import datetime, timezone from pathlib import Path _REPO_ROOT = Path(__file__).resolve().parents[3] sys.path.insert(0, str(_REPO_ROOT)) sys.path.insert(0, str(_REPO_ROOT / "src")) import src.phase_z2_pipeline as pz2 # noqa: E402 _SAMPLES_DIR = _REPO_ROOT / "samples" / "mdx_batch" _MDX_BATCH = ("01.mdx", "02.mdx", "03.mdx", "04.mdx", "05.mdx") _OUT_PATH = ( _REPO_ROOT / "tests" / "regression" / "fixtures" / "89a_pre_baseline_sha.json" ) # IMP-94 additive marker strip patterns (mirror of # tests/regression/test_b4_mapper_source_sha_parity.py — keep both in sync). # Anchored on `(leading space + attr token)` shape from # src/region_marker_stamper.py:131-135. Disjoint from #96 data-frame-slot-id. _STRIP_REGION_ID_RE = re.compile(rb' data-region-id="[^"]*"') _STRIP_CONTENT_UNIT_ID_RE = re.compile(rb' data-content-unit-id="[^"]*"') def _strip_imp94_markers(raw_bytes: bytes) -> bytes: """Return ``raw_bytes`` with IMP-94 ``data-region-id`` and ``data-content-unit-id`` attribute tokens removed (additive-only normalization — see module docstring). """ stripped = _STRIP_REGION_ID_RE.sub(b"", raw_bytes) stripped = _STRIP_CONTENT_UNIT_ID_RE.sub(b"", stripped) return stripped def _capture_one(mdx_file: str, runs_root: Path) -> dict: """Run the full pipeline once and hash the on-disk final.html. ``pz2.RUNS_DIR`` MUST be pinned to ``runs_root`` by the caller before invocation; ``run_phase_z2_mvp1`` writes final.html to ``//phase_z2/final.html``. ``SystemExit`` from the pipeline (e.g. IMP-87 EMPTY_SHELL_NO_CONTENT BLOCKED exit on mdx 05) is caught: the BLOCKED exit fires AFTER the final.html write at ``src/phase_z2_pipeline.py:5994-5996``, so the artifact still exists on disk and the SHA is captured. The exit code is recorded on the entry so the test can assert the same terminal state under flag OFF. If final.html is missing post-exit, that is a genuine pipeline failure and the script aborts. IMP-94 markers are stripped from the captured bytes before hashing (see module docstring); ``final_html_size_bytes`` reflects the size of the normalized bytes that were actually hashed (the same shape the regression test produces). """ mdx_path = _SAMPLES_DIR / mdx_file assert mdx_path.exists(), f"sample missing: {mdx_path}" run_id = f"89a_baseline_{mdx_path.stem}" pipeline_exit_code: int | None = None try: pz2.run_phase_z2_mvp1(mdx_path, run_id=run_id) except SystemExit as exc: pipeline_exit_code = ( int(exc.code) if isinstance(exc.code, int) else 1 ) final_html_path = runs_root / run_id / "phase_z2" / "final.html" assert final_html_path.exists(), ( f"final.html not written by pipeline: {final_html_path} " f"(pipeline_exit_code={pipeline_exit_code})" ) raw_bytes = final_html_path.read_bytes() assert len(raw_bytes) > 0, f"final.html is empty: {final_html_path}" normalized_bytes = _strip_imp94_markers(raw_bytes) return { "mdx_file": mdx_file, "run_id": run_id, "final_html_size_bytes": len(normalized_bytes), "sha256": hashlib.sha256(normalized_bytes).hexdigest(), "pipeline_exit_code": pipeline_exit_code, } def capture() -> dict: assert os.environ.get("PHASE_Z_B4_MAPPER_SOURCE", "") == "", ( "PHASE_Z_B4_MAPPER_SOURCE must be unset when capturing baseline " "(default-OFF state is the production-equivalent axis for u4). " "Refusing to run with the flag enabled." ) _OUT_PATH.parent.mkdir(parents=True, exist_ok=True) with tempfile.TemporaryDirectory(prefix="89a_baseline_") as tmp: runs_root = Path(tmp) original_runs_dir = pz2.RUNS_DIR pz2.RUNS_DIR = runs_root try: entries = [_capture_one(mf, runs_root) for mf in _MDX_BATCH] finally: pz2.RUNS_DIR = original_runs_dir return { "schema_version": 2, "axis": ( "IMP-89 89-a u4 — final.html SHA baseline captured via FULL " "run_phase_z2_mvp1 pipeline (flag OFF / default)" ), "description": ( "Frozen SHA-256 of `final.html` bytes (the artifact written to " "disk at src/phase_z2_pipeline.py:5994-5996) captured by running " "the full Phase Z pipeline end-to-end for each mdx 01-05 under " "PHASE_Z_B4_MAPPER_SOURCE=OFF. Under flag OFF the 89-a selector " "`_select_mapper_template_id(plan, T)` returns `T` verbatim, so " "the mapper input is byte-identical to the pre-89-a legacy call " "shape `map_mdx_to_slots(section, unit.frame_template_id)` — " "the rendered HTML and therefore the final.html SHA match the " "pre-89-a baseline. The u4 regression test runs the same " "pipeline shape under flag OFF and asserts SHA equality. " "Regenerate only when an upstream mapper/render/template delta " "is deliberately reviewed and accepted." ), "captured_at_utc": ( datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") ), "renderer": { "entrypoint": "src.phase_z2_pipeline.run_phase_z2_mvp1", "write_site": "src/phase_z2_pipeline.py:5994-5996", "artifact_relpath": "//phase_z2/final.html", }, "mdx_batch": list(_MDX_BATCH), "mdx_files": {entry["mdx_file"]: entry for entry in entries}, "total_files": len(entries), } def main() -> None: data = capture() _OUT_PATH.write_text( json.dumps(data, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) print( f"wrote {_OUT_PATH} ({data['total_files']} files: " f"{', '.join(data['mdx_files'].keys())})" ) if __name__ == "__main__": main()