"""IMP-43 (#72) u4 — focused tests for the --reuse-from entry helpers. u4 scope (per the Stage 2 Exit Report): - Pure path resolution, file copy, snapshot load+validate, MdxSection + CompositionUnit rehydration, and reuse-marker writing. - Helpers RAISE on missing artifacts / corrupt snapshot / mdx_sha256 mismatch — u4b adds the stderr + sys.exit(2) translation and the prev_run_dir == new_run_dir accidental-write guard around them. - The kwarg threading + the in-``run_phase_z2_mvp1`` branch that invokes these helpers land in u5. Tested helpers (``src/phase_z2_pipeline.py``): * ``_resolve_reuse_from_prev_run_dir`` * ``_copy_reuse_artifacts_from_prev_run`` * ``_load_and_validate_reuse_snapshot`` * ``_rehydrate_mdx_sections_from_snapshot`` * ``_rehydrate_composition_units_from_snapshot`` * ``_write_reuse_marker`` * ``_RehydratedV4Candidate`` (V4Match-shape duck type) * ``_REUSE_STEP_ARTIFACTS`` / ``REUSE_MARKER_FILENAME`` / ``REUSE_MARKER_SCHEMA_VERSION`` """ from __future__ import annotations import hashlib import json from dataclasses import dataclass, field from pathlib import Path from typing import Any, Optional import pytest import src.phase_z2_pipeline as _pz2 from src.phase_z2_composition import CompositionUnit from src.phase_z2_reuse_snapshot import ( SNAPSHOT_FILENAME, SNAPSHOT_VERSION, SnapshotValidationError, build_snapshot, ) # -- synthetic duck-typed inputs (mirror u3 test fixture) ----------------- @dataclass class _Section: section_id: str section_num: int title: str raw_content: str heading_number: Optional[str] = None v4_alias_keys: list = field(default_factory=list) sub_sections: list = field(default_factory=list) @dataclass class _V4Candidate: template_id: str frame_id: str frame_number: int confidence: float label: str @dataclass class _Unit: source_section_ids: list merge_type: str frame_template_id: str frame_id: str frame_number: int confidence: float label: str phase_z_status: str raw_content: str title: str score: float v4_rank: Optional[int] = 1 selection_path: str = "rank_1" fallback_reason: Optional[str] = None rationale: dict = field(default_factory=dict) auto_selectable: bool = True filter_reasons: list = field(default_factory=list) notes: list = field(default_factory=list) v4_candidates: list = field(default_factory=list) provisional: bool = False def _mdx_text() -> str: return "# Slide\n\n## 03-1 DX status\n\n- bullet one\n- bullet two\n" def _build_canonical_snapshot( *, mdx_source_text: Optional[str] = None, layout_preset: str = "single", ) -> dict: text = mdx_source_text if mdx_source_text is not None else _mdx_text() cand = _V4Candidate( template_id="tpl_a", frame_id="fid_a", frame_number=13, confidence=0.91, label="use_as_is", ) section = _Section( section_id="03-1", section_num=1, title="DX status", raw_content="- bullet one\n- bullet two", heading_number="3.1", v4_alias_keys=["03-1.1"], sub_sections=[], ) unit = _Unit( source_section_ids=["03-1"], merge_type="single", frame_template_id="tpl_a", frame_id="fid_a", frame_number=13, confidence=0.91, label="use_as_is", phase_z_status="auto_renderable", raw_content="- bullet one\n- bullet two", title="DX status", score=0.91, v4_candidates=[cand], provisional=False, auto_selectable=True, filter_reasons=[], notes=["a note"], rationale={"weight": 1.0}, ) return build_snapshot( mdx_sha256=hashlib.sha256(text.encode("utf-8")).hexdigest(), slide_title="Slide", slide_footer=None, sections=[section], stage0_adapter_diagnostics={"used": True, "fallback_reason": None}, stage0_normalized_assets={"popups": [], "images": [], "tables": []}, v4_evidence=[ { "section_id": "03-1", "v4_candidates": [ { "template_id": "tpl_a", "frame_id": "fid_a", "frame_number": 13, "confidence": 0.91, "label": "use_as_is", } ], "candidate_status": "ok", } ], layout_preset_pre_override=layout_preset, units=[unit], comp_debug={"v4_fallback_summary": {"fallback_used_count": 0}}, v4_fallback_traces={"03-1": {"selection_path": "rank_1"}}, ai_preflight={"enabled": False, "skipped": True}, ) def _seed_prev_run_dir(prev_run_dir: Path, *, snapshot: dict) -> None: """Populate ``prev_run_dir`` with the Step 0/1/2/5/6 artifacts plus the reuse snapshot — minimal but valid surface for u4 helpers.""" (prev_run_dir / "steps").mkdir(parents=True, exist_ok=True) for fname in _pz2._REUSE_STEP_ARTIFACTS: # JSON-shaped surface — exact shape doesn't matter for u4 (the # copy helper doesn't introspect contents); just must exist. (prev_run_dir / "steps" / fname).write_text( f'{{"name": "{fname}"}}' if fname.endswith(".json") else "raw mdx body bytes", encoding="utf-8", ) (prev_run_dir / SNAPSHOT_FILENAME).write_text( json.dumps(snapshot, ensure_ascii=False, indent=2), encoding="utf-8", ) # -- _REUSE_STEP_ARTIFACTS constant --------------------------------------- def test_reuse_step_artifacts_locks_stage2_boundary(): """Stage 2 boundary lock — Step 0/1/2/5/6 artifacts only. Step 3/4 deliberately absent: step03 / step04 ARE written after Step 6 (around src/phase_z2_pipeline.py:5931 / 5964) before the Step 7 artifact (~6294), but both are emitted with step_status='trace-only' / pipeline_path_connected=False — they are diagnostic projections of the Step 6 debug_zones, not pipeline-path-connected inputs that Step 7+ rehydrate from.""" assert _pz2._REUSE_STEP_ARTIFACTS == ( "step00_preconditions.json", "step01_mdx_upload.json", "step01_mdx_source.md", "step02_normalized.json", "step05_v4_evidence.json", "step06_composition_plan.json", ) def test_reuse_marker_filename_is_dotfile_at_run_dir_root(): assert _pz2.REUSE_MARKER_FILENAME == "_reuse_marker.json" # -- _resolve_reuse_from_prev_run_dir ------------------------------------- def test_resolve_prev_run_dir_returns_runs_dir_phase_z2_path(): rv = _pz2._resolve_reuse_from_prev_run_dir("20260524_120000_phase_z2") expected = _pz2.RUNS_DIR / "20260524_120000_phase_z2" / "phase_z2" assert rv == expected def test_resolve_prev_run_dir_does_not_check_existence(tmp_path: Path): """Pure path computation — must NOT touch the filesystem (u4b handles the missing-prev-run case).""" rv = _pz2._resolve_reuse_from_prev_run_dir("never_existed_run_id") assert isinstance(rv, Path) # The path does not actually exist; helper still returned cleanly. assert not rv.exists() # -- _copy_reuse_artifacts_from_prev_run ---------------------------------- def test_copy_reuse_artifacts_copies_all_step_files(tmp_path: Path): prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) copied = _pz2._copy_reuse_artifacts_from_prev_run(prev, new) for fname in _pz2._REUSE_STEP_ARTIFACTS: assert (new / "steps" / fname).exists(), f"missing copy: {fname}" assert copied[fname] == f"steps/{fname}" def test_copy_reuse_artifacts_copies_snapshot_to_run_dir_root(tmp_path: Path): prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) copied = _pz2._copy_reuse_artifacts_from_prev_run(prev, new) # Snapshot lives at run_dir root (NOT under steps/) per u3 contract. assert (new / SNAPSHOT_FILENAME).exists() assert copied[SNAPSHOT_FILENAME] == SNAPSHOT_FILENAME def test_copy_reuse_artifacts_creates_steps_subdir_if_absent(tmp_path: Path): prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) # new_run_dir / steps does not yet exist assert not (new / "steps").exists() _pz2._copy_reuse_artifacts_from_prev_run(prev, new) assert (new / "steps").is_dir() def test_copy_reuse_artifacts_missing_step_raises_filenotfound( tmp_path: Path, ): prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) # Delete one of the required step artifacts. (prev / "steps" / "step05_v4_evidence.json").unlink() with pytest.raises(FileNotFoundError) as ei: _pz2._copy_reuse_artifacts_from_prev_run(prev, new) msg = str(ei.value) assert "step05_v4_evidence.json" in msg assert "prev_run_dir" in msg def test_copy_reuse_artifacts_missing_snapshot_raises_filenotfound( tmp_path: Path, ): prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) (prev / SNAPSHOT_FILENAME).unlink() with pytest.raises(FileNotFoundError) as ei: _pz2._copy_reuse_artifacts_from_prev_run(prev, new) assert SNAPSHOT_FILENAME in str(ei.value) def test_copy_reuse_artifacts_byte_identical_copy(tmp_path: Path): """Bytes must match exactly — copy, not transform.""" prev = tmp_path / "prev" / "phase_z2" new = tmp_path / "new" / "phase_z2" snap = _build_canonical_snapshot() _seed_prev_run_dir(prev, snapshot=snap) _pz2._copy_reuse_artifacts_from_prev_run(prev, new) for fname in _pz2._REUSE_STEP_ARTIFACTS: assert ( (prev / "steps" / fname).read_bytes() == (new / "steps" / fname).read_bytes() ) assert ( (prev / SNAPSHOT_FILENAME).read_bytes() == (new / SNAPSHOT_FILENAME).read_bytes() ) # -- _load_and_validate_reuse_snapshot ------------------------------------ def test_load_and_validate_returns_snapshot_dict(tmp_path: Path): text = _mdx_text() snap = _build_canonical_snapshot(mdx_source_text=text) (tmp_path / SNAPSHOT_FILENAME).write_text( json.dumps(snap, ensure_ascii=False, indent=2), encoding="utf-8" ) loaded = _pz2._load_and_validate_reuse_snapshot( tmp_path, mdx_source_text=text ) assert loaded["schema_version"] == SNAPSHOT_VERSION assert loaded["slide_title"]["value"] == "Slide" def test_load_and_validate_mdx_sha256_mismatch_raises(tmp_path: Path): """Snapshot was built for ``text_a`` but caller passes ``text_b``; u2 validator raises ``SnapshotValidationError`` (subclass of ``ValueError``). u4b translates to exit 2 — here we only assert the raise.""" text_a = "# Slide A\n" text_b = "# Slide B (different bytes)\n" snap = _build_canonical_snapshot(mdx_source_text=text_a) (tmp_path / SNAPSHOT_FILENAME).write_text( json.dumps(snap, ensure_ascii=False, indent=2), encoding="utf-8" ) with pytest.raises(SnapshotValidationError) as ei: _pz2._load_and_validate_reuse_snapshot( tmp_path, mdx_source_text=text_b ) assert "mdx_sha256 mismatch" in str(ei.value) def test_load_and_validate_corrupt_json_raises(tmp_path: Path): (tmp_path / SNAPSHOT_FILENAME).write_text( "{ not valid json", encoding="utf-8" ) with pytest.raises(json.JSONDecodeError): _pz2._load_and_validate_reuse_snapshot( tmp_path, mdx_source_text=_mdx_text() ) def test_load_and_validate_missing_snapshot_file_raises(tmp_path: Path): """No snapshot at all — bare ``read_text`` raises FileNotFoundError. u4b translates this to exit 2 with a provenance message.""" with pytest.raises(FileNotFoundError): _pz2._load_and_validate_reuse_snapshot( tmp_path, mdx_source_text=_mdx_text() ) def test_load_and_validate_schema_version_mismatch_raises(tmp_path: Path): text = _mdx_text() snap = _build_canonical_snapshot(mdx_source_text=text) snap["schema_version"] = SNAPSHOT_VERSION + 1 # force mismatch (tmp_path / SNAPSHOT_FILENAME).write_text( json.dumps(snap, ensure_ascii=False, indent=2), encoding="utf-8" ) with pytest.raises(SnapshotValidationError) as ei: _pz2._load_and_validate_reuse_snapshot( tmp_path, mdx_source_text=text ) assert "schema_version" in str(ei.value) # -- _rehydrate_mdx_sections_from_snapshot -------------------------------- def test_rehydrate_sections_returns_mdxsection_instances(): snap = _build_canonical_snapshot() sections = _pz2._rehydrate_mdx_sections_from_snapshot(snap) assert len(sections) == 1 assert isinstance(sections[0], _pz2.MdxSection) assert sections[0].section_id == "03-1" assert sections[0].title == "DX status" assert sections[0].raw_content == "- bullet one\n- bullet two" def test_rehydrate_sections_preserves_heading_number_and_aliases(): snap = _build_canonical_snapshot() sections = _pz2._rehydrate_mdx_sections_from_snapshot(snap) assert sections[0].heading_number == "3.1" assert sections[0].v4_alias_keys == ["03-1.1"] assert sections[0].sub_sections == [] # -- _rehydrate_composition_units_from_snapshot --------------------------- def test_rehydrate_units_returns_composition_unit_instances(): snap = _build_canonical_snapshot() units = _pz2._rehydrate_composition_units_from_snapshot(snap) assert len(units) == 1 assert isinstance(units[0], CompositionUnit) def test_rehydrate_units_preserves_core_fields(): snap = _build_canonical_snapshot() units = _pz2._rehydrate_composition_units_from_snapshot(snap) u = units[0] assert u.source_section_ids == ["03-1"] assert u.merge_type == "single" assert u.frame_template_id == "tpl_a" assert u.frame_id == "fid_a" assert u.frame_number == 13 assert u.confidence == pytest.approx(0.91) assert u.label == "use_as_is" assert u.phase_z_status == "auto_renderable" assert u.title == "DX status" assert u.score == pytest.approx(0.91) def test_rehydrate_units_preserves_provisional_and_auto_selectable(): snap = _build_canonical_snapshot() units = _pz2._rehydrate_composition_units_from_snapshot(snap) assert units[0].provisional is False assert units[0].auto_selectable is True assert units[0].filter_reasons == [] assert units[0].notes == ["a note"] assert units[0].rationale == {"weight": 1.0} def test_rehydrate_units_v4_candidates_expose_attribute_access(): """``_apply_frame_override_to_unit`` reads ``cand.template_id`` / ``cand.frame_id`` / etc. off ``unit.v4_candidates`` — restored entries MUST expose attribute access, not raw dict access.""" snap = _build_canonical_snapshot() units = _pz2._rehydrate_composition_units_from_snapshot(snap) cands = units[0].v4_candidates assert len(cands) == 1 c = cands[0] assert isinstance(c, _pz2._RehydratedV4Candidate) assert c.template_id == "tpl_a" assert c.frame_id == "fid_a" assert c.frame_number == 13 assert c.confidence == pytest.approx(0.91) assert c.label == "use_as_is" def test_rehydrate_units_empty_v4_candidates_yields_empty_list(): snap = _build_canonical_snapshot() snap["units"]["value"][0]["v4_candidates"] = [] units = _pz2._rehydrate_composition_units_from_snapshot(snap) assert units[0].v4_candidates == [] # -- _write_reuse_marker -------------------------------------------------- def test_write_reuse_marker_writes_json_with_prev_run_id(tmp_path: Path): copied = { "step00_preconditions.json": "steps/step00_preconditions.json", SNAPSHOT_FILENAME: SNAPSHOT_FILENAME, } rv = _pz2._write_reuse_marker( tmp_path, prev_run_id="20260524_010101_phase_z2", copied_artifacts=copied, ) assert rv == tmp_path / _pz2.REUSE_MARKER_FILENAME marker = json.loads(rv.read_text(encoding="utf-8")) assert marker["schema_version"] == _pz2.REUSE_MARKER_SCHEMA_VERSION assert marker["reuse_from_prev_run_id"] == "20260524_010101_phase_z2" assert marker["snapshot_filename"] == SNAPSHOT_FILENAME def test_write_reuse_marker_records_copied_artifacts_and_boundary( tmp_path: Path, ): copied = { fname: f"steps/{fname}" for fname in _pz2._REUSE_STEP_ARTIFACTS } copied[SNAPSHOT_FILENAME] = SNAPSHOT_FILENAME _pz2._write_reuse_marker( tmp_path, prev_run_id="20260524_010101_phase_z2", copied_artifacts=copied, ) marker = json.loads( (tmp_path / _pz2.REUSE_MARKER_FILENAME).read_text(encoding="utf-8") ) assert marker["copied_artifacts"] == copied assert marker["boundary_steps"] == list(_pz2._REUSE_STEP_ARTIFACTS) assert marker["resume_at_step"] == 7 # -- module surface anchors ----------------------------------------------- def test_pipeline_exposes_all_u4_helpers(): """u5 wires these into ``run_phase_z2_mvp1`` — they must remain module-level callable surface on ``phase_z2_pipeline``.""" for name in ( "_resolve_reuse_from_prev_run_dir", "_copy_reuse_artifacts_from_prev_run", "_load_and_validate_reuse_snapshot", "_rehydrate_mdx_sections_from_snapshot", "_rehydrate_composition_units_from_snapshot", "_write_reuse_marker", "_RehydratedV4Candidate", "_REUSE_STEP_ARTIFACTS", "REUSE_MARKER_FILENAME", "REUSE_MARKER_SCHEMA_VERSION", ): assert hasattr(_pz2, name), f"u4 surface missing: {name}" def test_pipeline_run_signature_reuse_from_is_kw_only_optional_none(): """u5 — ``reuse_from`` is now part of ``run_phase_z2_mvp1``'s public signature. The kwarg MUST be keyword-only (after the ``*`` barrier), default to ``None`` (so absent flag preserves the pre-u5 behaviour), and sit alongside the existing override kwargs. The locked ``until_u5`` regression has flipped — keep this assertion as the forward-direction lock so future signature drift (e.g. a positional promotion or a default change) trips loudly.""" import inspect sig = inspect.signature(_pz2.run_phase_z2_mvp1) assert "reuse_from" in sig.parameters, ( "u5 must thread reuse_from into run_phase_z2_mvp1 — kwarg missing. " f"current params: {list(sig.parameters)}" ) param = sig.parameters["reuse_from"] assert param.kind is inspect.Parameter.KEYWORD_ONLY, ( f"reuse_from must be keyword-only (after the ``*`` barrier); " f"got kind={param.kind}" ) assert param.default is None, ( f"reuse_from must default to None to preserve pre-u5 behaviour; " f"got default={param.default!r}" )