# C.E.L_Slide_test2/tests/test_phase_z2_reuse_snapshot_write.py

"""IMP-43 (#72) u3 — focused tests for the Step 6 reuse snapshot writer.

u3 scope (per the Stage 2 Exit Report):

- ``_write_reuse_snapshot`` writes ``run_dir/_reuse_snapshot.json`` *after*
  the Step 6 artifact lands; failure WARNS and CONTINUES (the helper does
  NOT raise out of the main pipeline run).
- The Step 6 artifact data dict records the run_dir-relative sidecar path
  as ``data.reuse_snapshot_path`` (additive informational field, always
  set to ``SNAPSHOT_FILENAME`` regardless of write success — u4 will
  fail-closed on missing / invalid sidecar via u2's ``validate_snapshot``).

The helper is tested in isolation (no full pipeline run) — pipeline call
site presence is asserted structurally so we exercise behaviour without
re-running Step 0~6 inside the test process. End-to-end equivalence under
``--reuse-from`` is u7a / u7b scope.
"""
from __future__ import annotations

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Optional

import pytest

import src.phase_z2_pipeline as _pz2
from src.phase_z2_reuse_snapshot import (
    SNAPSHOT_FILENAME,
    SNAPSHOT_VERSION,
    SnapshotValidationError,
    validate_snapshot,
)


# -- synthetic duck-typed inputs ------------------------------------------


@dataclass
class _Section:
    """Synthetic section record handed to the snapshot writer in these tests.

    Instances are placed in the ``sections`` list built by ``_make_kwargs``.
    Only the four leading fields are required; the rest default to "empty".
    """

    # Required identity and content fields.
    section_id: str
    section_num: int
    title: str
    raw_content: str

    # Optional extras. List defaults go through ``default_factory`` so that
    # two instances never share the same list object.
    heading_number: Optional[str] = None
    v4_alias_keys: list = field(default_factory=list)
    sub_sections: list = field(default_factory=list)


@dataclass
class _V4Candidate:
    """Synthetic V4 candidate record attached to a ``_Unit`` in these tests.

    All five fields are required; ``_make_kwargs`` mirrors the same five
    keys in the plain-dict ``v4_evidence`` entry it builds.
    """

    # Which template / frame the candidate points at.
    template_id: str
    frame_id: str
    frame_number: int

    # How the candidate was rated.
    confidence: float
    label: str


@dataclass
class _Unit:
    """Synthetic composition-unit record handed to the snapshot writer.

    Instances are placed in the ``units`` list built by ``_make_kwargs``.
    The first eleven fields are required; the remaining ones carry
    defaults describing a plain "rank 1, no fallback" selection.
    """

    # Which section(s) the unit covers and how they were combined.
    source_section_ids: list
    merge_type: str

    # Selected frame and its rating.
    frame_template_id: str
    frame_id: str
    frame_number: int
    confidence: float
    label: str
    phase_z_status: str

    # Unit content.
    raw_content: str
    title: str
    score: float

    # Selection bookkeeping (defaults: first-ranked candidate, no fallback).
    v4_rank: Optional[int] = 1
    selection_path: str = "rank_1"
    fallback_reason: Optional[str] = None

    # Mutable defaults use ``default_factory`` so instances stay independent.
    rationale: dict = field(default_factory=dict)
    auto_selectable: bool = True
    filter_reasons: list = field(default_factory=list)
    notes: list = field(default_factory=list)
    v4_candidates: list = field(default_factory=list)
    provisional: bool = False


def _make_kwargs(**overrides: Any) -> dict[str, Any]:
    """Return a full keyword-argument set for ``_write_reuse_snapshot``.

    The defaults describe a single section (``03-1``) covered by a single
    unit whose only V4 candidate is ``tpl_a`` / ``fid_a``. Every keyword in
    ``overrides`` replaces the default stored under the same name (or is
    appended if the name is new).
    """
    # Literals shared between the section, the unit and the evidence entry.
    section_id = "03-1"
    title = "DX status"
    body = "- bullet one\n- bullet two"
    candidate_fields: dict[str, Any] = {
        "template_id": "tpl_a",
        "frame_id": "fid_a",
        "frame_number": 13,
        "confidence": 0.91,
        "label": "use_as_is",
    }

    candidate = _V4Candidate(**candidate_fields)
    only_section = _Section(
        section_id=section_id,
        section_num=1,
        title=title,
        raw_content=body,
    )
    only_unit = _Unit(
        source_section_ids=[section_id],
        merge_type="single",
        frame_template_id=candidate_fields["template_id"],
        frame_id=candidate_fields["frame_id"],
        frame_number=candidate_fields["frame_number"],
        confidence=candidate_fields["confidence"],
        label=candidate_fields["label"],
        phase_z_status="auto_renderable",
        raw_content=body,
        title=title,
        score=0.91,
        v4_candidates=[candidate],
    )

    defaults: dict[str, Any] = {
        "mdx_source_text": "# Slide\n\n## 03-1 DX status\n\n- bullet one\n- bullet two\n",
        "slide_title": "Slide",
        "slide_footer": None,
        "sections": [only_section],
        "stage0_adapter_diagnostics": {"used": True, "fallback_reason": None},
        "stage0_normalized_assets": {"popups": [], "images": [], "tables": []},
        "v4_evidence": [
            {
                "section_id": section_id,
                # Plain-dict copy of the candidate, same key order as above.
                "v4_candidates": [dict(candidate_fields)],
                "candidate_status": "ok",
            }
        ],
        "layout_preset_pre_override": "single",
        "units": [only_unit],
        "comp_debug": {"v4_fallback_summary": {"fallback_used_count": 0}},
        "v4_fallback_traces": {section_id: {"selection_path": "rank_1"}},
        "ai_preflight": {"enabled": False, "skipped": True},
    }
    # Later mapping wins, so caller-supplied overrides replace defaults.
    return {**defaults, **overrides}


# -- success path ---------------------------------------------------------


def test_writes_snapshot_file_at_run_dir_root(tmp_path: Path):
    """A successful write returns the sidecar filename and creates the
    file directly under the run directory."""
    returned_name = _pz2._write_reuse_snapshot(tmp_path, **_make_kwargs())
    assert returned_name == SNAPSHOT_FILENAME

    fpath = tmp_path / SNAPSHOT_FILENAME
    assert fpath.exists(), f"snapshot not written at {fpath}"


def test_written_snapshot_validates(tmp_path: Path):
    """The file produced by the helper is accepted by ``validate_snapshot``
    when given the SHA-256 of the MDX source that was passed in.

    The test passes as long as ``validate_snapshot`` does not raise.
    """
    import hashlib

    inputs = _make_kwargs()
    assert _pz2._write_reuse_snapshot(tmp_path, **inputs) == SNAPSHOT_FILENAME

    with (tmp_path / SNAPSHOT_FILENAME).open(encoding="utf-8") as fh:
        snapshot = json.load(fh)

    # Recompute the digest independently from the UTF-8 bytes of the exact
    # source text handed to the helper, then let the validator compare.
    source_bytes = inputs["mdx_source_text"].encode("utf-8")
    digest = hashlib.sha256(source_bytes).hexdigest()
    validate_snapshot(snapshot, expected_mdx_sha256=digest)


def test_snapshot_has_correct_schema_version(tmp_path: Path):
    """The written sidecar carries ``SNAPSHOT_VERSION`` under
    ``schema_version``."""
    _pz2._write_reuse_snapshot(tmp_path, **_make_kwargs())

    with (tmp_path / SNAPSHOT_FILENAME).open(encoding="utf-8") as fh:
        written = json.load(fh)
    assert written["schema_version"] == SNAPSHOT_VERSION


def test_snapshot_records_layout_preset_pre_override(tmp_path: Path):
    """The ``layout_preset_pre_override`` argument is stored in the sidecar
    under ``layout_preset_pre_override.value``."""
    inputs = _make_kwargs(layout_preset_pre_override="horizontal-2")
    _pz2._write_reuse_snapshot(tmp_path, **inputs)

    with (tmp_path / SNAPSHOT_FILENAME).open(encoding="utf-8") as fh:
        written = json.load(fh)
    assert written["layout_preset_pre_override"]["value"] == "horizontal-2"


def test_snapshot_is_utf8_encoded_with_non_ascii_content(tmp_path: Path):
    """Korean text passed to the helper survives a write / read round trip
    through the UTF-8 sidecar file."""
    korean_title = "설계 방식의 왜곡"
    inputs = _make_kwargs(
        slide_title=korean_title,
        mdx_source_text="# 설계 방식\n\n- 한글 bullet\n",
    )
    _pz2._write_reuse_snapshot(tmp_path, **inputs)

    # The writer is expected to keep non-ASCII text readable on disk
    # (ensure_ascii=False). This test deliberately does not pin the byte
    # form: escaped output would still parse. It only requires that the
    # file decodes as UTF-8 and that the value comes back unchanged.
    decoded = (tmp_path / SNAPSHOT_FILENAME).read_text(encoding="utf-8")
    assert json.loads(decoded)["slide_title"]["value"] == korean_title


# -- failure path ---------------------------------------------------------


def test_failure_warns_and_returns_none(tmp_path: Path, monkeypatch, capsys):
    """A ``build_snapshot`` failure is swallowed by the helper: it emits a
    warning on stderr, returns ``None`` and leaves no file behind, so the
    main pipeline run can carry on."""

    def _always_fail(**_kwargs):
        raise RuntimeError("synthetic build failure")

    # Swap the builder the pipeline module resolves at call time.
    monkeypatch.setattr(_pz2, "build_snapshot", _always_fail)

    result = _pz2._write_reuse_snapshot(tmp_path, **_make_kwargs())
    stderr_text = capsys.readouterr().err

    assert result is None
    # The warning names the subsystem, the severity and the exception type.
    for expected_token in ("reuse-snapshot", "WARN", "RuntimeError"):
        assert expected_token in stderr_text
    # File MUST NOT exist on failure (no partial JSON on disk).
    sidecar = tmp_path / SNAPSHOT_FILENAME
    assert not sidecar.exists()


def test_failure_on_unwritable_run_dir_warns_and_returns_none(
    tmp_path: Path, monkeypatch, capsys
):
    """A disk-write failure is reported as a warning and ``None`` is
    returned; nothing propagates to the caller (Stage 2 guardrail: the
    sidecar is optional).

    The failure is provoked by pointing the helper at a directory that was
    never created. The ``monkeypatch`` fixture is requested but not used.
    """
    missing_dir = tmp_path.joinpath("does", "not", "exist")
    # The directory does not exist, so writing a file inside it is expected
    # to raise FileNotFoundError within the helper.

    result = _pz2._write_reuse_snapshot(missing_dir, **_make_kwargs())
    stderr_text = capsys.readouterr().err

    assert result is None
    # Subsystem tag, severity, and the concrete exception type so that the
    # warning is actionable when debugging.
    for expected_token in ("reuse-snapshot", "WARN", "FileNotFoundError"):
        assert expected_token in stderr_text


# -- pipeline integration anchors -----------------------------------------


def test_pipeline_imports_helper_and_constant():
    """The pipeline module must expose the helper for the post-Step-6
    call site, and the constant must round-trip from the snapshot
    module (single source of truth)."""
    assert hasattr(_pz2, "_write_reuse_snapshot")
    assert callable(_pz2._write_reuse_snapshot)
    # Single source of truth: the name visible on the pipeline module must
    # equal the one exported by the snapshot module, not merely happen to
    # match a literal.
    assert _pz2.SNAPSHOT_FILENAME == SNAPSHOT_FILENAME
    # Pin the on-disk name as well, so a rename in the snapshot module is
    # a deliberate, test-visible change.
    assert _pz2.SNAPSHOT_FILENAME == "_reuse_snapshot.json"


def test_pipeline_call_site_follows_step06_artifact_write():
    """Structural guard: the helper must be invoked AFTER the Step 6
    artifact write in ``run_phase_z2_mvp1`` so the sidecar lands next
    to ``steps/step06_composition_plan.json`` (Stage 2 spec).

    The check is textual: it scans the pipeline module's source for the
    Step 6 marker and then for a helper *call* at a later offset.
    """
    import re

    source = Path(_pz2.__file__).read_text(encoding="utf-8")
    # Locate the step06 artifact write call site by its locked name arg.
    step06_marker = '6, "composition_plan"'
    idx_step06 = source.find(step06_marker)
    assert idx_step06 != -1, "step06 artifact write call site missing"
    # The helper call must appear AFTER the step06 marker. The negative
    # lookbehind skips ``def _write_reuse_snapshot(`` so that a helper
    # *definition* placed below the marker cannot satisfy the guard.
    helper_call = re.compile(r"(?<!def )_write_reuse_snapshot\(")
    match = helper_call.search(source, idx_step06)
    assert match is not None, "u3 helper call missing after step06 write"


def test_pipeline_step06_artifact_data_records_snapshot_path():
    """Structural guard: the Step 6 artifact data dict must include the
    ``reuse_snapshot_path`` field so a future ``--reuse-from`` consumer
    can locate the expected sidecar via the canonical step artifact
    (Stage 2 spec — informational; absence of the file is u4's
    fail-closed concern).

    The check is textual: both the field key and the ``SNAPSHOT_FILENAME``
    name must occur within 8000 characters after the Step 6 marker in the
    pipeline module's source.
    """
    source = Path(_pz2.__file__).read_text(encoding="utf-8")
    step06_marker = '6, "composition_plan"'
    idx_step06 = source.find(step06_marker)
    assert idx_step06 != -1, "step06 artifact write call site missing"
    # Search a generous window after the marker for the field key.
    window = source[idx_step06 : idx_step06 + 8000]
    assert '"reuse_snapshot_path"' in window, (
        "reuse_snapshot_path key missing near step06 artifact write"
    )
    assert "SNAPSHOT_FILENAME" in window, (
        "SNAPSHOT_FILENAME not referenced near step06 artifact write"
    )