C.E.L_Slide_test2/tests/phase_z2/test_pz2_vu_preservation.py

"""Tests for IMP-16-U1 unit u8: ``verify_text_preservation``.

Locks the Phase Z port of the deterministic text-preservation check
(Phase Q reference: ``src/content_verifier.py:206-273``). The function
is pure and composes u2 (extract_text_from_html), u3
(normalize_for_comparison), u6 (split_into_sentences), and u7
(_sentence_matches_html). No Phase Q import is exercised.
"""
from __future__ import annotations

from src.phase_z2_verification_utils import (
    VerificationResult,
    _MISSING_SENTENCE_REPORT_LIMIT,
    _MISSING_SENTENCE_TRUNCATE_LEN,
    _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
    verify_text_preservation,
)


def test_verify_text_preservation_defaults_locked():
    # Pins the module-level defaults that mirror the Phase Q caller
    # convention: pass threshold 0.70, at most 5 missing sentences
    # reported, each reported sentence cut at 60 characters.
    locked_defaults = (
        (_TEXT_PRESERVATION_DEFAULT_THRESHOLD, 0.70),
        (_MISSING_SENTENCE_REPORT_LIMIT, 5),
        (_MISSING_SENTENCE_TRUNCATE_LEN, 60),
    )
    for actual, expected in locked_defaults:
        assert actual == expected


def test_verify_text_preservation_empty_sentences_returns_passed():
    # An MDX input that yields no sentences once split (here a lone
    # header) is expected to produce a passing result: score 1.0, the
    # text_preservation check set to True, and nothing in errors or
    # warnings. Phase Q parity: the function returns early, before any
    # HTML extraction takes place.
    outcome = verify_text_preservation(
        "# header only",
        "<p>anything</p>",
        "core",
    )
    assert isinstance(outcome, VerificationResult)
    assert outcome.area_name == "core"
    assert outcome.passed is True
    assert outcome.score == 1.0
    assert outcome.checks == {"text_preservation": True}
    assert outcome.warnings == []
    assert outcome.errors == []


def test_verify_text_preservation_full_match_passes():
    # Each MDX sentence also appears as its own <p> in the HTML, so the
    # check must pass with score 1.0. The preservation-rate warning is
    # only attached when the score is below 1.0, hence both the warnings
    # and errors lists are expected to be empty.
    sentences = (
        "데이터 분석은 핵심 과정입니다.",
        "시각화로 의사 결정을 지원합니다.",
    )
    source_mdx = " ".join(sentences)
    rendered_html = "".join(f"<p>{sentence}</p>" for sentence in sentences)

    outcome = verify_text_preservation(source_mdx, rendered_html, "body")

    assert outcome.passed is True
    assert outcome.score == 1.0
    assert outcome.errors == []
    assert outcome.warnings == []


def test_verify_text_preservation_below_threshold_reports_errors():
    # Two MDX sentences, only the first of which is present in the HTML:
    # the score is 0.5, which is under the 0.70 default threshold, so the
    # result fails. The errors list starts with the "누락 문장 (1/2):"
    # header and then quotes the missing sentence (short enough that it
    # is not truncated). The preservation-rate warning is attached too.
    preserved = "데이터 분석은 핵심 과정입니다."
    dropped = "전혀 다른 문맥의 두 번째 문장입니다."
    source_mdx = f"{preserved}\n{dropped}"
    rendered_html = f"<p>{preserved}</p>"

    outcome = verify_text_preservation(source_mdx, rendered_html, "core")

    assert outcome.passed is False
    assert outcome.score == 0.5
    assert outcome.checks == {"text_preservation": False}
    header = outcome.errors[0]
    missing_entries = outcome.errors[1:]
    assert header == "누락 문장 (1/2):"
    assert any("두 번째 문장" in entry for entry in missing_entries)
    assert outcome.warnings == ["보존율: 50% (1/2 문장)"]


def test_verify_text_preservation_truncates_long_missing_sentence():
    # A missing sentence longer than 60 chars must be rendered truncated,
    # with a `..."` tail. Phase Z lifts the 60 constant to a named module
    # value (_MISSING_SENTENCE_TRUNCATE_LEN) so the gate is auditable.
    long_sentence = "엄청나게 긴 문장이 들어가서 절단 동작을 검증합니다." + ("끝" * 60)
    mdx = long_sentence + "."
    html = "<p>관련 없는 문구</p>"
    # threshold=0.99 is passed so that any missing sentence fails the check.
    result = verify_text_preservation(mdx, html, "footer", threshold=0.99)
    assert result.passed is False
    # Header + at least one missing-line entry; the entry must end with `..."`.
    assert len(result.errors) >= 2
    last_entry = result.errors[-1]
    assert last_entry.endswith("...\"")
    # Text between the first pair of double quotes: truncated body + marker.
    quoted = last_entry.split('"', 2)[1]
    ellipsis = "..."
    assert quoted.endswith(ellipsis)
    # Remove exactly the three-character marker. str.rstrip(".") strips a
    # character *set*, so it would also eat a period that happens to be the
    # last character of the truncated body and make the length check fail
    # for the wrong reason.
    truncated_body = quoted[: -len(ellipsis)]
    assert len(truncated_body) == _MISSING_SENTENCE_TRUNCATE_LEN


def test_verify_text_preservation_caps_missing_report_at_limit():
    # Seven MDX sentences, none of them covered by the HTML. The result
    # must fail, and the errors list must hold the header plus exactly
    # _MISSING_SENTENCE_REPORT_LIMIT entries. The header still carries
    # the true missing/total counts (7/7) even though fewer are listed.
    sentence_count = 7
    source_mdx = "\n".join(
        f"전혀 다른 문맥의 문장 번호 {i} 입니다." for i in range(sentence_count)
    )
    rendered_html = "<p>관련 없는 문구</p>"

    outcome = verify_text_preservation(source_mdx, rendered_html, "core")

    assert outcome.passed is False
    assert outcome.errors[0] == "누락 문장 (7/7):"
    expected_length = 1 + _MISSING_SENTENCE_REPORT_LIMIT
    assert len(outcome.errors) == expected_length


def test_verify_text_preservation_custom_threshold_passes_at_50_percent():
    # With the threshold lowered to 0.50, preserving one of two sentences
    # (score 0.5) is enough to pass.
    preserved = "데이터 분석은 핵심 과정입니다."
    dropped = "전혀 다른 문맥의 두 번째 문장입니다."
    source_mdx = f"{preserved}\n{dropped}"
    rendered_html = f"<p>{preserved}</p>"

    outcome = verify_text_preservation(
        source_mdx,
        rendered_html,
        "core",
        threshold=0.50,
    )

    assert outcome.passed is True
    assert outcome.score == 0.5
    # The score is still below 1.0, so the 보존율 warning stays attached
    # for the trace surface.
    assert outcome.warnings == ["보존율: 50% (1/2 문장)"]