"""Tests for IMP-16-U1 unit u8: ``verify_text_preservation``.

Locks the Phase Z port of the deterministic text-preservation check
(Phase Q reference: ``src/content_verifier.py:206-273``). The function is
pure and composes u2 (extract_text_from_html), u3
(normalize_for_comparison), u6 (split_into_sentences), and u7
(_sentence_matches_html). No Phase Q import is exercised.
"""

from __future__ import annotations

from src.phase_z2_verification_utils import (
    VerificationResult,
    _MISSING_SENTENCE_REPORT_LIMIT,
    _MISSING_SENTENCE_TRUNCATE_LEN,
    _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
    verify_text_preservation,
)

# NOTE(review): the HTML fixtures below contained raw line breaks inside
# ordinary string literals, which is a SyntaxError. The breaks are kept as
# explicit "\n" escapes so every visible character is preserved. The
# fixtures presumably carried markup (e.g. paragraph tags) that was lost
# before this file reached review -- confirm against the original fixtures.


def test_verify_text_preservation_defaults_locked():
    """Pin the module-level defaults used by ``verify_text_preservation``."""
    # Locks the Phase Q caller convention: threshold default = 0.70,
    # missing-list report cap = 5, per-item truncate length = 60.
    assert _TEXT_PRESERVATION_DEFAULT_THRESHOLD == 0.70
    assert _MISSING_SENTENCE_REPORT_LIMIT == 5
    assert _MISSING_SENTENCE_TRUNCATE_LEN == 60


def test_verify_text_preservation_empty_sentences_returns_passed():
    """MDX with no sentences yields a passing, fully clean result."""
    # MDX that reduces to zero sentences after split_into_sentences
    # (e.g. headers only) must return passed=True with score 1.0 and
    # an empty errors/warnings surface. Phase Q parity: early return
    # before any HTML extraction.
    result = verify_text_preservation("# header only", "\nanything\n", "core")

    assert isinstance(result, VerificationResult)
    assert result.passed is True
    assert result.area_name == "core"
    assert result.checks == {"text_preservation": True}
    assert result.score == 1.0
    assert result.errors == []
    assert result.warnings == []


def test_verify_text_preservation_full_match_passes():
    """Every MDX sentence present in the HTML gives score 1.0."""
    # All MDX sentences preserved in HTML -> score 1.0, passed True,
    # no warnings (warnings only attached when score < 1.0), no errors.
    mdx = "데이터 분석은 핵심 과정입니다. 시각화로 의사 결정을 지원합니다."
    html = (
        "데이터 분석은 핵심 과정입니다.\n"
        "시각화로 의사 결정을 지원합니다.\n"
    )

    result = verify_text_preservation(mdx, html, "body")

    assert result.passed is True
    assert result.score == 1.0
    assert result.warnings == []
    assert result.errors == []


def test_verify_text_preservation_below_threshold_reports_errors():
    """A 50% preservation rate fails under the default threshold."""
    # Only one of two MDX sentences appears in the HTML -> score 0.5,
    # below default threshold 0.70 -> passed False, errors list opens
    # with the "누락 문장 (1/2):" header followed by quoted missing
    # sentences (truncation gate not crossed).
    mdx = (
        "데이터 분석은 핵심 과정입니다.\n"
        "전혀 다른 문맥의 두 번째 문장입니다."
    )
    html = "데이터 분석은 핵심 과정입니다.\n"

    result = verify_text_preservation(mdx, html, "core")

    assert result.passed is False
    assert result.score == 0.5
    assert result.checks == {"text_preservation": False}
    assert result.errors[0] == "누락 문장 (1/2):"
    assert any("두 번째 문장" in line for line in result.errors[1:])
    assert result.warnings == ["보존율: 50% (1/2 문장)"]


def test_verify_text_preservation_truncates_long_missing_sentence():
    """Missing sentences longer than the truncate length are cut short."""
    # A missing sentence longer than 60 chars must be rendered with
    # the "...\"" tail. Phase Z surface lifts the 60 constant to a
    # named module value (_MISSING_SENTENCE_TRUNCATE_LEN) so the gate
    # is auditable.
    long_sentence = "엄청나게 긴 문장이 들어가서 절단 동작을 검증합니다." + ("끝" * 60)
    mdx = long_sentence + "."
    html = "관련 없는 문구\n"

    result = verify_text_preservation(mdx, html, "footer", threshold=0.99)

    assert result.passed is False
    # Header + at least one missing-line entry; the entry must end with `..."`.
    assert len(result.errors) >= 2
    assert result.errors[-1].endswith("...\"")
    # Take the text between the first pair of double quotes and drop the
    # trailing ellipsis dots to recover the truncated sentence body.
    truncated_body = result.errors[-1].split('"', 2)[1].rstrip(".")
    assert len(truncated_body) == _MISSING_SENTENCE_TRUNCATE_LEN


def test_verify_text_preservation_caps_missing_report_at_limit():
    """The missing-sentence report is capped while the header stays exact."""
    # Generate seven MDX-only sentences with no HTML coverage.
    # passed=False, errors list = 1 header + at most 5 missing entries
    # (_MISSING_SENTENCE_REPORT_LIMIT). The header reports the true
    # missing/total counts even though only 5 are surfaced.
    mdx_lines = [f"전혀 다른 문맥의 문장 번호 {i} 입니다." for i in range(7)]
    mdx = "\n".join(mdx_lines)
    html = "관련 없는 문구\n"

    result = verify_text_preservation(mdx, html, "core")

    assert result.passed is False
    assert result.errors[0] == "누락 문장 (7/7):"
    assert len(result.errors) == 1 + _MISSING_SENTENCE_REPORT_LIMIT


def test_verify_text_preservation_custom_threshold_passes_at_50_percent():
    """An explicit threshold of 0.50 lets a 50% preservation rate pass."""
    # Lowering the threshold to 0.50 makes a 50% preservation pass.
    mdx = (
        "데이터 분석은 핵심 과정입니다.\n"
        "전혀 다른 문맥의 두 번째 문장입니다."
    )
    html = "데이터 분석은 핵심 과정입니다.\n"

    result = verify_text_preservation(mdx, html, "core", threshold=0.50)

    assert result.passed is True
    assert result.score == 0.5
    # Score < 1.0 so the 보존율 warning is still attached for trace surface.
    assert result.warnings == ["보존율: 50% (1/2 문장)"]