feat(IMP-16): U1 H3 verification utility port + U2 wiring design

U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility surface — VerificationResult, extract_text_from_html, normalize_for_comparison, extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation, detect_invented_text. 10 unit test files under tests/phase_z2/test_pz2_vu_*.py (56 tests). u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step 1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands in this commit — U2 stays blocked until IMP-07 reverse path is implemented + verified + runtime-hit. Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS / generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area / verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract upheld (utility is deterministic). Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 04:42:35 +09:00
parent 614c53358e
commit 23ba8b68cd
12 changed files with 1156 additions and 0 deletions
--- a/tests/phase_z2/test_pz2_vu_dataclass.py
+++ b/tests/phase_z2/test_pz2_vu_dataclass.py
@@ -0,0 +1,59 @@
+"""u1 — VerificationResult dataclass surface (IMP-16-U1).
+
+Locks the Phase Z verification utility module anchor and the
+VerificationResult shape so downstream units (u2~u10) can rely on it
+without importing src.content_verifier.
+"""
+from __future__ import annotations
+
+import ast
+import importlib
+
+import pytest
+
+
+def test_module_importable_without_content_verifier():
+    """Scan the utility module's AST for any import that names content_verifier."""
+    mod = importlib.import_module("src.phase_z2_verification_utils")
+    with open(mod.__file__, encoding="utf-8") as fh:  # close the handle deterministically
+        tree = ast.parse(fh.read())
+    msg = "Phase Z verification utility must not import src.content_verifier"
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            imported = [alias.name for alias in node.names]
+        elif isinstance(node, ast.ImportFrom):
+            # Also inspect imported names so `from src import content_verifier` is caught.
+            imported = [node.module or "", *(alias.name for alias in node.names)]
+        else:
+            continue
+        assert not any("content_verifier" in name for name in imported), msg
+
+
+def test_verification_result_defaults():
+    from src.phase_z2_verification_utils import VerificationResult
+
+    r = VerificationResult(passed=True, area_name="zone_test")
+    assert r.passed is True
+    assert r.area_name == "zone_test"
+    assert r.checks == {}
+    assert r.score == 0.0
+    assert r.errors == []
+    assert r.warnings == []
+
+
+def test_verification_result_independent_default_collections():
+    from src.phase_z2_verification_utils import VerificationResult
+
+    a = VerificationResult(passed=False, area_name="a")
+    b = VerificationResult(passed=False, area_name="b")
+    a.checks["x"] = True
+    a.errors.append("e")
+    a.warnings.append("w")
+    assert b.checks == {} and b.errors == [] and b.warnings == []
+
+
+def test_verification_result_required_fields():
+    from src.phase_z2_verification_utils import VerificationResult
+
+    with pytest.raises(TypeError):
+        VerificationResult()  # type: ignore[call-arg]
--- a/tests/phase_z2/test_pz2_vu_extract.py
+++ b/tests/phase_z2/test_pz2_vu_extract.py
@@ -0,0 +1,54 @@
+"""u2 — pure HTML text extraction surface (IMP-16-U1).
+
+Locks the deterministic visible-text extraction contract:
+  - <style> / <script> contents are excluded.
+  - Whitespace-only chunks are dropped; surviving chunks are stripped.
+  - Order of visible-text fragments is preserved.
+  - No import of src.content_verifier.
+"""
+from __future__ import annotations
+
+
+def test_extract_plain_text_fragments_in_order():
+    from src.phase_z2_verification_utils import extract_text_from_html
+
+    html = "<p>first</p><p>second</p><p>third</p>"
+    assert extract_text_from_html(html) == ["first", "second", "third"]
+
+
+def test_extract_skips_style_and_script_bodies():
+    from src.phase_z2_verification_utils import extract_text_from_html
+
+    html = (
+        "<html><head>"
+        "<style>body { color: red; } .x { font-size: 12px; }</style>"
+        "<script>var keep_out = 1;</script>"
+        "</head><body><p>visible</p></body></html>"
+    )
+    out = extract_text_from_html(html)
+    assert "visible" in out
+    joined = " ".join(out)
+    assert "color: red" not in joined
+    assert "keep_out" not in joined
+
+
+def test_extract_drops_whitespace_only_chunks_and_strips_survivors():
+    from src.phase_z2_verification_utils import extract_text_from_html
+
+    html = "<div>   \n\n   </div><div>  hello  </div><span>   world\t</span>"
+    out = extract_text_from_html(html)
+    assert out == ["hello", "world"]
+
+
+def test_extract_preserves_korean_and_inline_markup_text():
+    from src.phase_z2_verification_utils import extract_text_from_html
+
+    html = "<p>설계 <strong>방식</strong>의 왜곡</p>"
+    out = extract_text_from_html(html)
+    assert out == ["설계", "방식", "의 왜곡"]
+
+
+def test_extract_empty_input_returns_empty_list():
+    from src.phase_z2_verification_utils import extract_text_from_html
+
+    assert extract_text_from_html("") == []
--- a/tests/phase_z2/test_pz2_vu_integration.py
+++ b/tests/phase_z2/test_pz2_vu_integration.py
@@ -0,0 +1,106 @@
+"""Tests for IMP-16-U1 unit u10: sample-backed smoke without pipeline import.
+
+End-to-end smoke of the deterministic chain (extract_text_from_html ∘
+normalize_for_comparison ∘ split_into_sentences ∘ _sentence_matches_html
+→ verify_text_preservation / detect_invented_text) on a real
+``samples/mdx_batch`` MDX file. Per Stage 2 rationale: smoke coverage
+uses the sample but does NOT hardcode a sample-specific pass.
+
+Also locks the AI-isolation contract for the verification axis: this
+test and the production module MUST NOT import orchestrator /
+phase_z2_pipeline / Phase Q content_verifier / Kei client.
+"""
+from __future__ import annotations
+
+import ast
+from pathlib import Path
+
+from src.phase_z2_verification_utils import (
+    VerificationResult,
+    detect_invented_text,
+    verify_text_preservation,
+)
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+_SAMPLE_MDX_PATH = _REPO_ROOT / "samples" / "mdx_batch" / "02.mdx"
+_FORBIDDEN_IMPORT_ROOTS = (
+    "orchestrator",
+    "src.phase_z2_pipeline",
+    "src.content_verifier",
+    "src.kei_client",
+)
+
+
+def _module_imports(path: Path) -> set[str]:
+    """Return dotted names imported by ``path``; ``from pkg import n`` also yields ``pkg.n``."""
+    names: set[str] = set()
+    for node in ast.walk(ast.parse(path.read_text(encoding="utf-8"))):
+        if isinstance(node, ast.Import):
+            names.update(alias.name for alias in node.names)
+        elif isinstance(node, ast.ImportFrom) and node.module:
+            # `from src import content_verifier` must surface as `src.content_verifier`.
+            names.update({node.module, *(f"{node.module}.{a.name}" for a in node.names)})
+    return names
+
+
+def test_integration_sample_mdx_exists():
+    # Smoke fixture availability gate; explicit so a missing sample
+    # surfaces as a fixture problem, not a downstream assertion failure.
+    assert _SAMPLE_MDX_PATH.exists(), f"sample missing: {_SAMPLE_MDX_PATH}"
+
+
+def test_integration_full_chain_runs_on_real_sample():
+    # Locks API contract over the full chain on a real MDX: returns a
+    # VerificationResult, area_name passthrough works, score within
+    # [0.0, 1.0], and detect_invented_text returns a list. No assertion
+    # is made about a specific score so the sample is not hardcoded as
+    # the pipeline's pass rule (Stage 2 u10 rationale).
+    mdx = _SAMPLE_MDX_PATH.read_text(encoding="utf-8")
+    html = f"<div>{mdx}</div>"
+    result = verify_text_preservation(mdx, html, "smoke")
+    assert isinstance(result, VerificationResult)
+    assert result.area_name == "smoke"
+    assert 0.0 <= result.score <= 1.0
+    assert isinstance(detect_invented_text(mdx, html), list)
+
+
+def test_integration_mirrored_html_passes_default_threshold():
+    # When the HTML side mirrors the MDX text verbatim, the deterministic
+    # preservation check must pass the Phase Q-default threshold (0.70).
+    # This is the integration-level guarantee for the B-2 reverse path:
+    # round-tripped HTML that preserves the MDX text must verify.
+    mdx = _SAMPLE_MDX_PATH.read_text(encoding="utf-8")
+    html = f"<div>{mdx}</div>"
+    result = verify_text_preservation(mdx, html, "smoke")
+    assert result.passed is True
+
+
+def test_integration_fabricated_html_flags_invented_text():
+    # Locks the hallucination-guard end-to-end: HTML text that has no
+    # keyword anchor in the source MDX must be flagged. Synthetic
+    # sentence chosen so its keywords (완전히, 만들어낸, 원본, 등장 …)
+    # do not appear in samples/mdx_batch/02.mdx.
+    mdx = _SAMPLE_MDX_PATH.read_text(encoding="utf-8")
+    fabricated_html = (
+        "<p>완전히 새로 만들어낸 문장으로 원본에는 전혀 등장하지 않는 내용입니다.</p>"
+    )
+    invented = detect_invented_text(mdx, fabricated_html)
+    assert isinstance(invented, list)
+    assert len(invented) >= 1
+
+
+def test_integration_no_forbidden_imports():
+    # AI-isolation + Phase Z scope-lock guard. Production module and
+    # this test file must not import orchestrator / phase_z2_pipeline /
+    # Phase Q content_verifier / Kei client. AST scan of the on-disk
+    # source (not the imported module) so re-exports cannot mask a leak.
+    for path in (
+        _REPO_ROOT / "src" / "phase_z2_verification_utils.py",
+        Path(__file__).resolve(),
+    ):
+        modules = _module_imports(path)
+        for module in modules:
+            for forbidden in _FORBIDDEN_IMPORT_ROOTS:
+                assert not (module == forbidden or module.startswith(forbidden + ".")), (
+                    f"{path.name} imports forbidden module: {module}"
+                )
--- a/tests/phase_z2/test_pz2_vu_invented.py
+++ b/tests/phase_z2/test_pz2_vu_invented.py
@@ -0,0 +1,84 @@
+"""Tests for IMP-16-U1 unit u9: ``detect_invented_text``.
+
+Locks the Phase Z port of the deterministic hallucination guard
+(Phase Q reference: ``src/content_verifier.py:276-315``). The function
+is pure and composes u2 (extract_text_from_html), u3
+(normalize_for_comparison), and u4 (extract_keywords). No Phase Q
+import is exercised.
+"""
+from __future__ import annotations
+
+from src.phase_z2_verification_utils import (
+    _INVENTED_TEXT_ALLOWED_LABELS,
+    _INVENTED_TEXT_CSS_NUMBER_PATTERN,
+    _INVENTED_TEXT_KEYWORD_THRESHOLD,
+    _INVENTED_TEXT_MIN_LENGTH,
+    _INVENTED_TEXT_TRUNCATE_LEN,
+    detect_invented_text,
+)
+
+
+def test_detect_invented_text_constants_locked() -> None:
+    """Lock the five named module constants ported from Phase Q literals."""
+    assert _INVENTED_TEXT_MIN_LENGTH == 15
+    assert _INVENTED_TEXT_ALLOWED_LABELS == frozenset(
+        {"용어 정의", "핵심 메시지", "상세 비교"}
+    )
+    assert _INVENTED_TEXT_CSS_NUMBER_PATTERN.pattern == r"^[\d\s.,%px#rgb()]+$"
+    assert _INVENTED_TEXT_KEYWORD_THRESHOLD == 0.4
+    assert _INVENTED_TEXT_TRUNCATE_LEN == 80
+
+
+def test_detect_invented_text_returns_empty_when_html_is_in_mdx() -> None:
+    """Text whose keywords fully appear in MDX is NOT flagged."""
+    mdx = "원본 콘텐츠는 분석에 관한 것입니다."
+    html = "<p>원본 콘텐츠는 분석에 관한 것입니다.</p>"
+    assert detect_invented_text(mdx, html) == []
+
+
+def test_detect_invented_text_flags_text_with_low_keyword_overlap() -> None:
+    """Text whose keywords do not appear in MDX is flagged as invented."""
+    mdx = "원본 콘텐츠는 분석에 관한 것입니다."
+    html = "<p>완전히 다른 발명된 텍스트가 여기 있습니다 일반적이지 않은</p>"
+    result = detect_invented_text(mdx, html)
+    assert len(result) == 1
+    assert "발명된" in result[0]
+
+
+def test_detect_invented_text_skips_short_text() -> None:
+    """Text shorter than ``min_length`` is not even considered."""
+    mdx = "원본 콘텐츠"
+    html = "<p>짧은 텍스트</p>"
+    assert detect_invented_text(mdx, html) == []
+
+
+def test_detect_invented_text_skips_allowed_structural_labels() -> None:
+    """Allowed labels are skipped even when keyword overlap is zero.
+
+    Phase Q default ``min_length=15`` makes the allowed-label gate
+    unreachable for the bundled labels (all < 15 chars). The Phase Z
+    port preserves the gate verbatim — exercised here with
+    ``min_length=0`` so the structural-label short-circuit is
+    actually observable.
+    """
+    mdx = "원본 콘텐츠"
+    html = "<h2>용어 정의</h2><h2>핵심 메시지</h2><h2>상세 비교</h2>"
+    assert detect_invented_text(mdx, html, min_length=0) == []
+
+
+def test_detect_invented_text_skips_css_number_pattern_fragments() -> None:
+    """CSS/numeric fragments (e.g. ``100px 200px 300px``) are skipped."""
+    mdx = "원본 콘텐츠"
+    html = "<style>.x { padding: 100px; }</style><div>100px 200px 300px</div>"
+    assert detect_invented_text(mdx, html) == []
+
+
+def test_detect_invented_text_truncates_flagged_value_to_80_chars() -> None:
+    """A flagged fragment longer than 80 chars is truncated for reporting."""
+    mdx = "원본 콘텐츠"
+    invented = "발명" * 50
+    html = f"<p>{invented}</p>"
+    result = detect_invented_text(mdx, html)
+    assert len(result) == 1
+    assert len(result[0]) == 80
+    assert result[0] == invented[:80]
--- a/tests/phase_z2/test_pz2_vu_keywords.py
+++ b/tests/phase_z2/test_pz2_vu_keywords.py
@@ -0,0 +1,52 @@
+"""Tests for Phase Z2 IMP-16-U1 unit u4: extract_keywords.
+
+Locks the deterministic surface: 3+ character tokens on the Phase Z H3
+character class, longest-match trailing particle strip with a length>=2
+stem guard, and no Phase Q content_verifier import.
+"""
+from __future__ import annotations
+
+from src.phase_z2_verification_utils import _PARTICLES, extract_keywords
+
+
+def test_extract_keywords_drops_short_tokens() -> None:
+    # "AI" (2 chars) and "X" (1 char) are dropped; "기술" (2 chars) is dropped too.
+    # "데이터" (3 chars) survives; "분석함" (3 chars) survives.
+    assert extract_keywords("AI 기술 X 데이터 분석함") == ["데이터", "분석함"]
+
+
+def test_extract_keywords_strips_trailing_particle_when_stem_ge_2() -> None:
+    # "설계의" (3 chars) → particle "의" stripped, stem "설계" (2 chars) kept.
+    # "방식은" → particle "은" stripped → "방식".
+    assert extract_keywords("설계의 방식은") == ["설계", "방식"]
+
+
+def test_extract_keywords_keeps_token_when_stem_would_be_too_short() -> None:
+    # "에서" guard: a 3-char token whose 2-char suffix is a particle
+    # but whose stem (1 char) is < 2 must keep the original token.
+    # "안에서" → suffix "에서" len 2, stem "안" len 1 → guard fires,
+    # falls through, then next particle "서" is NOT in _PARTICLES,
+    # so the whole token "안에서" remains.
+    assert extract_keywords("안에서") == ["안에서"]
+
+
+def test_extract_keywords_longest_match_particle_wins() -> None:
+    # "_PARTICLES" is sorted longest-first, so "에서" wins over "서"/"에".
+    # "현장에서" → "에서" stripped → "현장".
+    assert "에서" in _PARTICLES
+    assert extract_keywords("현장에서") == ["현장"]
+
+
+def test_extract_keywords_tokenises_korean_alnum_and_parens() -> None:
+    # The Phase Z H3 character class is [가-힣a-zA-Z0-9()]+.
+    # "프로젝트(2024)" is one token; "Hello!" splits into "Hello" only.
+    # Punctuation outside the class acts as a delimiter.
+    result = extract_keywords("프로젝트(2024) Hello! World123")
+    assert "프로젝트(2024)" in result
+    assert "Hello" in result
+    assert "World123" in result
+    assert "!" not in "".join(result)
+
+
+def test_extract_keywords_empty_returns_empty() -> None:
+    assert extract_keywords("") == []
--- a/tests/phase_z2/test_pz2_vu_match_helper.py
+++ b/tests/phase_z2/test_pz2_vu_match_helper.py
@@ -0,0 +1,66 @@
+"""Tests for IMP-16-U1 unit u7: ``_sentence_matches_html``.
+
+Locks the Phase Z port of the deterministic per-sentence match
+helper (Phase Q reference: inline body of ``verify_text_preservation``
+at src/content_verifier.py:232-251). The helper is pure; no Phase Q
+import is exercised. Thresholds are locked as named constants so the
+0.6 / 0.65 surface cannot drift silently.
+"""
+from __future__ import annotations
+
+from src.phase_z2_verification_utils import (
+    _SENTENCE_KEYWORD_MATCH_THRESHOLD,
+    _SENTENCE_SEQUENCE_MATCH_THRESHOLD,
+    _sentence_matches_html,
+)
+
+
+def test_match_helper_thresholds_locked():
+    assert _SENTENCE_KEYWORD_MATCH_THRESHOLD == 0.6
+    assert _SENTENCE_SEQUENCE_MATCH_THRESHOLD == 0.65
+
+
+def test_match_helper_returns_true_when_no_keywords():
+    # "AI" tokenises to a single 2-char token which extract_keywords drops
+    # (len < 3 gate). Empty keyword list -> helper returns True regardless
+    # of HTML side. Phase Q parity: matched += 1; continue on empty keywords.
+    assert _sentence_matches_html("AI", "", []) is True
+
+
+def test_match_helper_keyword_ratio_meets_threshold():
+    # Sentence "데이터 분석의 핵심" -> keywords = ["데이터", "분석"]:
+    #   "데이터" (len 3, no particle ending) kept;
+    #   "분석의" (len 3, ends with "의", stem "분석" len 2) -> "분석" kept;
+    #   "핵심" (len 2 < 3) dropped.
+    # Both keywords are substrings of the html_combined string, so
+    # kw_ratio = 2 / 2 = 1.0 >= 0.6 -> True via keyword axis.
+    assert _sentence_matches_html(
+        "데이터 분석의 핵심",
+        "데이터 분석을 수행합니다",
+        ["데이터 분석을 수행합니다"],
+    ) is True
+
+
+def test_match_helper_sequence_ratio_fallback():
+    # Sentence "데이터 분석" -> keywords = ["데이터"] (the 2-char "분석"
+    # is dropped by the len<3 gate). "데이터" is NOT in html_combined,
+    # so kw_ratio = 0. The SequenceMatcher fallback compares the
+    # normalized sentence against each normalized html_text; the second
+    # fragment matches verbatim, yielding ratio 1.0 >= 0.65 -> True.
+    assert _sentence_matches_html(
+        "데이터 분석",
+        "abc xyz",
+        ["abc xyz", "데이터 분석"],
+    ) is True
+
+
+def test_match_helper_below_both_thresholds_returns_false():
+    # No keyword overlap and no high-similarity html fragment:
+    # kw_ratio = 0, best SequenceMatcher ratio is far below 0.65.
+    # Helper must return False so verify_text_preservation (u8)
+    # records the sentence as missing.
+    assert _sentence_matches_html(
+        "데이터 분석",
+        "abc xyz",
+        ["abc xyz"],
+    ) is False
--- a/tests/phase_z2/test_pz2_vu_meta_strip.py
+++ b/tests/phase_z2/test_pz2_vu_meta_strip.py
@@ -0,0 +1,73 @@
+"""u5 — meta-line stripping surface (IMP-16-U1).
+
+Locks the deterministic meta-line filter contract:
+  - lines whose stripped form starts with any ``_META_PREFIXES`` entry
+    are dropped (8 prefix surface);
+  - lines containing any ``_META_INLINE_FRAGMENTS`` entry are dropped
+    (3 inline fragment surface);
+  - other lines pass through with original whitespace preserved;
+  - empty input returns the empty string;
+  - no import of src.content_verifier.
+"""
+from __future__ import annotations
+
+
+def test_strip_meta_lines_drops_prefix_lines():
+    from src.phase_z2_verification_utils import _META_PREFIXES, strip_meta_lines
+
+    # Exactly the 8-prefix Phase Z surface — locks both content and size.
+    assert _META_PREFIXES == [
+        "제목 라벨:",
+        "표현 의도:",
+        "슬라이드 주인공",
+        "가장 큰 시각적 비중",
+        "시각적으로",
+        "간결하게 제기",
+        "개별 증거로 제시",
+        "계층적으로 시각화",
+    ]
+    text = "제목 라벨: 어떤 제목\n본문 한 줄\n표현 의도: 강조"
+    assert strip_meta_lines(text) == "본문 한 줄"
+
+
+def test_strip_meta_lines_matches_prefix_on_stripped_line():
+    from src.phase_z2_verification_utils import strip_meta_lines
+
+    # Leading whitespace must not protect a meta-prefix line.
+    text = "   제목 라벨: indented meta\n실제 본문"
+    assert strip_meta_lines(text) == "실제 본문"
+
+
+def test_strip_meta_lines_drops_inline_fragment_lines():
+    from src.phase_z2_verification_utils import (
+        _META_INLINE_FRAGMENTS,
+        strip_meta_lines,
+    )
+
+    # Phase Z inline-fragment surface is exactly these three.
+    assert _META_INLINE_FRAGMENTS == (
+        "현상-문제 인과관계",
+        "상위-하위 포함 관계",
+        "독립적 나열",
+    )
+    text = (
+        "구조: 현상-문제 인과관계 로 설계\n"
+        "유형: 상위-하위 포함 관계\n"
+        "패턴: 독립적 나열 형태\n"
+        "그래서 결론은 한 줄"
+    )
+    assert strip_meta_lines(text) == "그래서 결론은 한 줄"
+
+
+def test_strip_meta_lines_keeps_unrelated_lines_verbatim():
+    from src.phase_z2_verification_utils import strip_meta_lines
+
+    # Non-meta lines must pass through with original whitespace preserved.
+    text = "  본문 한 줄\n\n다른 줄"
+    assert strip_meta_lines(text) == "  본문 한 줄\n\n다른 줄"
+
+
+def test_strip_meta_lines_empty_input_returns_empty_string():
+    from src.phase_z2_verification_utils import strip_meta_lines
+
+    assert strip_meta_lines("") == ""
--- a/tests/phase_z2/test_pz2_vu_normalize.py
+++ b/tests/phase_z2/test_pz2_vu_normalize.py
@@ -0,0 +1,64 @@
+"""u3 — Korean text normalization surface (IMP-16-U1).
+
+Locks the deterministic text-normalization contract:
+  - whitespace runs collapse + strip;
+  - bullet markers from the Phase Q surface set are removed;
+  - the small HTML-entity set used by the reverse path is decoded;
+  - a single trailing 개조식 ending is folded to its 서술형 form;
+  - particle list is sorted longest-first (matching the Phase Q surface
+    so downstream keyword stripping is greedy);
+  - no import of src.content_verifier.
+"""
+from __future__ import annotations
+
+
+def test_normalize_collapses_whitespace_and_strips():
+    from src.phase_z2_verification_utils import normalize_for_comparison
+
+    assert normalize_for_comparison("  hello\n\n  world\t") == "hello world"
+
+
+def test_normalize_removes_bullet_markers():
+    from src.phase_z2_verification_utils import normalize_for_comparison
+
+    # Each marker from the Phase Q surface set must be stripped.
+    for marker in ["•", "◦", "·", "-", "▪", "▸", "►"]:
+        assert normalize_for_comparison(f"{marker} 항목") == "항목"
+
+
+def test_normalize_decodes_html_entities():
+    from src.phase_z2_verification_utils import normalize_for_comparison
+
+    text = "A &amp; B &lt;tag&gt; &nbsp; &#39;q&#39; &quot;d&quot;"
+    assert normalize_for_comparison(text) == "A & B <tag>   'q' \"d\""
+
+
+def test_normalize_folds_trailing_gaejo_endings():
+    from src.phase_z2_verification_utils import normalize_for_comparison
+
+    assert normalize_for_comparison("적용함") == "적용한다"
+    assert normalize_for_comparison("필요됨") == "필요된다"
+    assert normalize_for_comparison("값이 있음") == "값이 있다"
+    assert normalize_for_comparison("자료 없음") == "자료 없다"
+    assert normalize_for_comparison("결과임") == "결과이다"
+    assert normalize_for_comparison("적용되었음") == "적용되었다"
+    assert normalize_for_comparison("적용되었음.") == "적용되었음."  # trailing punct blocks fold
+
+
+def test_normalize_only_folds_one_ending_and_only_at_end():
+    from src.phase_z2_verification_utils import normalize_for_comparison
+
+    # 'break' after first match: only the suffix is folded, mid-string '함' is left alone.
+    assert normalize_for_comparison("함수를 적용함") == "함수를 적용한다"
+    # No fold when the ending is not the last token.
+    assert normalize_for_comparison("적용함 그리고 종료") == "적용함 그리고 종료"
+
+
+def test_particles_sorted_longest_first():
+    from src.phase_z2_verification_utils import _PARTICLES
+
+    lengths = [len(p) for p in _PARTICLES]
+    assert lengths == sorted(lengths, reverse=True)
+    # Membership spot-check, not a size check: a 2-char and a 1-char particle
+    # must be present (Korean-locale inventory; nothing from REQUIRED_PATTERNS).
+    assert "에서" in _PARTICLES and "는" in _PARTICLES
--- a/tests/phase_z2/test_pz2_vu_preservation.py
+++ b/tests/phase_z2/test_pz2_vu_preservation.py
@@ -0,0 +1,119 @@
+"""Tests for IMP-16-U1 unit u8: ``verify_text_preservation``.
+
+Locks the Phase Z port of the deterministic text-preservation check
+(Phase Q reference: ``src/content_verifier.py:206-273``). The function
+is pure and composes u2 (extract_text_from_html), u3
+(normalize_for_comparison), u6 (split_into_sentences), and u7
+(_sentence_matches_html). No Phase Q import is exercised.
+"""
+from __future__ import annotations
+
+from src.phase_z2_verification_utils import (
+    VerificationResult,
+    _MISSING_SENTENCE_REPORT_LIMIT,
+    _MISSING_SENTENCE_TRUNCATE_LEN,
+    _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
+    verify_text_preservation,
+)
+
+
+def test_verify_text_preservation_defaults_locked():
+    # Locks the Phase Q caller convention: threshold default = 0.70,
+    # missing-list report cap = 5, per-item truncate length = 60.
+    assert _TEXT_PRESERVATION_DEFAULT_THRESHOLD == 0.70
+    assert _MISSING_SENTENCE_REPORT_LIMIT == 5
+    assert _MISSING_SENTENCE_TRUNCATE_LEN == 60
+
+
+def test_verify_text_preservation_empty_sentences_returns_passed():
+    # MDX that reduces to zero sentences after split_into_sentences
+    # (e.g. headers only) must return passed=True with score 1.0 and
+    # an empty errors/warnings surface. Phase Q parity: early return
+    # before any HTML extraction.
+    result = verify_text_preservation("# header only", "<p>anything</p>", "core")
+    assert isinstance(result, VerificationResult)
+    assert result.passed is True
+    assert result.area_name == "core"
+    assert result.checks == {"text_preservation": True}
+    assert result.score == 1.0
+    assert result.errors == []
+    assert result.warnings == []
+
+
+def test_verify_text_preservation_full_match_passes():
+    # All MDX sentences preserved in HTML -> score 1.0, passed True,
+    # no warnings (warnings only attached when score < 1.0), no errors.
+    mdx = "데이터 분석은 핵심 과정입니다. 시각화로 의사 결정을 지원합니다."
+    html = (
+        "<p>데이터 분석은 핵심 과정입니다.</p>"
+        "<p>시각화로 의사 결정을 지원합니다.</p>"
+    )
+    result = verify_text_preservation(mdx, html, "body")
+    assert result.passed is True
+    assert result.score == 1.0
+    assert result.warnings == []
+    assert result.errors == []
+
+
+def test_verify_text_preservation_below_threshold_reports_errors():
+    # Only one of two MDX sentences appears in the HTML -> score 0.5,
+    # below default threshold 0.70 -> passed False, errors list opens
+    # with the "누락 문장 (1/2):" header followed by quoted missing
+    # sentences (truncation gate not crossed).
+    mdx = (
+        "데이터 분석은 핵심 과정입니다.\n"
+        "전혀 다른 문맥의 두 번째 문장입니다."
+    )
+    html = "<p>데이터 분석은 핵심 과정입니다.</p>"
+    result = verify_text_preservation(mdx, html, "core")
+    assert result.passed is False
+    assert result.score == 0.5
+    assert result.checks == {"text_preservation": False}
+    assert result.errors[0] == "누락 문장 (1/2):"
+    assert any("두 번째 문장" in line for line in result.errors[1:])
+    assert result.warnings == ["보존율: 50% (1/2 문장)"]
+
+
+def test_verify_text_preservation_truncates_long_missing_sentence():
+    # A missing sentence longer than 60 chars must be rendered with the
+    # "...\"" tail. The cut length is the named module constant
+    # _MISSING_SENTENCE_TRUNCATE_LEN so the gate is auditable.
+    long_sentence = "엄청나게 긴 문장이 들어가서 절단 동작을 검증합니다." + ("끝" * 60)
+    mdx = long_sentence + "."
+    html = "<p>관련 없는 문구</p>"
+    result = verify_text_preservation(mdx, html, "footer", threshold=0.99)
+    assert result.passed is False
+    # Header + at least one missing-line entry; the entry must end with `..."`.
+    assert len(result.errors) >= 2
+    assert result.errors[-1].endswith("...\"")
+    # Slice off exactly the 3-char ellipsis; rstrip(".") would also eat a body-final period.
+    truncated_body = result.errors[-1].split('"', 2)[1][: -len("...")]
+    assert len(truncated_body) == _MISSING_SENTENCE_TRUNCATE_LEN
+
+
+def test_verify_text_preservation_caps_missing_report_at_limit():
+    # Generate seven MDX-only sentences with no HTML coverage.
+    # passed=False, errors list = 1 header + at most 5 missing entries
+    # (_MISSING_SENTENCE_REPORT_LIMIT). The header reports the true
+    # missing/total counts even though only 5 are surfaced.
+    mdx_lines = [f"전혀 다른 문맥의 문장 번호 {i} 입니다." for i in range(7)]
+    mdx = "\n".join(mdx_lines)
+    html = "<p>관련 없는 문구</p>"
+    result = verify_text_preservation(mdx, html, "core")
+    assert result.passed is False
+    assert result.errors[0] == "누락 문장 (7/7):"
+    assert len(result.errors) == 1 + _MISSING_SENTENCE_REPORT_LIMIT
+
+
+def test_verify_text_preservation_custom_threshold_passes_at_50_percent():
+    # Lowering the threshold to 0.50 makes a 50% preservation pass.
+    mdx = (
+        "데이터 분석은 핵심 과정입니다.\n"
+        "전혀 다른 문맥의 두 번째 문장입니다."
+    )
+    html = "<p>데이터 분석은 핵심 과정입니다.</p>"
+    result = verify_text_preservation(mdx, html, "core", threshold=0.50)
+    assert result.passed is True
+    assert result.score == 0.5
+    # Score < 1.0 so the 보존율 warning is still attached for trace surface.
+    assert result.warnings == ["보존율: 50% (1/2 문장)"]
--- a/tests/phase_z2/test_pz2_vu_sentence_split.py
+++ b/tests/phase_z2/test_pz2_vu_sentence_split.py
@@ -0,0 +1,69 @@
+"""Tests for IMP-16-U1 unit u6: split_into_sentences.
+
+Locks the Phase Z port of the H3 deterministic sentence-splitter
+surface (Phase Q reference: src/content_verifier.py:174-199). The
+function is deterministic, pure, and composes ``strip_meta_lines``;
+no Phase Q import is exercised.
+"""
+from __future__ import annotations
+
+from src.phase_z2_verification_utils import (
+    _BULLET_MARKER_PATTERN,
+    _MIN_SENTENCE_LEN,
+    _SENTENCE_SPLIT_PATTERN,
+    split_into_sentences,
+)
+
+
+def test_split_into_sentences_applies_strip_meta_lines_first():
+    text = (
+        "제목 라벨: 설계 방식의 왜곡\n"
+        "본문 첫 문장입니다.\n"
+        "본문 둘째 문장입니다."
+    )
+    result = split_into_sentences(text)
+    assert result == ["본문 첫 문장입니다.", "본문 둘째 문장입니다."]
+
+
+def test_split_into_sentences_skips_empty_and_header_lines():
+    text = "\n# 대목차\n## 소목차\n실제 본문 문장입니다.\n"
+    assert split_into_sentences(text) == ["실제 본문 문장입니다."]
+
+
+def test_split_into_sentences_strips_numeric_and_punctuated_markers():
+    assert _BULLET_MARKER_PATTERN.match("1. 첫 단계입니다.")
+    assert _BULLET_MARKER_PATTERN.match("2) 둘째 단계입니다.")
+    assert _BULLET_MARKER_PATTERN.match("-. 첫 항목입니다.")
+    assert _BULLET_MARKER_PATTERN.match("•. 둘째 항목입니다.")
+    text = (
+        "1. 첫 단계입니다.\n"
+        "2) 둘째 단계입니다.\n"
+        "-. 셋째 항목입니다."
+    )
+    assert split_into_sentences(text) == [
+        "첫 단계입니다.",
+        "둘째 단계입니다.",
+        "셋째 항목입니다.",
+    ]
+
+
+def test_split_into_sentences_keeps_bare_dash_bullet_unstripped():
+    assert _BULLET_MARKER_PATTERN.match("- 항목 하나입니다.") is None
+    text = "- 항목 하나입니다."
+    assert split_into_sentences(text) == ["- 항목 하나입니다."]
+
+
+def test_split_into_sentences_splits_on_period_boundary():
+    assert _SENTENCE_SPLIT_PATTERN.pattern == r"(?<=\.)\s+"
+    text = "첫 문장입니다. 둘째 문장입니다. 셋째 문장입니다."
+    assert split_into_sentences(text) == [
+        "첫 문장입니다.",
+        "둘째 문장입니다.",
+        "셋째 문장입니다.",
+    ]
+
+
+def test_split_into_sentences_drops_parts_shorter_than_min_len():
+    assert _MIN_SENTENCE_LEN == 5
+    text = "OK. 충분히 긴 문장입니다."
+    assert split_into_sentences(text) == ["충분히 긴 문장입니다."]