feat(IMP-16): U1 H3 verification utility port + U2 wiring design

U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility surface — VerificationResult, extract_text_from_html, normalize_for_comparison, extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation, detect_invented_text. 10 unit-test modules under tests/phase_z2/test_pz2_vu_*.py (56 tests). u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step 1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands in this commit — U2 stays blocked until IMP-07 reverse path is implemented + verified + runtime-hit. Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS / generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area / verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract upheld (utility is deterministic). Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 04:42:35 +09:00
parent 614c53358e
commit 23ba8b68cd
12 changed files with 1156 additions and 0 deletions
--- a/src/phase_z2_verification_utils.py
+++ b/src/phase_z2_verification_utils.py
@@ -0,0 +1,335 @@
+"""Phase Z2 deterministic verification utilities (IMP-16-U1 port).
+
+Ports the H3 deterministic subset of src/content_verifier.py into a
+Phase Z-owned module so the Phase Z pipeline never imports the Phase Q
+reference-only module (which co-hosts H4/H5 Kei/AI assets).
+
+Scope: deterministic, pure, no I/O, no LLM call, no httpx/SSE.
+Wiring into Step 1/2/14/21/22 is gated behind IMP-07 (see
+docs/architecture/IMP-16-U2-WIRING-DESIGN.md when u11 lands).
+"""
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from html.parser import HTMLParser
+
+
+@dataclass
+class VerificationResult:
+    """Outcome of one deterministic verification check for one area.
+
+    Mirrors the Phase Q VerificationResult shape so callers ported from
+    that surface keep their field access; the value semantics are
+    Phase Z-owned (no Phase Q area defaults baked in).
+    """
+
+    # Overall verdict of the check.
+    passed: bool
+    # Caller-supplied label of the area being verified.
+    area_name: str
+    # Per-check verdicts keyed by check name (verify_text_preservation
+    # writes the key "text_preservation").
+    checks: dict[str, bool] = field(default_factory=dict)
+    # Numeric score; verify_text_preservation stores the fraction of
+    # matched sentences (0.0-1.0) here.
+    score: float = 0.0
+    # Human-readable failure details; empty by default.
+    errors: list[str] = field(default_factory=list)
+    # Non-fatal notes; empty by default.
+    warnings: list[str] = field(default_factory=list)
+
+
+class _TextExtractor(HTMLParser):
+    """Extract visible text only. Skips <style> and <script> bodies.
+
+    Pure stdlib (html.parser). Whitespace-only data chunks are dropped;
+    surviving chunks are stripped before appending to preserve token
+    boundaries for downstream normalization / keyword logic.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.texts: list[str] = []
+        self._skip = False
+
+    def handle_starttag(self, tag, attrs):
+        if tag in ("style", "script"):
+            self._skip = True
+
+    def handle_endtag(self, tag):
+        if tag in ("style", "script"):
+            self._skip = False
+
+    def handle_data(self, data):
+        if not self._skip:
+            stripped = data.strip()
+            if stripped:
+                self.texts.append(stripped)
+
+
+def extract_text_from_html(html: str) -> list[str]:
+    """Return ordered list of visible text fragments from an HTML string.
+
+    Deterministic, pure: no I/O, no LLM, no network. Used by Phase Z
+    verification to compare reverse-path HTML against MDX text without
+    importing the Phase Q reference-only module.
+    """
+    parser = _TextExtractor()
+    parser.feed(html)
+    return parser.texts
+
+
+# Korean particles that extract_keywords strips from the end of a token.
+# Sorted longest-first so a two-character particle such as "으로" is tried
+# before the one-character "로" it ends with.
+_PARTICLES: list[str] = sorted(
+    ["에서", "으로", "부터", "까지", "에게", "한테",
+     "은", "는", "이", "가", "을", "를", "에", "의",
+     "로", "와", "과", "도", "만", "께"],
+    key=len, reverse=True,
+)
+
+# Terse noun-form (개조식) endings mapped to their declarative (서술형)
+# forms. normalize_for_comparison rewrites only the first entry, in
+# insertion order, that the text ends with.
+_ENDING_NORMALIZE: dict[str, str] = {
+    "있음": "있다", "됨": "된다", "함": "한다", "임": "이다",
+    "없음": "없다", "았음": "았다", "었음": "었다",
+}
+
+
+def normalize_for_comparison(text: str) -> str:
+    """Normalize text for deterministic comparison (Phase Z H3 port).
+
+    Steps (order matters): collapse whitespace, strip bullet markers,
+    decode the small HTML-entity set used by the reverse path, then
+    fold a single trailing 개조식 ending to its 서술형 form.
+    """
+    text = re.sub(r"\s+", " ", text).strip()
+    text = re.sub(r"[•◦·\-▪▸►]", "", text).strip()
+    text = text.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
+    text = text.replace("&nbsp;", " ").replace("&#39;", "'").replace("&quot;", '"')
+    for gaejo, seosul in _ENDING_NORMALIZE.items():
+        if text.endswith(gaejo):
+            text = text[: -len(gaejo)] + seosul
+            break
+    return text
+
+
+def extract_keywords(text: str) -> list[str]:
+    """Extract length>=3 tokens, then strip a trailing Korean particle.
+
+    Deterministic, pure: tokenises on the Phase Z H3 character class
+    ``[가-힣a-zA-Z0-9()]+``, drops tokens shorter than 3 characters,
+    and folds a single longest-match trailing particle from
+    ``_PARTICLES`` when the remaining stem is still length >= 2.
+    """
+    words = re.findall(r"[가-힣a-zA-Z0-9()]+", text)
+    keywords: list[str] = []
+    for w in words:
+        if len(w) < 3:
+            continue
+        for p in _PARTICLES:
+            if w.endswith(p) and len(w) - len(p) >= 2:
+                w = w[: -len(p)]
+                break
+        if len(w) >= 2:
+            keywords.append(w)
+    return keywords
+
+
+# Line prefixes that mark prompt meta/instruction lines. strip_meta_lines
+# drops any line whose stripped form starts with one of these.
+_META_PREFIXES: list[str] = [
+    "제목 라벨:",
+    "표현 의도:",
+    "슬라이드 주인공",
+    "가장 큰 시각적 비중",
+    "시각적으로",
+    "간결하게 제기",
+    "개별 증거로 제시",
+    "계층적으로 시각화",
+]
+
+# Expression-hint fragments. strip_meta_lines drops any line that contains
+# one of these anywhere in its stripped form.
+_META_INLINE_FRAGMENTS: tuple[str, ...] = (
+    "현상-문제 인과관계",
+    "상위-하위 포함 관계",
+    "독립적 나열",
+)
+
+
+def strip_meta_lines(text: str) -> str:
+    """Drop Kei prompt meta/instruction lines before verification.
+
+    A line is dropped if its stripped form starts with any prefix in
+    ``_META_PREFIXES`` (e.g. ``제목 라벨:``) or contains any inline
+    expression-hint fragment in ``_META_INLINE_FRAGMENTS`` (e.g.
+    ``현상-문제 인과관계``). These are prompt directives, not slide
+    content; they must not enter sentence/keyword extraction for the
+    B-2 reverse path. Deterministic, pure: no I/O, no LLM, no regex
+    against runtime data.
+    """
+    filtered: list[str] = []
+    for line in text.split("\n"):
+        stripped = line.strip()
+        if any(stripped.startswith(prefix) for prefix in _META_PREFIXES):
+            continue
+        if any(fragment in stripped for fragment in _META_INLINE_FRAGMENTS):
+            continue
+        filtered.append(line)
+    return "\n".join(filtered)
+
+
+# Leading list marker: one or more of "-", "•", "◦", "·" or digits, then a
+# required "." or ")", then optional whitespace. A marker without the
+# trailing "." / ")" (for example a bare "- ") does not match.
+_BULLET_MARKER_PATTERN = re.compile(r"^[\-•◦·\d]+[.)]\s*")
+# Sentence boundary: a whitespace run that directly follows a period.
+_SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=\.)\s+")
+# split_into_sentences discards parts shorter than this many characters.
+_MIN_SENTENCE_LEN = 5
+
+
+def split_into_sentences(text: str) -> list[str]:
+    """Split text into sentences for deterministic comparison.
+
+    Pipeline (order matters): drop Kei meta/instruction lines via
+    ``strip_meta_lines``, split on newline, skip empties and ``#``-led
+    header lines, strip any leading bullet/numeric marker matching
+    ``_BULLET_MARKER_PATTERN``, then split on inter-sentence whitespace
+    following a period. Parts shorter than ``_MIN_SENTENCE_LEN`` are
+    dropped so single-token noise (e.g. residual punctuation) cannot
+    enter the preservation/invented-text checks.
+    """
+    text = strip_meta_lines(text)
+    sentences: list[str] = []
+    for line in text.split("\n"):
+        line = line.strip()
+        if not line or line.startswith("#"):
+            continue
+        line = _BULLET_MARKER_PATTERN.sub("", line).strip()
+        if not line:
+            continue
+        for part in _SENTENCE_SPLIT_PATTERN.split(line):
+            part = part.strip()
+            if len(part) >= _MIN_SENTENCE_LEN:
+                sentences.append(part)
+    return sentences
+
+
+# Minimum fraction of a sentence's keywords that must occur in the combined
+# HTML text for _sentence_matches_html to accept the sentence.
+_SENTENCE_KEYWORD_MATCH_THRESHOLD = 0.6
+# Alternative gate: minimum SequenceMatcher ratio between the sentence and
+# any single HTML fragment.
+_SENTENCE_SEQUENCE_MATCH_THRESHOLD = 0.65
+
+
+def _sentence_matches_html(
+    sentence: str,
+    html_combined: str,
+    html_texts: list[str],
+) -> bool:
+    """Return True if ``sentence`` is preserved in the HTML side.
+
+    Two-axis match: a keyword-ratio gate against ``html_combined`` (the
+    pre-normalized join of all visible HTML text fragments) and a
+    SequenceMatcher fallback against each individual normalized HTML
+    fragment. A sentence whose keyword set is empty after normalization
+    is treated as preserved (no falsifiable signal). Pure helper used
+    by ``verify_text_preservation`` (u8); thresholds are lifted to
+    named module constants so the surface is auditable.
+    """
+    norm_orig = normalize_for_comparison(sentence)
+    keywords = extract_keywords(norm_orig)
+    if not keywords:
+        return True
+    kw_found = sum(1 for kw in keywords if kw in html_combined)
+    kw_ratio = kw_found / len(keywords)
+    best_ratio = 0.0
+    for html_text in html_texts:
+        norm_html = normalize_for_comparison(html_text)
+        ratio = SequenceMatcher(None, norm_orig, norm_html).ratio()
+        if ratio > best_ratio:
+            best_ratio = ratio
+    return (
+        kw_ratio >= _SENTENCE_KEYWORD_MATCH_THRESHOLD
+        or best_ratio >= _SENTENCE_SEQUENCE_MATCH_THRESHOLD
+    )
+
+
+# Default minimum matched-sentence fraction for verify_text_preservation
+# to pass.
+_TEXT_PRESERVATION_DEFAULT_THRESHOLD = 0.70
+# Maximum number of missing sentences listed in the error report.
+_MISSING_SENTENCE_REPORT_LIMIT = 5
+# Missing sentences longer than this many characters are cut to this
+# length and suffixed with "..." in the error report.
+_MISSING_SENTENCE_TRUNCATE_LEN = 60
+
+
+def verify_text_preservation(
+    original_mdx: str,
+    generated_html: str,
+    area_name: str,
+    threshold: float = _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
+) -> VerificationResult:
+    """Verify the original MDX text is preserved in the generated HTML.
+
+    Splits MDX via u6, pre-normalizes joined HTML via u2+u3, then per
+    sentence delegates to u7. Empty sentence list -> passed True,
+    score 1.0. Missing sentences are capped at the report limit and
+    each truncated to the truncate length constant.
+    """
+    original_sentences = split_into_sentences(original_mdx)
+    if not original_sentences:
+        return VerificationResult(passed=True, area_name=area_name,
+                                  checks={"text_preservation": True}, score=1.0)
+    html_texts = extract_text_from_html(generated_html)
+    html_combined = normalize_for_comparison(" ".join(html_texts))
+    matched = 0
+    missing: list[str] = []
+    for sentence in original_sentences:
+        if _sentence_matches_html(sentence, html_combined, html_texts):
+            matched += 1
+        else:
+            missing.append(sentence)
+    score = matched / len(original_sentences)
+    passed = score >= threshold
+    errors: list[str] = []
+    if not passed:
+        errors = [f"누락 문장 ({len(missing)}/{len(original_sentences)}):"]
+        for s in missing[:_MISSING_SENTENCE_REPORT_LIMIT]:
+            errors.append(
+                f"  - \"{s[:_MISSING_SENTENCE_TRUNCATE_LEN]}...\""
+                if len(s) > _MISSING_SENTENCE_TRUNCATE_LEN else f"  - \"{s}\""
+            )
+    warnings = ([f"보존율: {score:.0%} ({matched}/{len(original_sentences)} 문장)"]
+                if score < 1.0 else [])
+    return VerificationResult(
+        passed=passed, area_name=area_name,
+        checks={"text_preservation": passed}, score=score,
+        errors=errors, warnings=warnings,
+    )
+
+
+# Default minimum fragment length (characters) that detect_invented_text
+# examines; shorter fragments are skipped.
+_INVENTED_TEXT_MIN_LENGTH = 15
+# Fragments exactly equal to one of these labels are never flagged.
+_INVENTED_TEXT_ALLOWED_LABELS: frozenset[str] = frozenset({
+    "용어 정의", "핵심 메시지", "상세 비교",
+})
+# Matches a fragment made up solely of digits, whitespace and the
+# characters . , % p x # r g b ( ) — treated as CSS/numeric noise.
+_INVENTED_TEXT_CSS_NUMBER_PATTERN = re.compile(r"^[\d\s.,%px#rgb()]+$")
+# A fragment is flagged when the fraction of its keywords found in the
+# normalized MDX is below this value.
+_INVENTED_TEXT_KEYWORD_THRESHOLD = 0.4
+# Flagged fragments are cut to this many characters before being returned.
+_INVENTED_TEXT_TRUNCATE_LEN = 80
+
+
+def detect_invented_text(
+    original_mdx: str,
+    generated_html: str,
+    min_length: int = _INVENTED_TEXT_MIN_LENGTH,
+) -> list[str]:
+    """Detect HTML text fragments that are not anchored in the source MDX.
+
+    Phase Z port of the H3 hallucination guard (Phase Q reference:
+    ``src/content_verifier.py:276-315``). Pipeline (order matters):
+    drop short fragments (< ``min_length``), drop structural label
+    exceptions in ``_INVENTED_TEXT_ALLOWED_LABELS``, drop CSS/numeric
+    noise matching ``_INVENTED_TEXT_CSS_NUMBER_PATTERN``, then per
+    surviving fragment compute keyword ratio (via u4 ``extract_keywords``
+    on the normalized fragment, checked against the normalized MDX). A
+    fragment is flagged when ``kw_ratio < _INVENTED_TEXT_KEYWORD_THRESHOLD``;
+    flagged values are truncated to ``_INVENTED_TEXT_TRUNCATE_LEN`` chars
+    before being returned. Empty keyword sets short-circuit as
+    non-falsifiable (matches Phase Q parity). Deterministic, pure.
+    """
+    html_texts = extract_text_from_html(generated_html)
+    norm_mdx = normalize_for_comparison(original_mdx)
+    invented: list[str] = []
+    for text in html_texts:
+        text = text.strip()
+        if len(text) < min_length:
+            continue
+        if text in _INVENTED_TEXT_ALLOWED_LABELS:
+            continue
+        if _INVENTED_TEXT_CSS_NUMBER_PATTERN.match(text):
+            continue
+        norm_text = normalize_for_comparison(text)
+        keywords = extract_keywords(norm_text)
+        if not keywords:
+            continue
+        kw_found = sum(1 for kw in keywords if kw in norm_mdx)
+        kw_ratio = kw_found / len(keywords)
+        if kw_ratio < _INVENTED_TEXT_KEYWORD_THRESHOLD:
+            invented.append(text[:_INVENTED_TEXT_TRUNCATE_LEN])
+    return invented