Files
C.E.L_Slide_test2/src/phase_z2_verification_utils.py
kyeongmin 23ba8b68cd feat(IMP-16): U1 H3 verification utility port + U2 wiring design
U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module
src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility
surface — VerificationResult, extract_text_from_html, normalize_for_comparison,
extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation,
detect_invented_text. 10 unit-test modules under tests/phase_z2/test_pz2_vu_*.py (56 tests).

u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step
1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern
reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands
in this commit — U2 stays blocked until IMP-07 reverse path is implemented +
verified + runtime-hit.

Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS /
generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area /
verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract
upheld (utility is deterministic).

Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 04:42:35 +09:00

336 lines
12 KiB
Python

"""Phase Z2 deterministic verification utilities (IMP-16-U1 port).
Ports the H3 deterministic subset of src/content_verifier.py into a
Phase Z-owned module so the Phase Z pipeline never imports the Phase Q
reference-only module (which co-hosts H4/H5 Kei/AI assets).
Scope: deterministic, pure, no I/O, no LLM call, no httpx/SSE.
Wiring into Step 1/2/14/21/22 is gated behind IMP-07 (see
docs/architecture/IMP-16-U2-WIRING-DESIGN.md when u11 lands).
"""
from __future__ import annotations
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from html.parser import HTMLParser
@dataclass
class VerificationResult:
    """Outcome of one deterministic verification check for one area.

    The field set follows the Phase Q ``VerificationResult`` shape so code
    ported from that surface keeps working with plain attribute access.
    No Phase Q area defaults are baked in: only ``passed`` and
    ``area_name`` are required, everything else starts empty / zero.
    """

    # Overall verdict of the check.
    passed: bool
    # Caller-supplied label of the area that was verified.
    area_name: str
    # Per-check verdicts keyed by check name; a fresh dict per instance.
    checks: dict[str, bool] = field(default_factory=dict)
    # Numeric score of the check; 0.0 unless the producer sets it.
    score: float = 0.0
    # Human-readable failure messages; a fresh list per instance.
    errors: list[str] = field(default_factory=list)
    # Human-readable non-fatal notes; a fresh list per instance.
    warnings: list[str] = field(default_factory=list)
class _TextExtractor(HTMLParser):
    """Collect visible text chunks, ignoring ``<style>`` / ``<script>`` bodies.

    Pure stdlib (``html.parser``). Each data chunk is stripped of
    surrounding whitespace; chunks that are empty after stripping are
    discarded. Surviving chunks are appended to ``texts`` in document
    order.
    """

    def __init__(self) -> None:
        super().__init__()
        # Visible text fragments in the order they were encountered.
        self.texts: list[str] = []
        # True while the parser is inside a <style> or <script> element.
        self._skip = False

    def handle_starttag(self, tag, attrs):
        """Start suppressing data when a style/script element opens."""
        if tag in ("style", "script"):
            self._skip = True

    def handle_endtag(self, tag):
        """Resume collecting data when a style/script element closes."""
        if tag in ("style", "script"):
            self._skip = False

    def handle_data(self, data):
        """Record a stripped, non-empty data chunk unless suppressed."""
        if self._skip:
            return
        stripped = data.strip()
        if stripped:
            self.texts.append(stripped)


def extract_text_from_html(html: str) -> list[str]:
    """Return the visible text fragments of ``html`` in document order.

    Deterministic and pure: no I/O, no LLM, no network.

    Args:
        html: HTML markup to scan.

    Returns:
        Stripped, non-empty text fragments; text inside ``<style>`` and
        ``<script>`` elements is excluded.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() may hold back trailing text (for example text ending in an
    # unterminated "&", such as "R&D" after the last tag) in case more
    # input follows. close() marks end-of-input so that text is flushed
    # to handle_data instead of being silently dropped.
    parser.close()
    return parser.texts
_PARTICLES: list[str] = sorted(
["에서", "으로", "부터", "까지", "에게", "한테",
"", "", "", "", "", "", "", "",
"", "", "", "", "", ""],
key=len, reverse=True,
)
_ENDING_NORMALIZE: dict[str, str] = {
"있음": "있다", "": "된다", "": "한다", "": "이다",
"없음": "없다", "았음": "았다", "었음": "었다",
}
def normalize_for_comparison(text: str) -> str:
"""Normalize text for deterministic comparison (Phase Z H3 port).
Steps (order matters): collapse whitespace, strip bullet markers,
decode the small HTML-entity set used by the reverse path, then
fold a single trailing 개조식 ending to its 서술형 form.
"""
text = re.sub(r"\s+", " ", text).strip()
text = re.sub(r"[•◦·\-▪▸►]", "", text).strip()
text = text.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
text = text.replace("&nbsp;", " ").replace("&#39;", "'").replace("&quot;", '"')
for gaejo, seosul in _ENDING_NORMALIZE.items():
if text.endswith(gaejo):
text = text[: -len(gaejo)] + seosul
break
return text
def extract_keywords(text: str) -> list[str]:
"""Extract length>=3 tokens, then strip a trailing Korean particle.
Deterministic, pure: tokenises on the Phase Z H3 character class
``[가-힣a-zA-Z0-9()]+``, drops tokens shorter than 3 characters,
and folds a single longest-match trailing particle from
``_PARTICLES`` when the remaining stem is still length >= 2.
"""
words = re.findall(r"[가-힣a-zA-Z0-9()]+", text)
keywords: list[str] = []
for w in words:
if len(w) < 3:
continue
for p in _PARTICLES:
if w.endswith(p) and len(w) - len(p) >= 2:
w = w[: -len(p)]
break
if len(w) >= 2:
keywords.append(w)
return keywords
# A line whose stripped form starts with one of these is a prompt directive.
_META_PREFIXES: list[str] = [
    "제목 라벨:",
    "표현 의도:",
    "슬라이드 주인공",
    "가장 큰 시각적 비중",
    "시각적으로",
    "간결하게 제기",
    "개별 증거로 제시",
    "계층적으로 시각화",
]

# A line containing one of these anywhere is an expression-hint directive.
_META_INLINE_FRAGMENTS: tuple[str, ...] = (
    "현상-문제 인과관계",
    "상위-하위 포함 관계",
    "독립적 나열",
)


def strip_meta_lines(text: str) -> str:
    """Remove prompt meta/instruction lines from ``text``.

    The text is split on ``"\\n"``. A line is discarded when, after
    stripping surrounding whitespace, it starts with any entry of
    ``_META_PREFIXES`` or contains any entry of
    ``_META_INLINE_FRAGMENTS``. All other lines are kept exactly as they
    were (including their own leading/trailing whitespace) and re-joined
    with ``"\\n"``.

    Args:
        text: Multi-line source text.

    Returns:
        The text without the directive lines.
    """
    prefixes = tuple(_META_PREFIXES)

    def _is_meta(raw_line: str) -> bool:
        candidate = raw_line.strip()
        if candidate.startswith(prefixes):
            return True
        return any(hint in candidate for hint in _META_INLINE_FRAGMENTS)

    return "\n".join(ln for ln in text.split("\n") if not _is_meta(ln))
# Leading run of bullet characters and/or digits that is followed by "." or
# ")" and optional whitespace, e.g. "1. " or "2) ".
_BULLET_MARKER_PATTERN = re.compile(r"^[\-•◦·\d]+[.)]\s*")
# Whitespace that directly follows a period marks a sentence boundary.
_SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=\.)\s+")
# Parts shorter than this many characters are discarded.
_MIN_SENTENCE_LEN = 5


def split_into_sentences(text: str) -> list[str]:
    """Break ``text`` into sentence-like parts for comparison.

    Processing order:

    1. Directive lines are removed via ``strip_meta_lines``.
    2. The remainder is split on newlines; each line is trimmed.
    3. Empty lines and lines starting with ``#`` are skipped.
    4. A leading marker matching ``_BULLET_MARKER_PATTERN`` is removed.
    5. The line is split at whitespace that follows a period.
    6. Parts shorter than ``_MIN_SENTENCE_LEN`` characters are dropped.

    Args:
        text: Multi-line source text.

    Returns:
        Trimmed sentence parts in order of appearance.
    """
    collected: list[str] = []
    for raw in strip_meta_lines(text).split("\n"):
        candidate = raw.strip()
        if candidate == "" or candidate.startswith("#"):
            continue
        body = _BULLET_MARKER_PATTERN.sub("", candidate).strip()
        if not body:
            continue
        pieces = (chunk.strip() for chunk in _SENTENCE_SPLIT_PATTERN.split(body))
        collected.extend(
            piece for piece in pieces if len(piece) >= _MIN_SENTENCE_LEN
        )
    return collected
# Minimum share of a sentence's keywords that must occur in the HTML text.
_SENTENCE_KEYWORD_MATCH_THRESHOLD = 0.6
# Minimum SequenceMatcher ratio against a single HTML fragment.
_SENTENCE_SEQUENCE_MATCH_THRESHOLD = 0.65


def _sentence_matches_html(
    sentence: str,
    html_combined: str,
    html_texts: list[str],
) -> bool:
    """Return True if ``sentence`` counts as preserved on the HTML side.

    Two independent gates; passing either one is enough:

    * Keyword gate: the share of the sentence's keywords found as
      substrings of ``html_combined`` is at least
      ``_SENTENCE_KEYWORD_MATCH_THRESHOLD``.
    * Sequence gate: ``SequenceMatcher`` similarity between the
      normalized sentence and any single normalized HTML fragment is at
      least ``_SENTENCE_SEQUENCE_MATCH_THRESHOLD``.

    A sentence that yields no keywords after normalization is treated as
    preserved, because there is nothing to check it against.

    Args:
        sentence: One source sentence.
        html_combined: Already-normalized join of all HTML fragments.
        html_texts: The individual, un-normalized HTML fragments.

    Returns:
        True when the sentence is considered present in the HTML.
    """
    norm_orig = normalize_for_comparison(sentence)
    keywords = extract_keywords(norm_orig)
    if not keywords:
        return True

    hits = sum(1 for kw in keywords if kw in html_combined)
    if hits / len(keywords) >= _SENTENCE_KEYWORD_MATCH_THRESHOLD:
        # The keyword gate already decides the outcome; skip the much
        # more expensive per-fragment sequence comparison.
        return True

    # any() stops at the first fragment that clears the threshold, which
    # gives the same answer as comparing the best ratio to the (positive)
    # threshold.
    return any(
        SequenceMatcher(
            None, norm_orig, normalize_for_comparison(fragment)
        ).ratio() >= _SENTENCE_SEQUENCE_MATCH_THRESHOLD
        for fragment in html_texts
    )
# Default minimum share of source sentences that must be preserved.
_TEXT_PRESERVATION_DEFAULT_THRESHOLD = 0.70
# At most this many missing sentences are listed in the error report.
_MISSING_SENTENCE_REPORT_LIMIT = 5
# Listed missing sentences longer than this are cut and suffixed with "...".
_MISSING_SENTENCE_TRUNCATE_LEN = 60


def verify_text_preservation(
    original_mdx: str,
    generated_html: str,
    area_name: str,
    threshold: float = _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
) -> VerificationResult:
    """Check how much of the source MDX text survives in the generated HTML.

    The MDX is split with ``split_into_sentences``; the HTML's visible
    text is extracted, joined and normalized once; each sentence is then
    tested with ``_sentence_matches_html``. The score is the fraction of
    sentences that matched.

    Args:
        original_mdx: Source MDX text.
        generated_html: HTML produced from that source.
        area_name: Label copied into the result.
        threshold: Minimum score required to pass.

    Returns:
        A ``VerificationResult`` whose ``checks`` holds the single key
        ``"text_preservation"``. With no sentences to check the result
        passes with score 1.0. On failure ``errors`` holds a header line
        plus up to ``_MISSING_SENTENCE_REPORT_LIMIT`` missing sentences;
        whenever the score is below 1.0 ``warnings`` holds one summary
        line.
    """
    original_sentences = split_into_sentences(original_mdx)
    total = len(original_sentences)
    if total == 0:
        return VerificationResult(
            passed=True,
            area_name=area_name,
            checks={"text_preservation": True},
            score=1.0,
        )

    html_texts = extract_text_from_html(generated_html)
    html_combined = normalize_for_comparison(" ".join(html_texts))

    missing = [
        sentence
        for sentence in original_sentences
        if not _sentence_matches_html(sentence, html_combined, html_texts)
    ]
    matched = total - len(missing)
    score = matched / total
    passed = score >= threshold

    errors: list[str] = []
    if not passed:
        errors.append(f"누락 문장 ({len(missing)}/{total}):")
        for lost in missing[:_MISSING_SENTENCE_REPORT_LIMIT]:
            if len(lost) > _MISSING_SENTENCE_TRUNCATE_LEN:
                errors.append(f' - "{lost[:_MISSING_SENTENCE_TRUNCATE_LEN]}..."')
            else:
                errors.append(f' - "{lost}"')

    warnings: list[str] = []
    if score < 1.0:
        warnings.append(f"보존율: {score:.0%} ({matched}/{total} 문장)")

    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"text_preservation": passed},
        score=score,
        errors=errors,
        warnings=warnings,
    )
# Default: HTML fragments shorter than this are never inspected.
_INVENTED_TEXT_MIN_LENGTH = 15
# Structural labels that may appear in HTML without a source counterpart.
_INVENTED_TEXT_ALLOWED_LABELS: frozenset[str] = frozenset({
    "용어 정의", "핵심 메시지", "상세 비교",
})
# Fragments made up only of these characters are treated as CSS/number noise.
_INVENTED_TEXT_CSS_NUMBER_PATTERN = re.compile(r"^[\d\s.,%px#rgb()]+$")
# A fragment is flagged when fewer than this share of its keywords is in the MDX.
_INVENTED_TEXT_KEYWORD_THRESHOLD = 0.4
# Flagged fragments are cut to this many characters.
_INVENTED_TEXT_TRUNCATE_LEN = 80


def detect_invented_text(
    original_mdx: str,
    generated_html: str,
    min_length: int = _INVENTED_TEXT_MIN_LENGTH,
) -> list[str]:
    """List HTML text fragments that have no anchor in the source MDX.

    Each visible HTML fragment is filtered, in this order:

    1. shorter than ``min_length`` -> ignored;
    2. exactly one of ``_INVENTED_TEXT_ALLOWED_LABELS`` -> ignored
       (with the default ``min_length`` these labels are already removed
       by step 1; the check matters for smaller ``min_length`` values);
    3. matches ``_INVENTED_TEXT_CSS_NUMBER_PATTERN`` -> ignored;
    4. yields no keywords after normalization -> ignored.

    For the rest, the share of keywords that occur as substrings of the
    normalized MDX is computed; a share below
    ``_INVENTED_TEXT_KEYWORD_THRESHOLD`` flags the fragment.

    Args:
        original_mdx: Source MDX text.
        generated_html: HTML produced from that source.
        min_length: Minimum fragment length to inspect.

    Returns:
        Flagged fragments in document order, each cut to at most
        ``_INVENTED_TEXT_TRUNCATE_LEN`` characters.
    """
    norm_mdx = normalize_for_comparison(original_mdx)
    flagged: list[str] = []
    for fragment in extract_text_from_html(generated_html):
        fragment = fragment.strip()
        is_ignorable = (
            len(fragment) < min_length
            or fragment in _INVENTED_TEXT_ALLOWED_LABELS
            or _INVENTED_TEXT_CSS_NUMBER_PATTERN.match(fragment) is not None
        )
        if is_ignorable:
            continue
        keywords = extract_keywords(normalize_for_comparison(fragment))
        if not keywords:
            continue
        anchored = sum(1 for kw in keywords if kw in norm_mdx)
        if anchored / len(keywords) < _INVENTED_TEXT_KEYWORD_THRESHOLD:
            flagged.append(fragment[:_INVENTED_TEXT_TRUNCATE_LEN])
    return flagged