feat(IMP-16): U1 H3 verification utility port + U2 wiring design
U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility surface — VerificationResult, extract_text_from_html, normalize_for_comparison, extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation, detect_invented_text. 10 unit tests under tests/phase_z2/test_pz2_vu_*.py (56 tests). u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step 1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands in this commit — U2 stays blocked until IMP-07 reverse path is implemented + verified + runtime-hit. Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS / generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area / verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract upheld (utility is deterministic). Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
335
src/phase_z2_verification_utils.py
Normal file
335
src/phase_z2_verification_utils.py
Normal file
@@ -0,0 +1,335 @@
|
||||
"""Phase Z2 deterministic verification utilities (IMP-16-U1 port).
|
||||
|
||||
Ports the H3 deterministic subset of src/content_verifier.py into a
|
||||
Phase Z-owned module so the Phase Z pipeline never imports the Phase Q
|
||||
reference-only module (which co-hosts H4/H5 Kei/AI assets).
|
||||
|
||||
Scope: deterministic, pure, no I/O, no LLM call, no httpx/SSE.
|
||||
Wiring into Step 1/2/14/21/22 is gated behind IMP-07 (see
|
||||
docs/architecture/IMP-16-U2-WIRING-DESIGN.md when u11 lands).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from difflib import SequenceMatcher
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
@dataclass
class VerificationResult:
    """Outcome of one deterministic verification check.

    The field layout follows the Phase Q ``VerificationResult`` so callers
    ported from that surface keep their attribute access. The values are
    owned by Phase Z: no Phase Q area defaults are baked in.
    """

    # Overall verdict of the check.
    passed: bool
    # Caller-supplied label identifying what was verified.
    area_name: str
    # Named sub-check -> verdict. A fresh dict per instance (default_factory),
    # so instances never share mutable state.
    checks: dict[str, bool] = field(default_factory=dict)
    # Numeric score attached to the verdict; defaults to 0.0.
    score: float = 0.0
    # Human-readable failure details; fresh list per instance.
    errors: list[str] = field(default_factory=list)
    # Human-readable non-fatal notes; fresh list per instance.
    warnings: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class _TextExtractor(HTMLParser):
    """Collect visible text, ignoring the bodies of <style> and <script>.

    Pure stdlib (``html.parser``). Whitespace-only data chunks are dropped
    and surviving chunks are stripped before being stored, so every entry
    of ``texts`` is a non-empty, trimmed fragment in document order.
    """

    def __init__(self) -> None:
        super().__init__()
        # Visible text fragments, in the order the parser reports them.
        self.texts: list[str] = []
        # True while the parser is inside a <style> or <script> element.
        self._skip = False

    def handle_starttag(self, tag, attrs):
        """Start suppressing text when a <style>/<script> element opens."""
        if tag in ("style", "script"):
            self._skip = True

    def handle_endtag(self, tag):
        """Resume collecting text when a <style>/<script> element closes."""
        if tag in ("style", "script"):
            self._skip = False

    def handle_data(self, data):
        """Store the trimmed chunk unless it is suppressed or blank."""
        if self._skip:
            return
        stripped = data.strip()
        if stripped:
            self.texts.append(stripped)


def extract_text_from_html(html: str) -> list[str]:
    """Return the ordered list of visible text fragments in ``html``.

    Deterministic and pure: no I/O, no LLM, no network.

    Args:
        html: HTML markup to scan.

    Returns:
        Trimmed, non-empty text fragments in document order, excluding the
        contents of <style> and <script> elements.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() alone may hold back trailing data (for example text after the
    # last tag that contains an unterminated "&"); close() forces the parser
    # to process everything it is still buffering.
    parser.close()
    return parser.texts
|
||||
|
||||
|
||||
# Trailing Korean particles that extract_keywords may strip. Sorted
# longest-first so that a two-character particle such as "에서" is tried
# before its one-character suffix "에"; sorted() is stable, so particles of
# equal length keep the order written here.
_PARTICLES: list[str] = sorted(
    [
        "에서", "으로", "부터", "까지", "에게", "한테",
        "은", "는", "이", "가", "을", "를", "에", "의",
        "로", "와", "과", "도", "만", "께",
    ],
    key=len,
    reverse=True,
)

# Trailing 개조식 (noun-style) ending -> 서술형 (declarative) ending.
# Checked in insertion order; the first matching suffix wins.
_ENDING_NORMALIZE: dict[str, str] = {
    "있음": "있다", "됨": "된다", "함": "한다", "임": "이다",
    "없음": "없다", "았음": "았다", "었음": "었다",
}

# HTML entity -> literal character, applied in this order ("&amp;" first,
# so "&amp;lt;" ends up as "<").
# NOTE(review): both "&#39;" and "&apos;" are accepted for the apostrophe;
# confirm which spelling the reverse path actually emits.
_HTML_ENTITY_REPLACEMENTS: tuple[tuple[str, str], ...] = (
    ("&amp;", "&"),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&nbsp;", " "),
    ("&#39;", "'"),
    ("&apos;", "'"),
    ("&quot;", '"'),
)


def normalize_for_comparison(text: str) -> str:
    """Normalize ``text`` for deterministic comparison (Phase Z H3 port).

    Steps, in order:
      1. Collapse every whitespace run to one space and trim.
      2. Remove bullet glyphs (``• ◦ · - ▪ ▸ ►``) wherever they occur and
         trim again.
      3. Decode the small HTML-entity set in ``_HTML_ENTITY_REPLACEMENTS``.
      4. Replace a single trailing 개조식 ending with its 서술형 form using
         the first matching entry of ``_ENDING_NORMALIZE``.

    Args:
        text: Raw text from either side of the comparison.

    Returns:
        The normalized string; ``""`` for empty input.
    """
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[•◦·\-▪▸►]", "", text).strip()
    for entity, literal in _HTML_ENTITY_REPLACEMENTS:
        text = text.replace(entity, literal)
    for gaejo, seosul in _ENDING_NORMALIZE.items():
        if text.endswith(gaejo):
            # Only one ending is rewritten, hence the break.
            text = text[: -len(gaejo)] + seosul
            break
    return text


def extract_keywords(text: str) -> list[str]:
    """Extract tokens of length >= 3, each with one trailing particle removed.

    Tokens are maximal runs of ``[가-힣a-zA-Z0-9()]``. Tokens shorter than
    three characters are discarded. For each remaining token the first
    (i.e. longest) particle in ``_PARTICLES`` that is a suffix is removed,
    but only when at least two characters of stem remain.

    Args:
        text: Text to tokenise (callers pass normalized text).

    Returns:
        Keywords in order of appearance; duplicates are kept.
    """
    keywords: list[str] = []
    for token in re.findall(r"[가-힣a-zA-Z0-9()]+", text):
        if len(token) < 3:
            continue
        for particle in _PARTICLES:
            if token.endswith(particle) and len(token) - len(particle) >= 2:
                token = token[: -len(particle)]
                break
        # No further length check is needed: an unstripped token has >= 3
        # characters and a stripped one keeps a stem of >= 2.
        keywords.append(token)
    return keywords
|
||||
|
||||
|
||||
# A line whose stripped form starts with one of these is a prompt directive.
_META_PREFIXES: list[str] = [
    "제목 라벨:",
    "표현 의도:",
    "슬라이드 주인공",
    "가장 큰 시각적 비중",
    "시각적으로",
    "간결하게 제기",
    "개별 증거로 제시",
    "계층적으로 시각화",
]

# A line containing one of these anywhere is an expression hint.
_META_INLINE_FRAGMENTS: tuple[str, ...] = (
    "현상-문제 인과관계",
    "상위-하위 포함 관계",
    "독립적 나열",
)


def strip_meta_lines(text: str) -> str:
    """Remove prompt meta/instruction lines from ``text``.

    A line is dropped when its stripped form starts with any entry of
    ``_META_PREFIXES`` or contains any entry of ``_META_INLINE_FRAGMENTS``.
    Such lines are prompt directives rather than slide content and must not
    reach sentence/keyword extraction. Deterministic and pure.

    Args:
        text: Newline-separated source text.

    Returns:
        The surviving lines, unmodified (original indentation kept), joined
        with ``"\\n"``.
    """
    prefixes = tuple(_META_PREFIXES)  # str.startswith needs a tuple
    kept: list[str] = []
    for line in text.split("\n"):
        candidate = line.strip()
        if candidate.startswith(prefixes):
            continue
        if any(fragment in candidate for fragment in _META_INLINE_FRAGMENTS):
            continue
        kept.append(line)
    return "\n".join(kept)
|
||||
|
||||
|
||||
# Leading run of bullet glyphs/digits that is followed by "." or ")".
# NOTE(review): a bare "- " or "• " bullet has no "." / ")" and is therefore
# left in place, and a leading decimal such as "3.5" loses its "3." --
# confirm both are intended.
_BULLET_MARKER_PATTERN = re.compile(r"^[\-•◦·\d]+[.)]\s*")
# Whitespace that directly follows a period separates two sentences.
_SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=\.)\s+")
# Parts shorter than this are treated as noise and discarded.
_MIN_SENTENCE_LEN = 5


def split_into_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences for deterministic comparison.

    Pipeline, in order: drop meta/instruction lines via
    ``strip_meta_lines``; walk the remaining lines; skip blank lines and
    lines starting with ``#``; remove a leading marker matching
    ``_BULLET_MARKER_PATTERN``; split the line on whitespace that follows a
    period; keep parts of at least ``_MIN_SENTENCE_LEN`` characters.

    Args:
        text: Newline-separated source text.

    Returns:
        Trimmed sentences in order of appearance.
    """
    sentences: list[str] = []
    for raw_line in strip_meta_lines(text).split("\n"):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        line = _BULLET_MARKER_PATTERN.sub("", line).strip()
        if not line:
            # The line consisted of nothing but a marker.
            continue
        for piece in _SENTENCE_SPLIT_PATTERN.split(line):
            piece = piece.strip()
            if len(piece) >= _MIN_SENTENCE_LEN:
                sentences.append(piece)
    return sentences
|
||||
|
||||
|
||||
# Minimum share of a sentence's keywords that must occur in the HTML text.
_SENTENCE_KEYWORD_MATCH_THRESHOLD = 0.6
# Minimum SequenceMatcher ratio against a single HTML fragment.
_SENTENCE_SEQUENCE_MATCH_THRESHOLD = 0.65


def _sentence_matches_html(
    sentence: str,
    html_combined: str,
    html_texts: list[str],
) -> bool:
    """Return True if ``sentence`` is preserved on the HTML side.

    Two independent gates; passing either one is enough:
      * keyword gate -- the share of the sentence's keywords found as
        substrings of ``html_combined`` reaches
        ``_SENTENCE_KEYWORD_MATCH_THRESHOLD``;
      * sequence gate -- the ``SequenceMatcher`` ratio between the
        normalized sentence and some normalized fragment of ``html_texts``
        reaches ``_SENTENCE_SEQUENCE_MATCH_THRESHOLD``.

    A sentence that yields no keywords after normalization is treated as
    preserved, since there is nothing falsifiable to check.

    Args:
        sentence: One source sentence.
        html_combined: Already-normalized join of all visible HTML text.
        html_texts: The individual (un-normalized) HTML text fragments.

    Returns:
        Whether the sentence counts as preserved.
    """
    norm_orig = normalize_for_comparison(sentence)
    keywords = extract_keywords(norm_orig)
    if not keywords:
        return True

    kw_found = sum(1 for kw in keywords if kw in html_combined)
    if kw_found / len(keywords) >= _SENTENCE_KEYWORD_MATCH_THRESHOLD:
        # The keyword gate already decides the result, so the much more
        # expensive per-fragment sequence comparison is skipped.
        return True

    # any() stops at the first fragment that clears the threshold. The
    # argument order (sentence first) is kept because ratio() is not
    # guaranteed to be symmetric.
    return any(
        SequenceMatcher(
            None, norm_orig, normalize_for_comparison(fragment)
        ).ratio() >= _SENTENCE_SEQUENCE_MATCH_THRESHOLD
        for fragment in html_texts
    )
|
||||
|
||||
|
||||
# Default minimum share of source sentences that must be preserved.
_TEXT_PRESERVATION_DEFAULT_THRESHOLD = 0.70
# At most this many missing sentences are listed in the error report.
_MISSING_SENTENCE_REPORT_LIMIT = 5
# Listed sentences longer than this are cut and suffixed with "...".
_MISSING_SENTENCE_TRUNCATE_LEN = 60


def verify_text_preservation(
    original_mdx: str,
    generated_html: str,
    area_name: str,
    threshold: float = _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
) -> VerificationResult:
    """Verify that the original MDX text is preserved in the generated HTML.

    The MDX is split with ``split_into_sentences``; the visible HTML text is
    extracted, joined and normalized once; each sentence is then checked
    with ``_sentence_matches_html``.

    Args:
        original_mdx: Source text.
        generated_html: HTML to check against the source.
        area_name: Label copied into the result.
        threshold: Minimum matched/total ratio required to pass.

    Returns:
        A ``VerificationResult`` whose ``score`` is the share of matched
        sentences. With no sentences to check the result passes with score
        1.0. On failure ``errors`` holds a header line plus up to
        ``_MISSING_SENTENCE_REPORT_LIMIT`` missing sentences, each cut to
        ``_MISSING_SENTENCE_TRUNCATE_LEN`` characters. ``warnings`` carries
        the preservation rate whenever the score is below 1.0.
    """
    original_sentences = split_into_sentences(original_mdx)
    total = len(original_sentences)
    if total == 0:
        return VerificationResult(
            passed=True,
            area_name=area_name,
            checks={"text_preservation": True},
            score=1.0,
        )

    html_texts = extract_text_from_html(generated_html)
    html_combined = normalize_for_comparison(" ".join(html_texts))

    missing = [
        sentence
        for sentence in original_sentences
        if not _sentence_matches_html(sentence, html_combined, html_texts)
    ]
    matched = total - len(missing)
    score = matched / total
    passed = score >= threshold

    errors: list[str] = []
    if not passed:
        errors.append(f"누락 문장 ({len(missing)}/{total}):")
        for sentence in missing[:_MISSING_SENTENCE_REPORT_LIMIT]:
            if len(sentence) > _MISSING_SENTENCE_TRUNCATE_LEN:
                clipped = sentence[:_MISSING_SENTENCE_TRUNCATE_LEN]
                errors.append(f' - "{clipped}..."')
            else:
                errors.append(f' - "{sentence}"')

    warnings: list[str] = []
    if score < 1.0:
        warnings.append(f"보존율: {score:.0%} ({matched}/{total} 문장)")

    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"text_preservation": passed},
        score=score,
        errors=errors,
        warnings=warnings,
    )
|
||||
|
||||
|
||||
# Fragments shorter than this are never inspected (default for min_length).
_INVENTED_TEXT_MIN_LENGTH = 15
# Structural labels that are allowed even though the source lacks them.
_INVENTED_TEXT_ALLOWED_LABELS: frozenset[str] = frozenset({
    "용어 정의", "핵심 메시지", "상세 비교",
})
# Fragments made only of digits, whitespace and CSS-value characters.
_INVENTED_TEXT_CSS_NUMBER_PATTERN = re.compile(r"^[\d\s.,%px#rgb()]+$")
# A fragment is flagged when fewer than this share of its keywords occur
# in the source.
_INVENTED_TEXT_KEYWORD_THRESHOLD = 0.4
# Flagged fragments are cut to this many characters.
_INVENTED_TEXT_TRUNCATE_LEN = 80


def detect_invented_text(
    original_mdx: str,
    generated_html: str,
    min_length: int = _INVENTED_TEXT_MIN_LENGTH,
) -> list[str]:
    """Detect HTML text fragments that are not anchored in the source MDX.

    For every visible HTML fragment, in order: skip it if it is shorter
    than ``min_length``, is one of ``_INVENTED_TEXT_ALLOWED_LABELS``, or
    matches ``_INVENTED_TEXT_CSS_NUMBER_PATTERN``. Otherwise normalize it,
    extract its keywords and compute the share that occurs as a substring
    of the normalized MDX. Fragments without keywords are skipped as
    non-falsifiable. Deterministic and pure.

    Args:
        original_mdx: Source text.
        generated_html: HTML whose visible text is inspected.
        min_length: Minimum fragment length worth inspecting.

    Returns:
        Flagged fragments in document order, each cut to
        ``_INVENTED_TEXT_TRUNCATE_LEN`` characters.
    """
    norm_mdx = normalize_for_comparison(original_mdx)
    invented: list[str] = []
    for fragment in extract_text_from_html(generated_html):
        fragment = fragment.strip()
        if len(fragment) < min_length:
            continue
        if fragment in _INVENTED_TEXT_ALLOWED_LABELS:
            continue
        if _INVENTED_TEXT_CSS_NUMBER_PATTERN.match(fragment):
            continue

        keywords = extract_keywords(normalize_for_comparison(fragment))
        if not keywords:
            continue

        anchored = sum(1 for kw in keywords if kw in norm_mdx)
        if anchored / len(keywords) < _INVENTED_TEXT_KEYWORD_THRESHOLD:
            invented.append(fragment[:_INVENTED_TEXT_TRUNCATE_LEN])
    return invented
|
||||
Reference in New Issue
Block a user