U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility surface — VerificationResult, extract_text_from_html, normalize_for_comparison, extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation, detect_invented_text. 10 unit tests under tests/phase_z2/test_pz2_vu_*.py (56 tests). u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step 1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands in this commit — U2 stays blocked until IMP-07 reverse path is implemented + verified + runtime-hit. Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS / generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area / verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract upheld (utility is deterministic). Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
336 lines
12 KiB
Python
336 lines
12 KiB
Python
"""Phase Z2 deterministic verification utilities (IMP-16-U1 port).
|
|
|
|
Ports the H3 deterministic subset of src/content_verifier.py into a
|
|
Phase Z-owned module so the Phase Z pipeline never imports the Phase Q
|
|
reference-only module (which co-hosts H4/H5 Kei/AI assets).
|
|
|
|
Scope: deterministic, pure, no I/O, no LLM call, no httpx/SSE.
|
|
Wiring into Step 1/2/14/21/22 is gated behind IMP-07 (see
|
|
docs/architecture/IMP-16-U2-WIRING-DESIGN.md when u11 lands).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from difflib import SequenceMatcher
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
@dataclass
class VerificationResult:
    """Outcome of one deterministic verification pass over a single area.

    The field names and their order follow the Phase Q
    ``VerificationResult`` so callers ported from that surface keep
    their attribute access. The values themselves are Phase Z-owned:
    no Phase Q area defaults are baked in.
    """

    # Overall verdict for the verified area.
    passed: bool
    # Name of the verified area, as supplied by the caller.
    area_name: str
    # Per-check verdicts keyed by check name; a fresh dict per instance.
    checks: dict[str, bool] = field(default_factory=dict)
    # Numeric score of the pass; 0.0 unless the caller sets it.
    score: float = 0.0
    # Failure messages; a fresh list per instance.
    errors: list[str] = field(default_factory=list)
    # Advisory messages; a fresh list per instance.
    warnings: list[str] = field(default_factory=list)
|
|
|
|
|
|
class _TextExtractor(HTMLParser):
    """Collect visible text from HTML, skipping <style>/<script> bodies.

    Pure stdlib (``html.parser``). Each data chunk is stripped before it
    is stored and whitespace-only chunks are dropped, so ``texts`` holds
    one clean fragment per text node, in document order.
    """

    # Elements whose text content is never visible to the reader.
    _SKIPPED_TAGS = ("style", "script")

    def __init__(self) -> None:
        super().__init__()
        # Visible text fragments, in the order the parser reported them.
        self.texts: list[str] = []
        # True while the parser is inside a <style> or <script> element.
        self._skip = False

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Start ignoring data when a skipped element opens."""
        if tag in self._SKIPPED_TAGS:
            self._skip = True

    def handle_endtag(self, tag: str) -> None:
        """Resume collecting data when a skipped element closes."""
        if tag in self._SKIPPED_TAGS:
            self._skip = False

    def handle_data(self, data: str) -> None:
        """Store the stripped chunk unless it is skipped or blank."""
        if self._skip:
            return
        stripped = data.strip()
        if stripped:
            self.texts.append(stripped)


def extract_text_from_html(html: str) -> list[str]:
    """Return the visible text fragments of ``html`` in document order.

    Deterministic and pure: no I/O, no LLM, no network. Phase Z
    verification uses it to compare reverse-path HTML against MDX text
    without importing the Phase Q reference-only module.

    Args:
        html: HTML markup to scan.

    Returns:
        Stripped, non-empty text fragments; ``<style>`` and ``<script>``
        bodies are excluded.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() alone may keep trailing text buffered (for example text that
    # ends in a bare "&", which could be the start of a character
    # reference). close() forces the parser to treat the input as
    # complete so that text is reported instead of silently dropped.
    parser.close()
    return parser.texts
|
|
|
|
|
|
# Korean particles that may trail a token, longest first so a
# two-character particle is tried before any one-character particle.
_PARTICLES: list[str] = sorted(
    ["에서", "으로", "부터", "까지", "에게", "한테",
     "은", "는", "이", "가", "을", "를", "에", "의",
     "로", "와", "과", "도", "만", "께"],
    key=len, reverse=True,
)

# Trailing noun-style endings mapped to their declarative form. Entries
# are tried in insertion order and only the first match is applied.
_ENDING_NORMALIZE: dict[str, str] = {
    "있음": "있다", "됨": "된다", "함": "한다", "임": "이다",
    "없음": "없다", "았음": "았다", "었음": "었다",
}


def normalize_for_comparison(text: str) -> str:
    """Normalize text for deterministic comparison (Phase Z H3 port).

    Steps, in order:

    1. Collapse every whitespace run to one space and trim.
    2. Remove bullet characters (``• ◦ · - ▪ ▸ ►``) wherever they occur
       and trim again.
    3. Decode the small HTML-entity set ``&amp; &lt; &gt; &nbsp; &#39;
       &quot;``.
    4. Replace one trailing ending found in ``_ENDING_NORMALIZE`` with
       its declarative form.

    Args:
        text: Raw text from either the MDX or the HTML side.

    Returns:
        The normalized string.
    """
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[•◦·\-▪▸►]", "", text).strip()
    # Entities are decoded one after another, "&amp;" first, so a doubly
    # escaped "&amp;lt;" ends up as "<".
    for entity, char in (
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&quot;", '"'),
    ):
        text = text.replace(entity, char)
    for gaejo, seosul in _ENDING_NORMALIZE.items():
        if text.endswith(gaejo):
            text = text[: -len(gaejo)] + seosul
            break  # fold at most one ending
    return text
|
|
|
|
|
|
def extract_keywords(text: str) -> list[str]:
    """Return comparison keywords found in ``text``.

    Tokens are runs of Hangul syllables, ASCII letters, digits and
    parentheses (``[가-힣a-zA-Z0-9()]+``). Tokens shorter than three
    characters are ignored. For each remaining token the first particle
    in ``_PARTICLES`` that the token ends with is removed, provided at
    least two characters remain; at most one particle is removed.

    Args:
        text: Text to tokenise, normally already normalized.

    Returns:
        Keyword stems in order of appearance; duplicates are kept.
    """
    stems: list[str] = []
    for token in re.findall(r"[가-힣a-zA-Z0-9()]+", text):
        if len(token) < 3:
            continue
        # First particle that fits and still leaves a stem of length >= 2.
        particle = next(
            (
                candidate
                for candidate in _PARTICLES
                if token.endswith(candidate)
                and len(token) - len(candidate) >= 2
            ),
            None,
        )
        stem = token if particle is None else token[: -len(particle)]
        if len(stem) >= 2:
            stems.append(stem)
    return stems
|
|
|
|
|
|
# A line whose stripped form starts with one of these is a prompt
# directive, not slide content.
_META_PREFIXES: list[str] = [
    "제목 라벨:",
    "표현 의도:",
    "슬라이드 주인공",
    "가장 큰 시각적 비중",
    "시각적으로",
    "간결하게 제기",
    "개별 증거로 제시",
    "계층적으로 시각화",
]

# A line containing one of these expression-hint fragments anywhere is
# also treated as a directive.
_META_INLINE_FRAGMENTS: tuple[str, ...] = (
    "현상-문제 인과관계",
    "상위-하위 포함 관계",
    "독립적 나열",
)


def strip_meta_lines(text: str) -> str:
    """Remove Kei prompt meta/instruction lines from ``text``.

    The text is split on ``"\\n"``. A line is removed when its stripped
    form starts with an entry of ``_META_PREFIXES`` or contains an entry
    of ``_META_INLINE_FRAGMENTS``. Surviving lines are returned exactly
    as they were (original whitespace included), re-joined with
    ``"\\n"``. Deterministic and pure.

    Args:
        text: Multi-line source text.

    Returns:
        The text without the meta lines.
    """
    prefixes = tuple(_META_PREFIXES)

    def _is_meta(line: str) -> bool:
        core = line.strip()
        if core.startswith(prefixes):
            return True
        return any(fragment in core for fragment in _META_INLINE_FRAGMENTS)

    return "\n".join(line for line in text.split("\n") if not _is_meta(line))
|
|
|
|
|
|
# Leading run of hyphen/bullet/digit characters followed by "." or ")"
# and optional whitespace, e.g. "1. " or "2) ".
_BULLET_MARKER_PATTERN = re.compile(r"^[\-•◦·\d]+[.)]\s*")
# Whitespace that directly follows a period.
_SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=\.)\s+")
# Parts shorter than this are discarded as noise.
_MIN_SENTENCE_LEN = 5


def split_into_sentences(text: str) -> list[str]:
    """Break ``text`` into sentences for deterministic comparison.

    Processing order:

    1. Drop Kei meta/instruction lines with ``strip_meta_lines``.
    2. Walk the remaining lines; skip blank lines and lines that start
       with ``#``.
    3. Remove a leading marker matched by ``_BULLET_MARKER_PATTERN``.
    4. Split on whitespace that follows a period.
    5. Keep only parts of at least ``_MIN_SENTENCE_LEN`` characters.

    Args:
        text: Multi-line source text.

    Returns:
        Sentences in order of appearance.
    """
    collected: list[str] = []
    for raw_line in strip_meta_lines(text).split("\n"):
        candidate = raw_line.strip()
        if candidate == "" or candidate.startswith("#"):
            continue
        body = _BULLET_MARKER_PATTERN.sub("", candidate).strip()
        if not body:
            continue
        pieces = (chunk.strip() for chunk in _SENTENCE_SPLIT_PATTERN.split(body))
        collected.extend(
            piece for piece in pieces if len(piece) >= _MIN_SENTENCE_LEN
        )
    return collected
|
|
|
|
|
|
# Minimum share of a sentence's keywords that must occur in the HTML text.
_SENTENCE_KEYWORD_MATCH_THRESHOLD = 0.6
# Minimum SequenceMatcher ratio against a single HTML fragment.
_SENTENCE_SEQUENCE_MATCH_THRESHOLD = 0.65


def _sentence_matches_html(
    sentence: str,
    html_combined: str,
    html_texts: list[str],
) -> bool:
    """Return True if ``sentence`` is preserved on the HTML side.

    Two gates; passing either one is enough:

    * keyword gate - the share of the sentence's keywords that occur as
      substrings of ``html_combined`` reaches
      ``_SENTENCE_KEYWORD_MATCH_THRESHOLD``;
    * sequence gate - the ``SequenceMatcher`` ratio between the
      normalized sentence and some normalized fragment of ``html_texts``
      reaches ``_SENTENCE_SEQUENCE_MATCH_THRESHOLD``.

    A sentence that yields no keywords after normalization counts as
    preserved, because there is nothing to falsify.

    Args:
        sentence: One source sentence.
        html_combined: Normalized join of all visible HTML fragments.
        html_texts: The individual, not yet normalized, HTML fragments.

    Returns:
        True when the sentence is considered preserved.
    """
    norm_orig = normalize_for_comparison(sentence)
    keywords = extract_keywords(norm_orig)
    if not keywords:
        return True
    kw_found = sum(1 for kw in keywords if kw in html_combined)
    # The keyword gate is cheap, so decide on it first; the sequence
    # comparison below is only needed when this gate fails.
    if kw_found / len(keywords) >= _SENTENCE_KEYWORD_MATCH_THRESHOLD:
        return True
    # any() stops at the first fragment that is similar enough. The
    # argument order of SequenceMatcher is kept because ratio() may
    # depend on it.
    return any(
        SequenceMatcher(
            None, norm_orig, normalize_for_comparison(html_text)
        ).ratio() >= _SENTENCE_SEQUENCE_MATCH_THRESHOLD
        for html_text in html_texts
    )
|
|
|
|
|
|
# Default minimum share of source sentences that must be preserved.
_TEXT_PRESERVATION_DEFAULT_THRESHOLD = 0.70
# At most this many missing sentences are listed in the error report.
_MISSING_SENTENCE_REPORT_LIMIT = 5
# Longer missing sentences are cut to this length and suffixed with "...".
_MISSING_SENTENCE_TRUNCATE_LEN = 60


def verify_text_preservation(
    original_mdx: str,
    generated_html: str,
    area_name: str,
    threshold: float = _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
) -> VerificationResult:
    """Check that the MDX source text survives in the generated HTML.

    The MDX is split with ``split_into_sentences`` and every sentence is
    tested with ``_sentence_matches_html`` against the visible HTML
    text. The score is the share of matched sentences and the check
    passes when the score reaches ``threshold``.

    Args:
        original_mdx: Source MDX text.
        generated_html: HTML produced for the same area.
        area_name: Area identifier copied into the result.
        threshold: Minimum score required to pass.

    Returns:
        A ``VerificationResult`` whose ``checks`` holds the single key
        ``"text_preservation"``. With no source sentences the result is
        a pass with score 1.0. On failure ``errors`` holds a header line
        plus up to ``_MISSING_SENTENCE_REPORT_LIMIT`` missing sentences;
        ``warnings`` holds the preservation rate whenever the score is
        below 1.0.
    """
    sentences = split_into_sentences(original_mdx)
    if not sentences:
        return VerificationResult(
            passed=True,
            area_name=area_name,
            checks={"text_preservation": True},
            score=1.0,
        )

    fragments = extract_text_from_html(generated_html)
    combined = normalize_for_comparison(" ".join(fragments))
    missing = [
        sentence
        for sentence in sentences
        if not _sentence_matches_html(sentence, combined, fragments)
    ]

    total = len(sentences)
    matched = total - len(missing)
    score = matched / total
    passed = score >= threshold

    errors: list[str] = []
    if not passed:
        errors.append(f"누락 문장 ({len(missing)}/{total}):")
        for sentence in missing[:_MISSING_SENTENCE_REPORT_LIMIT]:
            if len(sentence) > _MISSING_SENTENCE_TRUNCATE_LEN:
                shown = sentence[:_MISSING_SENTENCE_TRUNCATE_LEN]
                errors.append(f' - "{shown}..."')
            else:
                errors.append(f' - "{sentence}"')

    warnings: list[str] = []
    if score < 1.0:
        warnings.append(f"보존율: {score:.0%} ({matched}/{total} 문장)")

    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"text_preservation": passed},
        score=score,
        errors=errors,
        warnings=warnings,
    )
|
|
|
|
|
|
# Fragments shorter than this are not inspected (default for min_length).
_INVENTED_TEXT_MIN_LENGTH = 15
# Structural labels that may appear in the HTML without an MDX anchor.
_INVENTED_TEXT_ALLOWED_LABELS: frozenset[str] = frozenset({
    "용어 정의", "핵심 메시지", "상세 비교",
})
# Fragments made only of digits, whitespace and CSS-like characters.
_INVENTED_TEXT_CSS_NUMBER_PATTERN = re.compile(r"^[\d\s.,%px#rgb()]+$")
# A fragment is flagged when its keyword hit ratio is below this value.
_INVENTED_TEXT_KEYWORD_THRESHOLD = 0.4
# Flagged fragments are cut to this many characters.
_INVENTED_TEXT_TRUNCATE_LEN = 80


def detect_invented_text(
    original_mdx: str,
    generated_html: str,
    min_length: int = _INVENTED_TEXT_MIN_LENGTH,
) -> list[str]:
    """List HTML text fragments that are not anchored in the source MDX.

    Phase Z port of the H3 hallucination guard. Each visible HTML
    fragment is skipped when it is shorter than ``min_length``, is one
    of ``_INVENTED_TEXT_ALLOWED_LABELS``, or matches
    ``_INVENTED_TEXT_CSS_NUMBER_PATTERN``. For the rest, keywords are
    extracted from the normalized fragment and looked up as substrings
    of the normalized MDX. A fragment with no keywords is skipped; a
    fragment whose hit ratio is below
    ``_INVENTED_TEXT_KEYWORD_THRESHOLD`` is flagged. Deterministic, pure.

    Args:
        original_mdx: Source MDX text.
        generated_html: HTML produced from that source.
        min_length: Minimum fragment length worth inspecting.

    Returns:
        Flagged fragments in document order, each cut to
        ``_INVENTED_TEXT_TRUNCATE_LEN`` characters.
    """
    norm_mdx = normalize_for_comparison(original_mdx)
    flagged: list[str] = []
    for fragment in extract_text_from_html(generated_html):
        candidate = fragment.strip()
        is_noise = (
            len(candidate) < min_length
            or candidate in _INVENTED_TEXT_ALLOWED_LABELS
            or _INVENTED_TEXT_CSS_NUMBER_PATTERN.match(candidate) is not None
        )
        if is_noise:
            continue
        keywords = extract_keywords(normalize_for_comparison(candidate))
        if not keywords:
            # No keywords means nothing to check against the MDX.
            continue
        hits = sum(1 for kw in keywords if kw in norm_mdx)
        if hits / len(keywords) < _INVENTED_TEXT_KEYWORD_THRESHOLD:
            flagged.append(candidate[:_INVENTED_TEXT_TRUNCATE_LEN])
    return flagged
|