# C.E.L_Slide_test2/src/phase_z2_verification_utils.py

"""Phase Z2 deterministic verification utilities (IMP-16-U1 port).

Ports the H3 deterministic subset of src/content_verifier.py into a
Phase Z-owned module so the Phase Z pipeline never imports the Phase Q
reference-only module (which co-hosts H4/H5 Kei/AI assets).

Scope: deterministic, pure, no I/O, no LLM call, no httpx/SSE.
Wiring into Step 1/2/14/21/22 is gated behind IMP-07 (see
docs/architecture/IMP-16-U2-WIRING-DESIGN.md when u11 lands).
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from html.parser import HTMLParser


@dataclass
class VerificationResult:
    """Single-axis deterministic verification outcome.

    Mirrors the Phase Q VerificationResult shape so callers ported from
    that surface keep their field access; the value semantics are
    Phase Z-owned (no Phase Q area defaults baked in).

    ``passed`` and ``area_name`` are required. Every other field has a
    default, and the container fields get a fresh object per instance
    (``default_factory``) so results never share mutable state.
    """

    # Overall verdict for this verification axis.
    passed: bool
    # Caller-supplied label identifying what was verified.
    area_name: str
    # Per-check verdicts keyed by check name (e.g. "text_preservation").
    checks: dict[str, bool] = field(default_factory=dict)
    # Numeric score for the axis; defaults to 0.0 when not supplied.
    score: float = 0.0
    # Human-readable failure details.
    errors: list[str] = field(default_factory=list)
    # Human-readable non-fatal notes.
    warnings: list[str] = field(default_factory=list)


class _TextExtractor(HTMLParser):
    """Extract visible text only. Skips <style> and <script> bodies.

    Pure stdlib (html.parser). Whitespace-only data chunks are dropped;
    surviving chunks are stripped before being appended to ``texts`` in
    document order.
    """

    def __init__(self) -> None:
        super().__init__()
        # Visible text fragments, in the order the parser reported them.
        self.texts: list[str] = []
        # True while inside a <style>/<script> element.
        self._skip = False

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Start suppressing data when a style/script element opens."""
        if tag in ("style", "script"):
            self._skip = True

    def handle_endtag(self, tag: str) -> None:
        """Resume collecting data when a style/script element closes."""
        if tag in ("style", "script"):
            self._skip = False

    def handle_data(self, data: str) -> None:
        """Record a stripped, non-empty text chunk unless suppressed."""
        if self._skip:
            return
        stripped = data.strip()
        if stripped:
            self.texts.append(stripped)


def extract_text_from_html(html: str) -> list[str]:
    """Return ordered list of visible text fragments from an HTML string.

    Deterministic, pure: no I/O, no LLM, no network. Used by Phase Z
    verification to compare reverse-path HTML against MDX text without
    importing the Phase Q reference-only module.

    Args:
        html: Complete HTML document or fragment.

    Returns:
        Stripped, non-empty text chunks in document order, excluding the
        contents of <style> and <script> elements.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() alone treats the input as a possibly-incomplete stream and
    # holds back trailing data it cannot yet classify (e.g. text ending
    # in "R&D", where the "&" might start a character reference).
    # close() signals end-of-input so that buffered tail is delivered.
    parser.close()
    return parser.texts


# Korean particles that extract_keywords folds off the end of a token.
# Sorted longest-first so that a two-character particle is tried before a
# one-character particle that is also its suffix (e.g. "으로" before "로").
_PARTICLES: list[str] = sorted(
    ["에서", "으로", "부터", "까지", "에게", "한테",
     "은", "는", "이", "가", "을", "를", "에", "의",
     "로", "와", "과", "도", "만", "께"],
    key=len, reverse=True,
)

# Noun-style sentence endings mapped to their declarative equivalents.
_ENDING_NORMALIZE: dict[str, str] = {
    "있음": "있다", "됨": "된다", "함": "한다", "임": "이다",
    "없음": "없다", "았음": "았다", "었음": "었다",
}


def normalize_for_comparison(text: str) -> str:
    """Canonicalize ``text`` so two renderings can be compared directly.

    Applied in this fixed order:

    1. Collapse every whitespace run to one space and trim.
    2. Remove every bullet-marker character (anywhere in the string)
       and trim again.
    3. Decode a small fixed set of HTML entities, one after another.
    4. If the result ends with a key of ``_ENDING_NORMALIZE``, replace
       that ending with the mapped form (first matching key only).
    """
    collapsed = re.sub(r"\s+", " ", text).strip()
    decoded = re.sub(r"[•◦·\-▪▸►]", "", collapsed).strip()

    # Sequential replacement: the table order is the decode order.
    entity_table = (
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&quot;", '"'),
    )
    for entity, replacement in entity_table:
        decoded = decoded.replace(entity, replacement)

    ending = next(
        (key for key in _ENDING_NORMALIZE if decoded.endswith(key)),
        None,
    )
    if ending is None:
        return decoded
    return decoded[: -len(ending)] + _ENDING_NORMALIZE[ending]


def extract_keywords(text: str) -> list[str]:
    """Return comparison keywords found in ``text``, in order of appearance.

    Tokens are maximal runs matching ``[가-힣a-zA-Z0-9()]+``. Tokens
    shorter than 3 characters are discarded. For each remaining token,
    the first entry of ``_PARTICLES`` that is a suffix of the token and
    leaves a stem of at least 2 characters is removed; at most one
    particle is removed per token. Duplicates are kept.
    """

    def fold_particle(token: str) -> str:
        # _PARTICLES is scanned in its stored order; first hit wins.
        for particle in _PARTICLES:
            stem_len = len(token) - len(particle)
            if stem_len >= 2 and token.endswith(particle):
                return token[:stem_len]
        return token

    return [
        fold_particle(token)
        for token in re.findall(r"[가-힣a-zA-Z0-9()]+", text)
        if len(token) >= 3
    ]


# Line prefixes that mark a prompt directive rather than slide content.
_META_PREFIXES: list[str] = [
    "제목 라벨:",
    "표현 의도:",
    "슬라이드 주인공",
    "가장 큰 시각적 비중",
    "시각적으로",
    "간결하게 제기",
    "개별 증거로 제시",
    "계층적으로 시각화",
]

# Fragments that mark a directive line when found anywhere in the line.
_META_INLINE_FRAGMENTS: tuple[str, ...] = (
    "현상-문제 인과관계",
    "상위-하위 포함 관계",
    "독립적 나열",
)


def strip_meta_lines(text: str) -> str:
    """Remove prompt meta/instruction lines from ``text``.

    The input is split on ``"\\n"``. A line is removed when, after
    trimming surrounding whitespace, it begins with any entry of
    ``_META_PREFIXES`` or contains any entry of
    ``_META_INLINE_FRAGMENTS``. Surviving lines are rejoined with
    ``"\\n"`` exactly as they were (untrimmed, original order).
    Deterministic and pure.
    """
    prefixes = tuple(_META_PREFIXES)

    def is_meta(raw_line: str) -> bool:
        candidate = raw_line.strip()
        if candidate.startswith(prefixes):
            return True
        return any(marker in candidate for marker in _META_INLINE_FRAGMENTS)

    return "\n".join(
        raw_line for raw_line in text.split("\n") if not is_meta(raw_line)
    )


# Leading run of bullet characters/digits that is followed by "." or ")".
_BULLET_MARKER_PATTERN = re.compile(r"^[\-•◦·\d]+[.)]\s*")
# Whitespace that directly follows a period.
_SENTENCE_SPLIT_PATTERN = re.compile(r"(?<=\.)\s+")
# Parts shorter than this many characters are discarded.
_MIN_SENTENCE_LEN = 5


def split_into_sentences(text: str) -> list[str]:
    """Break ``text`` into comparison-ready sentences.

    Meta/instruction lines are removed first via ``strip_meta_lines``.
    Each remaining line is trimmed; blank lines and lines starting with
    ``#`` are skipped. A leading marker matching
    ``_BULLET_MARKER_PATTERN`` is removed, the line is split on
    whitespace that follows a period, and every trimmed part of at
    least ``_MIN_SENTENCE_LEN`` characters is kept, in order.
    """
    collected: list[str] = []
    for raw_line in strip_meta_lines(text).split("\n"):
        candidate = raw_line.strip()
        if candidate == "" or candidate.startswith("#"):
            continue
        body = _BULLET_MARKER_PATTERN.sub("", candidate).strip()
        if body == "":
            # The line consisted of nothing but a marker.
            continue
        pieces = (piece.strip() for piece in _SENTENCE_SPLIT_PATTERN.split(body))
        collected.extend(
            piece for piece in pieces if len(piece) >= _MIN_SENTENCE_LEN
        )
    return collected


_SENTENCE_KEYWORD_MATCH_THRESHOLD = 0.6
_SENTENCE_SEQUENCE_MATCH_THRESHOLD = 0.65


def _sentence_matches_html(
    sentence: str,
    html_combined: str,
    html_texts: list[str],
) -> bool:
    """Return True if ``sentence`` is preserved in the HTML side.

    Two-axis match: a keyword-ratio gate against ``html_combined`` (the
    pre-normalized join of all visible HTML text fragments) and a
    SequenceMatcher fallback against each individual normalized HTML
    fragment. A sentence whose keyword set is empty after normalization
    is treated as preserved (no falsifiable signal).

    The outcome is the same as computing both axes in full and OR-ing
    them; the function only avoids work that cannot change the answer.

    Args:
        sentence: One source sentence (not yet normalized).
        html_combined: Normalized join of all visible HTML fragments.
        html_texts: Raw visible HTML fragments; each is normalized here
            before the sequence comparison.

    Returns:
        True when the fraction of the sentence's keywords found in
        ``html_combined`` reaches ``_SENTENCE_KEYWORD_MATCH_THRESHOLD``,
        or when any fragment's similarity ratio reaches
        ``_SENTENCE_SEQUENCE_MATCH_THRESHOLD``.
    """
    norm_orig = normalize_for_comparison(sentence)
    keywords = extract_keywords(norm_orig)
    if not keywords:
        return True

    kw_found = sum(1 for kw in keywords if kw in html_combined)
    if kw_found / len(keywords) >= _SENTENCE_KEYWORD_MATCH_THRESHOLD:
        # Keyword axis already decides the match; skip the costly
        # per-fragment sequence comparison entirely.
        return True

    cutoff = _SENTENCE_SEQUENCE_MATCH_THRESHOLD
    for html_text in html_texts:
        # Argument order (sentence first, fragment second) is kept
        # because SequenceMatcher.ratio() is not symmetric in general.
        matcher = SequenceMatcher(
            None, norm_orig, normalize_for_comparison(html_text)
        )
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(); a fragment failing either cannot reach the cutoff.
        if (
            matcher.real_quick_ratio() >= cutoff
            and matcher.quick_ratio() >= cutoff
            and matcher.ratio() >= cutoff
        ):
            return True
    return False


# Minimum matched-sentence ratio for a pass when the caller gives none.
_TEXT_PRESERVATION_DEFAULT_THRESHOLD = 0.70
# Maximum number of missing sentences listed in the error report.
_MISSING_SENTENCE_REPORT_LIMIT = 5
# Missing sentences longer than this are cut and suffixed with "...".
_MISSING_SENTENCE_TRUNCATE_LEN = 60


def verify_text_preservation(
    original_mdx: str,
    generated_html: str,
    area_name: str,
    threshold: float = _TEXT_PRESERVATION_DEFAULT_THRESHOLD,
) -> VerificationResult:
    """Check how much of the source MDX text survives in the HTML.

    The MDX is split with ``split_into_sentences``; the HTML's visible
    text comes from ``extract_text_from_html``. Each sentence is judged
    by ``_sentence_matches_html`` and the score is the matched fraction.

    Args:
        original_mdx: Source MDX text.
        generated_html: HTML to verify against the source.
        area_name: Label copied into the result.
        threshold: Minimum score for ``passed`` to be True.

    Returns:
        A ``VerificationResult`` with the ``"text_preservation"`` check.
        With no extractable sentences the result passes with score 1.0.
        On failure, ``errors`` holds a header line plus up to
        ``_MISSING_SENTENCE_REPORT_LIMIT`` missing sentences, each cut
        at ``_MISSING_SENTENCE_TRUNCATE_LEN`` characters. ``warnings``
        carries the preservation rate whenever the score is below 1.0.
    """
    sentences = split_into_sentences(original_mdx)
    if not sentences:
        return VerificationResult(
            passed=True,
            area_name=area_name,
            checks={"text_preservation": True},
            score=1.0,
        )

    fragments = extract_text_from_html(generated_html)
    combined = normalize_for_comparison(" ".join(fragments))
    missing = [
        candidate
        for candidate in sentences
        if not _sentence_matches_html(candidate, combined, fragments)
    ]

    total = len(sentences)
    matched = total - len(missing)
    score = matched / total
    passed = score >= threshold

    errors: list[str] = []
    if not passed:
        errors.append(f"누락 문장 ({len(missing)}/{total}):")
        for lost in missing[:_MISSING_SENTENCE_REPORT_LIMIT]:
            if len(lost) > _MISSING_SENTENCE_TRUNCATE_LEN:
                errors.append(
                    f"  - \"{lost[:_MISSING_SENTENCE_TRUNCATE_LEN]}...\""
                )
            else:
                errors.append(f"  - \"{lost}\"")

    warnings: list[str] = []
    if score < 1.0:
        warnings.append(f"보존율: {score:.0%} ({matched}/{total} 문장)")

    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"text_preservation": passed},
        score=score,
        errors=errors,
        warnings=warnings,
    )


# Fragments shorter than this are never examined (default for min_length).
_INVENTED_TEXT_MIN_LENGTH = 15
# Structural labels that are exempt when they match a fragment exactly.
_INVENTED_TEXT_ALLOWED_LABELS: frozenset[str] = frozenset({
    "용어 정의", "핵심 메시지", "상세 비교",
})
# Fragments made up solely of these characters are treated as noise.
_INVENTED_TEXT_CSS_NUMBER_PATTERN = re.compile(r"^[\d\s.,%px#rgb()]+$")
# A fragment is flagged when its keyword hit ratio falls below this value.
_INVENTED_TEXT_KEYWORD_THRESHOLD = 0.4
# Flagged fragments are cut to this many characters in the output.
_INVENTED_TEXT_TRUNCATE_LEN = 80


def detect_invented_text(
    original_mdx: str,
    generated_html: str,
    min_length: int = _INVENTED_TEXT_MIN_LENGTH,
) -> list[str]:
    """List visible HTML fragments that are not anchored in the MDX.

    Every visible fragment of ``generated_html`` is trimmed and then
    exempted when it is shorter than ``min_length``, equals an entry of
    ``_INVENTED_TEXT_ALLOWED_LABELS``, or matches
    ``_INVENTED_TEXT_CSS_NUMBER_PATTERN``. For the rest, keywords are
    extracted from the normalized fragment and each is looked up as a
    substring of the normalized MDX. Fragments yielding no keywords are
    not flagged.

    Args:
        original_mdx: Source MDX text used as the anchor.
        generated_html: HTML whose visible text is inspected.
        min_length: Minimum fragment length worth inspecting.

    Returns:
        Flagged fragments in document order, each cut to
        ``_INVENTED_TEXT_TRUNCATE_LEN`` characters. A fragment is
        flagged when the share of its keywords found in the MDX is
        below ``_INVENTED_TEXT_KEYWORD_THRESHOLD``.
    """
    fragments = extract_text_from_html(generated_html)
    norm_mdx = normalize_for_comparison(original_mdx)

    flagged: list[str] = []
    for fragment in fragments:
        candidate = fragment.strip()
        exempt = (
            len(candidate) < min_length
            or candidate in _INVENTED_TEXT_ALLOWED_LABELS
            or _INVENTED_TEXT_CSS_NUMBER_PATTERN.match(candidate) is not None
        )
        if exempt:
            continue

        keywords = extract_keywords(normalize_for_comparison(candidate))
        if not keywords:
            # Nothing to look up, so the fragment cannot be judged.
            continue

        hits = sum(1 for keyword in keywords if keyword in norm_mdx)
        if hits / len(keywords) < _INVENTED_TEXT_KEYWORD_THRESHOLD:
            flagged.append(candidate[:_INVENTED_TEXT_TRUNCATE_LEN])
    return flagged