Files
C.E.L_Slide_test2/tests/phase_z2/test_pz2_vu_invented.py
kyeongmin 23ba8b68cd feat(IMP-16): U1 H3 verification utility port + U2 wiring design
U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module
src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility
surface — VerificationResult, extract_text_from_html, normalize_for_comparison,
extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation,
detect_invented_text. 10 unit-test modules under tests/phase_z2/test_pz2_vu_*.py (56 tests).

u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step
1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern
reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands
in this commit — U2 stays blocked until IMP-07 reverse path is implemented +
verified + runtime-hit.

Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS /
generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area /
verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract
upheld (utility is deterministic).

Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 04:42:35 +09:00

85 lines
3.3 KiB
Python

"""Tests for IMP-16-U1 unit u9: ``detect_invented_text``.
Locks the Phase Z port of the deterministic hallucination guard
(Phase Q reference: ``src/content_verifier.py:276-315``). The function
is pure and composes u2 (extract_text_from_html), u3
(normalize_for_comparison), and u4 (extract_keywords). No Phase Q
import is exercised.
"""
from __future__ import annotations
from src.phase_z2_verification_utils import (
_INVENTED_TEXT_ALLOWED_LABELS,
_INVENTED_TEXT_CSS_NUMBER_PATTERN,
_INVENTED_TEXT_KEYWORD_THRESHOLD,
_INVENTED_TEXT_MIN_LENGTH,
_INVENTED_TEXT_TRUNCATE_LEN,
detect_invented_text,
)
def test_detect_invented_text_constants_locked() -> None:
    """Pin the five named module constants ported from Phase Q literals.

    Each constant is compared against the exact literal this suite
    expects, so any drift in the utility module fails here first.
    """
    expected_labels = frozenset({"용어 정의", "핵심 메시지", "상세 비교"})
    expected_css_regex = r"^[\d\s.,%px#rgb()]+$"

    assert _INVENTED_TEXT_MIN_LENGTH == 15
    assert _INVENTED_TEXT_ALLOWED_LABELS == expected_labels
    # Compare the regex source text rather than the compiled object.
    assert _INVENTED_TEXT_CSS_NUMBER_PATTERN.pattern == expected_css_regex
    assert _INVENTED_TEXT_KEYWORD_THRESHOLD == 0.4
    assert _INVENTED_TEXT_TRUNCATE_LEN == 80
def test_detect_invented_text_returns_empty_when_html_is_in_mdx() -> None:
    """HTML whose text repeats the MDX sentence must produce no findings."""
    source_mdx = "원본 콘텐츠는 분석에 관한 것입니다."
    rendered_html = "<p>원본 콘텐츠는 분석에 관한 것입니다.</p>"

    flagged = detect_invented_text(source_mdx, rendered_html)

    assert flagged == []
def test_detect_invented_text_flags_text_with_low_keyword_overlap() -> None:
    """HTML text sharing no keywords with the MDX is reported as invented."""
    source_mdx = "원본 콘텐츠는 분석에 관한 것입니다."
    rendered_html = "<p>완전히 다른 발명된 텍스트가 여기 있습니다 일반적이지 않은</p>"

    flagged = detect_invented_text(source_mdx, rendered_html)

    # Exactly one fragment is expected, and it must carry the invented word.
    assert len(flagged) == 1
    assert "발명된" in flagged[0]
def test_detect_invented_text_skips_short_text() -> None:
    """A fragment shorter than ``min_length`` is never reported."""
    source_mdx = "원본 콘텐츠"
    rendered_html = "<p>짧은 텍스트</p>"

    flagged = detect_invented_text(source_mdx, rendered_html)

    assert flagged == []
def test_detect_invented_text_skips_allowed_structural_labels() -> None:
    """Allowed structural labels are ignored even with zero keyword overlap.

    With the Phase Q default ``min_length=15`` the allowed-label gate
    cannot be reached by the bundled labels, because each of them is
    shorter than 15 characters. The Phase Z port keeps the gate as-is,
    so this test passes ``min_length=0`` to make the label
    short-circuit observable.
    """
    source_mdx = "원본 콘텐츠"
    label_only_html = "<h2>용어 정의</h2><h2>핵심 메시지</h2><h2>상세 비교</h2>"

    flagged = detect_invented_text(source_mdx, label_only_html, min_length=0)

    assert flagged == []
def test_detect_invented_text_skips_css_number_pattern_fragments() -> None:
    """CSS/numeric fragments such as ``100px 200px 300px`` are not reported."""
    source_mdx = "원본 콘텐츠"
    css_like_html = (
        "<style>.x { padding: 100px; }</style><div>100px 200px 300px</div>"
    )

    flagged = detect_invented_text(source_mdx, css_like_html)

    assert flagged == []
def test_detect_invented_text_truncates_flagged_value_to_80_chars() -> None:
    """A reported fragment longer than 80 characters is cut to 80."""
    source_mdx = "원본 콘텐츠"
    # 2 characters repeated 50 times gives a 100-character fragment.
    fabricated = "발명" * 50
    rendered_html = f"<p>{fabricated}</p>"

    flagged = detect_invented_text(source_mdx, rendered_html)

    assert len(flagged) == 1
    reported = flagged[0]
    assert len(reported) == 80
    assert reported == fabricated[:80]