"""Tests for IMP-16-U1 unit u9: ``detect_invented_text``. Locks the Phase Z port of the deterministic hallucination guard (Phase Q reference: ``src/content_verifier.py:276-315``). The function is pure and composes u2 (extract_text_from_html), u3 (normalize_for_comparison), and u4 (extract_keywords). No Phase Q import is exercised. """ from __future__ import annotations from src.phase_z2_verification_utils import ( _INVENTED_TEXT_ALLOWED_LABELS, _INVENTED_TEXT_CSS_NUMBER_PATTERN, _INVENTED_TEXT_KEYWORD_THRESHOLD, _INVENTED_TEXT_MIN_LENGTH, _INVENTED_TEXT_TRUNCATE_LEN, detect_invented_text, ) def test_detect_invented_text_constants_locked() -> None: """Lock the five named module constants ported from Phase Q literals.""" assert _INVENTED_TEXT_MIN_LENGTH == 15 assert _INVENTED_TEXT_ALLOWED_LABELS == frozenset( {"용어 정의", "핵심 메시지", "상세 비교"} ) assert _INVENTED_TEXT_CSS_NUMBER_PATTERN.pattern == r"^[\d\s.,%px#rgb()]+$" assert _INVENTED_TEXT_KEYWORD_THRESHOLD == 0.4 assert _INVENTED_TEXT_TRUNCATE_LEN == 80 def test_detect_invented_text_returns_empty_when_html_is_in_mdx() -> None: """Text whose keywords fully appear in MDX is NOT flagged.""" mdx = "원본 콘텐츠는 분석에 관한 것입니다." html = "
원본 콘텐츠는 분석에 관한 것입니다.
" assert detect_invented_text(mdx, html) == [] def test_detect_invented_text_flags_text_with_low_keyword_overlap() -> None: """Text whose keywords do not appear in MDX is flagged as invented.""" mdx = "원본 콘텐츠는 분석에 관한 것입니다." html = "완전히 다른 발명된 텍스트가 여기 있습니다 일반적이지 않은
" result = detect_invented_text(mdx, html) assert len(result) == 1 assert "발명된" in result[0] def test_detect_invented_text_skips_short_text() -> None: """Text shorter than ``min_length`` is not even considered.""" mdx = "원본 콘텐츠" html = "짧은 텍스트
" assert detect_invented_text(mdx, html) == [] def test_detect_invented_text_skips_allowed_structural_labels() -> None: """Allowed labels are skipped even when keyword overlap is zero. Phase Q default ``min_length=15`` makes the allowed-label gate unreachable for the bundled labels (all < 15 chars). The Phase Z port preserves the gate verbatim — exercised here with ``min_length=0`` so the structural-label short-circuit is actually observable. """ mdx = "원본 콘텐츠" html = "{invented}
" result = detect_invented_text(mdx, html) assert len(result) == 1 assert len(result[0]) == 80 assert result[0] == invented[:80]