U1 (runtime, u1-u10): new Phase Z-owned deterministic verification module src/phase_z2_verification_utils.py (335 LOC, stdlib only) porting H3 utility surface — VerificationResult, extract_text_from_html, normalize_for_comparison, extract_keywords, strip_meta_lines, split_into_sentences, verify_text_preservation, detect_invented_text. 10 unit tests under tests/phase_z2/test_pz2_vu_*.py (56 tests). u11 (design-only): docs/architecture/IMP-16-U2-WIRING-DESIGN.md fixes the Step 1/2/14/21/22 reverse-path contract, redesigned frame-contract pattern reservation (IMP-20), and IMP-07 hard-gate criteria. No runtime wiring lands in this commit — U2 stays blocked until IMP-07 reverse path is implemented + verified + runtime-hit. Guardrails: no src.content_verifier import; no FORBIDDEN_KEI_MEMOS / generate_with_retry / REQUIRED_PATTERNS / verify_structure / verify_area / verify_all_areas usage; no AI / Kei / httpx / SSE path; AI-isolation contract upheld (utility is deterministic). Tests: 56 targeted PASS (0.19s), 15 regression baseline PASS (7.59s). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
55 lines
1.8 KiB
Python
55 lines
1.8 KiB
Python
"""u2 — pure HTML text extraction surface (IMP-16-U1).
|
|
|
|
Locks the deterministic visible-text extraction contract:
|
|
- <style> / <script> contents are excluded.
|
|
- Whitespace-only chunks are dropped; surviving chunks are stripped.
|
|
- Order of visible-text fragments is preserved.
|
|
- No import of src.content_verifier.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
|
|
def test_extract_plain_text_fragments_in_order():
    """Expect one fragment per sibling <p>, listed in document order."""
    # Imported inside the test, as in every test of this module.
    from src.phase_z2_verification_utils import extract_text_from_html

    markup = "<p>first</p><p>second</p><p>third</p>"

    fragments = extract_text_from_html(markup)

    assert fragments == ["first", "second", "third"]
|
|
|
|
|
|
def test_extract_skips_style_and_script_bodies():
    """Expect <style>/<script> bodies to be absent while body text is kept."""
    from src.phase_z2_verification_utils import extract_text_from_html

    # The document is assembled from named parts; the joined string is the
    # same single literal the extractor has to process.
    style_block = "<style>body { color: red; } .x { font-size: 12px; }</style>"
    script_block = "<script>var keep_out = 1;</script>"
    document = "".join(
        [
            "<html><head>",
            style_block,
            script_block,
            "</head><body><p>visible</p></body></html>",
        ]
    )

    fragments = extract_text_from_html(document)

    # The visible paragraph text must be one of the returned fragments.
    assert "visible" in fragments

    # Neither the CSS rule nor the script variable may appear anywhere in
    # the extracted output, even as part of a larger fragment.
    flattened = " ".join(fragments)
    for leaked in ("color: red", "keep_out"):
        assert leaked not in flattened
|
|
|
|
|
|
def test_extract_drops_whitespace_only_chunks_and_strips_survivors():
    """Expect whitespace-only chunks to vanish and the rest to be stripped."""
    from src.phase_z2_verification_utils import extract_text_from_html

    # First <div> holds only whitespace; the other two elements carry
    # padded text that should come back without the padding.
    markup = "<div> \n\n </div><div> hello </div><span> world\t</span>"
    expected = ["hello", "world"]

    fragments = extract_text_from_html(markup)

    assert fragments == expected
|
|
|
|
|
|
def test_extract_preserves_korean_and_inline_markup_text():
    """Expect Korean text split at the inline <strong> boundaries, in order."""
    from src.phase_z2_verification_utils import extract_text_from_html

    markup = "<p>설계 <strong>방식</strong>의 왜곡</p>"
    # Three fragments: text before <strong>, its content, and the text
    # after it; the interior space of the last fragment is retained.
    expected = ["설계", "방식", "의 왜곡"]

    fragments = extract_text_from_html(markup)

    assert fragments == expected
|
|
|
|
|
|
def test_extract_empty_input_returns_empty_list():
    """Expect an empty list when the input is the empty string."""
    from src.phase_z2_verification_utils import extract_text_from_html

    fragments = extract_text_from_html("")

    assert fragments == []
|