# C.E.L_Slide_test2/src/content_verifier.py

"""Phase S: 생성 HTML 콘텐츠 검증 + 재시도 루프.

생성기(html_generator)와 완전히 분리된 독립 검증.
코드 기반 검증을 먼저, LLM 검증은 코드가 못 잡는 것만.

검증 계층:
  Layer 1: 텍스트 보존 검증 (코드, $0)
  Layer 2: 금지 콘텐츠 검증 (코드, $0)
  Layer 3: 구조 검증 (코드, $0)
  Layer 4: 오버플로 검증 (Selenium, $0) — slide_measurer.py 재사용
  Layer 5: 시각 품질 검증 (Opus 비전, $$) — kei_client.py 재사용
"""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher
from html.parser import HTMLParser

logger = logging.getLogger(__name__)


# ═══════════════════════════════════════════════════════════
# 데이터 구조
# ═══════════════════════════════════════════════════════════

@dataclass
class VerificationResult:
    """Outcome of verifying a single slide area.

    Each check function in this module returns one instance describing its
    own check; ``verify_area`` merges several of them into one combined
    result for the area.
    """
    # True when the check (or every merged check) passed.
    passed: bool
    # Name of the verified area, e.g. "body_bg", "body_core", "sidebar", "footer".
    area_name: str
    # Pass flag per check name ("text_preservation", "no_forbidden", "structure").
    checks: dict[str, bool] = field(default_factory=dict)
    # Numeric score reported by the check; the checks report 1.0 on a full pass.
    score: float = 0.0
    # Human-readable reasons for a failure.
    errors: list[str] = field(default_factory=list)
    # Non-fatal notes (e.g. the preservation ratio when it is below 100%).
    warnings: list[str] = field(default_factory=list)


# ═══════════════════════════════════════════════════════════
# HTML 텍스트 추출
# ═══════════════════════════════════════════════════════════

class _TextExtractor(HTMLParser):
    """Collect the visible text nodes of an HTML document.

    Character data inside ``<style>`` and ``<script>`` elements is ignored.
    Every other text node is whitespace-stripped and, when non-empty,
    appended to ``texts`` in document order.
    """

    # Elements whose character data is never rendered as text.
    _HIDDEN_TAGS = ("style", "script")

    def __init__(self):
        super().__init__()
        self.texts: list[str] = []
        self._skip = False

    def handle_starttag(self, tag, attrs):
        if tag in self._HIDDEN_TAGS:
            self._skip = True

    def handle_endtag(self, tag):
        if tag in self._HIDDEN_TAGS:
            self._skip = False

    def handle_data(self, data):
        if self._skip:
            return
        stripped = data.strip()
        if stripped:
            self.texts.append(stripped)


def extract_text_from_html(html: str) -> list[str]:
    """Return the visible text fragments of *html* in document order.

    Args:
        html: An HTML document or fragment.

    Returns:
        One whitespace-stripped string per non-empty text node; text inside
        ``<style>`` / ``<script>`` is excluded.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() keeps the tail of the input buffered when it might still be
    # incomplete (for instance a final text run containing a bare "&", as in
    # "R&D").  close() forces that buffered tail through handle_data();
    # without it such trailing text was silently lost.
    parser.close()
    return parser.texts


# ═══════════════════════════════════════════════════════════
# 텍스트 정규화 + 키워드 추출
# ═══════════════════════════════════════════════════════════

# Korean postpositional particles, sorted longest-first so that a
# two-character particle such as "으로" is tried before the one-character
# particle "로" that is also its suffix.
_PARTICLES = sorted([
    "에서", "으로", "부터", "까지", "에게", "한테",
    "은", "는", "이", "가", "을", "를", "에", "의",
    "로", "와", "과", "도", "만", "께",
], key=len, reverse=True)

# 개조식 어미 변환 매핑 (역변환: 개조식 → 서술형)
_ENDING_NORMALIZE = {
    "있음": "있다",
    "됨": "된다",
    "함": "한다",
    "임": "이다",
    "없음": "없다",
    "았음": "았다",
    "었음": "었다",
    "됨": "된다",
}


def normalize_for_comparison(text: str) -> str:
    """비교용 텍스트 정규화.

    1. 공백/줄바꿈 통일
    2. 불릿 마커 제거
    3. HTML 엔티티 디코딩
    4. 개조식 어미 → 서술형으로 통일 (양쪽 비교 기준 통일)
    """
    # 공백 정규화
    text = re.sub(r"\s+", " ", text).strip()
    # 불릿 마커 제거
    text = re.sub(r"[•◦·\-▪▸►]", "", text).strip()
    # HTML 엔티티
    text = text.replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    text = text.replace("&nbsp;", " ").replace("&#39;", "'").replace("&quot;", '"')
    # 개조식 어미 → 서술형 (비교 기준 통일)
    for gaejo, seosul in _ENDING_NORMALIZE.items():
        if text.endswith(gaejo):
            text = text[: -len(gaejo)] + seosul
            break
    return text


def extract_keywords(text: str) -> list[str]:
    """Return the comparison keywords found in *text*.

    Tokens are runs of Hangul syllables, ASCII letters, digits and
    parentheses.  Tokens shorter than three characters are dropped.  Each
    remaining token loses at most one trailing particle: the first entry of
    ``_PARTICLES`` that is a suffix of the token and leaves at least two
    characters behind.
    """

    def _drop_particle(token: str) -> str:
        for particle in _PARTICLES:
            stem_len = len(token) - len(particle)
            if stem_len >= 2 and token.endswith(particle):
                return token[:stem_len]
        return token

    tokens = re.findall(r"[가-힣a-zA-Z0-9()]+", text)
    # Every result is at least two characters long: it is either an untouched
    # token (>= 3 characters) or a stem that was required to keep >= 2.
    return [_drop_particle(token) for token in tokens if len(token) >= 3]


# Line prefixes that mark meta/instruction lines (Kei analysis metadata and
# prompt directives).  A line starting with any of these is excluded from
# verification by strip_meta_lines().
_META_PREFIXES = [
    "제목 라벨:",
    "표현 의도:",
    "슬라이드 주인공",
    "가장 큰 시각적 비중",
    "시각적으로",
    "간결하게 제기",
    "개별 증거로 제시",
    "계층적으로 시각화",
]


def strip_meta_lines(text: str) -> str:
    """Drop meta/instruction lines from *text* before verification.

    A line is dropped when, after trimming, it starts with one of
    ``_META_PREFIXES`` or contains one of the expression-hint phrases below.
    Per the original note, such lines are added by ``_map_sections_for_role()``
    as instructions for the generator and are not slide content.  All other
    lines are kept unchanged, in order.
    """
    # Expression-hint phrases; matched anywhere in the line because they may
    # appear mid-sentence.
    hint_phrases = ("현상-문제 인과관계", "상위-하위 포함 관계", "독립적 나열")
    meta_prefixes = tuple(_META_PREFIXES)

    def _is_meta(raw_line: str) -> bool:
        body = raw_line.strip()
        if body.startswith(meta_prefixes):
            return True
        return any(phrase in body for phrase in hint_phrases)

    return "\n".join(
        raw_line for raw_line in text.split("\n") if not _is_meta(raw_line)
    )


# Leading list marker: a bullet glyph followed by whitespace, or a number
# followed by "." / ")" that is not the start of a decimal such as "3.5".
_LIST_MARKER_RE = re.compile(r"^(?:[\-•◦·▪▸►*+]\s+|\d+[.)](?!\d)\s*)")
# Sentence boundary: whitespace that directly follows a period.
_SENTENCE_BREAK_RE = re.compile(r"(?<=\.)\s+")


def split_into_sentences(text: str) -> list[str]:
    """Split source text into the sentences that verification should look for.

    Meta lines are removed first (``strip_meta_lines``).  Each remaining line
    is trimmed; empty lines and lines starting with ``#`` (headers) are
    skipped.  A leading list marker is removed, the line is split after every
    period that is followed by whitespace, and parts shorter than five
    characters are discarded.

    The previous marker pattern demanded a trailing ``.`` or ``)`` after the
    bullet characters, so plain ``- item`` / ``• item`` bullets were never
    stripped, and it allowed no whitespace after the marker, so a line
    starting with a decimal (``3.5배 ...``) lost its integer part.  Both are
    handled by ``_LIST_MARKER_RE``; as a consequence the five-character
    minimum now applies to the content without its bullet.

    Args:
        text: Source (MDX) text for one area.

    Returns:
        The sentences in source order.
    """
    sentences = []
    for raw_line in strip_meta_lines(text).split("\n"):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        line = _LIST_MARKER_RE.sub("", line, count=1).strip()
        if not line:
            continue
        for part in _SENTENCE_BREAK_RE.split(line):
            part = part.strip()
            if len(part) >= 5:
                sentences.append(part)
    return sentences


# ═══════════════════════════════════════════════════════════
# Layer 1: 텍스트 보존 검증
# ═══════════════════════════════════════════════════════════

def verify_text_preservation(
    original_mdx: str,
    generated_html: str,
    area_name: str,
    threshold: float = 0.70,
) -> VerificationResult:
    """Check that the source sentences survive in the generated HTML.

    The source is split into sentences.  A sentence counts as preserved when
    it has no keywords, when at least 60% of its keywords occur in the HTML
    text, or when its similarity (``SequenceMatcher.ratio``) to some single
    HTML text fragment is at least 0.65.  The area passes when the share of
    preserved sentences reaches *threshold*.

    Args:
        original_mdx: Source text for the area.
        generated_html: HTML produced for the area.
        area_name: Area label copied into the result.
        threshold: Minimum preserved-sentence ratio required to pass.

    Returns:
        A result whose ``score`` is the preserved-sentence ratio.  On failure
        ``errors`` lists up to five missing sentences; ``warnings`` reports
        the ratio whenever it is below 100%.
    """
    original_sentences = split_into_sentences(original_mdx)
    if not original_sentences:
        return VerificationResult(
            passed=True, area_name=area_name,
            checks={"text_preservation": True}, score=1.0,
        )

    html_texts = extract_text_from_html(generated_html)
    # Normalise each fragment once instead of once per source sentence.
    norm_html_texts = [normalize_for_comparison(t) for t in html_texts]
    html_combined = normalize_for_comparison(" ".join(html_texts))
    # normalize_for_comparison() only rewrites the ending at the very end of
    # its input, so in the joined text only the last fragment gets a unified
    # ending.  Searching the per-fragment normalised texts as well lets a
    # declarative keyword match a note-style ending in any fragment.
    # Keywords contain no spaces, so they cannot match across the separators.
    keyword_haystack = " ".join([html_combined, *norm_html_texts])

    matched = 0
    missing: list[str] = []

    for sentence in original_sentences:
        norm_orig = normalize_for_comparison(sentence)
        keywords = extract_keywords(norm_orig)
        if not keywords:
            matched += 1
            continue

        kw_found = sum(1 for kw in keywords if kw in keyword_haystack)
        if kw_found / len(keywords) >= 0.6:
            matched += 1
            continue

        # Similarity fallback, evaluated only when the keyword test failed;
        # it stops at the first fragment that is similar enough.
        if any(
            SequenceMatcher(None, norm_orig, norm_html).ratio() >= 0.65
            for norm_html in norm_html_texts
        ):
            matched += 1
        else:
            missing.append(sentence)

    score = matched / len(original_sentences)
    passed = score >= threshold

    errors = []
    if not passed:
        errors = [f"누락 문장 ({len(missing)}/{len(original_sentences)}):"]
        for s in missing[:5]:  # report at most five
            errors.append(f"  - \"{s[:60]}...\"" if len(s) > 60 else f"  - \"{s}\"")

    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"text_preservation": passed},
        score=score,
        errors=errors,
        warnings=[f"보존율: {score:.0%} ({matched}/{len(original_sentences)} 문장)"]
        if score < 1.0 else [],
    )


def detect_invented_text(
    original_mdx: str,
    generated_html: str,
    min_length: int = 15,
) -> list[str]:
    """Find HTML text fragments that do not appear to come from the source.

    A fragment of at least *min_length* characters is reported when fewer
    than 40% of its keywords occur in the normalised source text.  Allowed
    structural labels, fragments made only of digits/CSS-value characters and
    fragments without keywords are never reported.

    Returns:
        The offending fragments, each truncated to 80 characters.
    """
    # Structural labels that may legitimately appear without a source.
    allowed_labels = {
        "용어 정의", "핵심 메시지", "상세 비교",
    }
    norm_mdx = normalize_for_comparison(original_mdx)

    invented = []
    for fragment in extract_text_from_html(generated_html):
        candidate = fragment.strip()
        if len(candidate) < min_length or candidate in allowed_labels:
            continue
        # Skip fragments consisting solely of digits / CSS-value characters.
        if re.match(r"^[\d\s.,%px#rgb()]+$", candidate):
            continue

        keywords = extract_keywords(normalize_for_comparison(candidate))
        if not keywords:
            continue

        hits = sum(kw in norm_mdx for kw in keywords)
        if hits / len(keywords) < 0.4:
            invented.append(candidate[:80])

    return invented


# ═══════════════════════════════════════════════════════════
# Layer 2: 금지 콘텐츠 검증
# ═══════════════════════════════════════════════════════════

# Kei memo phrases/identifiers that must never appear in the visible text of
# the generated HTML (checked by verify_no_forbidden_content for every area).
FORBIDDEN_KEI_MEMOS = [
    "간결한 문제 제기용",
    "핵심 메시지만 추출",
    "문제제기 핵심문장",
    "source_data",
    "expression_hint",
    "relation_type",
]

# Labels that must not appear inside the key-msg block; only checked for the
# "body_core" area.
FORBIDDEN_LABELS_IN_KEYMSG = [
    "상위개념",
    "하위기술",
    "포함관계",
]


# Opening tag carrying the "key-msg" class (on its own or among other class
# names, with single or double quotes) followed by the markup up to the next
# "</div>".  Group 1 is that inner markup.
_KEYMSG_BLOCK_RE = re.compile(
    r"""class\s*=\s*["'][^"']*(?<![\w-])key-msg(?![\w-])[^"']*["'][^>]*>(.*?)</div>""",
    re.DOTALL,
)


def verify_no_forbidden_content(
    generated_html: str,
    area_name: str,
) -> VerificationResult:
    """Check that the generated HTML contains no forbidden content.

    Two checks are made:
      * no entry of ``FORBIDDEN_KEI_MEMOS`` occurs in the visible text;
      * for the ``body_core`` area only, no entry of
        ``FORBIDDEN_LABELS_IN_KEYMSG`` occurs inside a key-msg block.

    Every key-msg block is inspected, and the class attribute may hold extra
    class names or use single quotes; previously only the first block written
    exactly as ``class="key-msg"`` was examined.  The inner markup is still
    cut at the first ``</div>``, so content after a nested ``<div>`` inside
    key-msg is not inspected.

    Returns:
        A result with score 1.0 on pass and 0.0 otherwise; ``errors`` holds
        one message per forbidden item found.
    """
    html_text = " ".join(extract_text_from_html(generated_html))

    found = [
        f"Kei 메모 포함: \"{memo}\""
        for memo in FORBIDDEN_KEI_MEMOS
        if memo in html_text
    ]

    if area_name == "body_core":
        keymsg_blocks = [
            m.group(1) for m in _KEYMSG_BLOCK_RE.finditer(generated_html)
        ]
        # Report each label once, however many blocks contain it.
        found.extend(
            f"key-msg에 금지 라벨: \"{label}\""
            for label in FORBIDDEN_LABELS_IN_KEYMSG
            if any(label in block for block in keymsg_blocks)
        )

    passed = not found
    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"no_forbidden": passed},
        score=1.0 if passed else 0.0,
        errors=found,
    )


# ═══════════════════════════════════════════════════════════
# Layer 3: 구조 검증
# ═══════════════════════════════════════════════════════════

# Substrings that must occur in the generated HTML, keyed by area name.  An
# entry of the form "a|b" is satisfied when either alternative is present
# (see verify_structure).  An empty list means no structural requirement.
REQUIRED_PATTERNS: dict[str, list[str]] = {
    "body_bg": ["overflow:hidden|overflow: hidden"],
    "body_core": [
        "overflow:hidden|overflow: hidden",
        "key-msg",
    ],
    "sidebar": [
        "overflow:hidden|overflow: hidden",
        "padding-left",
        "text-indent",
    ],
    "footer": [],
}


def verify_structure(
    generated_html: str,
    area_name: str,
    has_image: bool = False,
) -> VerificationResult:
    """Check that the required CSS/HTML patterns occur in the generated HTML.

    The patterns for *area_name* come from ``REQUIRED_PATTERNS`` (none for an
    unknown area).  Each pattern is a plain substring, or several substrings
    separated by ``|`` of which one must be present.  When *has_image* is
    true and the area is ``body_core``, the substring ``slide-img-`` is
    required as well.

    Returns:
        A result whose ``score`` is the fraction of requirements that were
        met.  The image requirement is counted in the total, which keeps the
        score within [0, 1]; it was previously left out of the denominator
        and could push the score below zero.
    """
    patterns = REQUIRED_PATTERNS.get(area_name, [])
    required_total = len(patterns)

    # "a|b" means: a or b must be present.
    missing = [
        pattern
        for pattern in patterns
        if not any(alt in generated_html for alt in pattern.split("|"))
    ]

    if has_image and area_name == "body_core":
        required_total += 1
        if "slide-img-" not in generated_html:
            missing.append("slide-img-* (이미지 태그)")

    passed = not missing
    score = 1.0 if passed else 1.0 - len(missing) / max(1, required_total)
    return VerificationResult(
        passed=passed,
        area_name=area_name,
        checks={"structure": passed},
        score=score,
        errors=[f"필수 패턴 누락: {p}" for p in missing],
    )


# ═══════════════════════════════════════════════════════════
# 합성 검증
# ═══════════════════════════════════════════════════════════

def verify_area(
    original_text: str,
    generated_html: str,
    area_name: str,
    has_image: bool = False,
) -> VerificationResult:
    """Run the text, forbidden-content and structure checks for one area.

    Returns:
        A combined result: it passes only when all three checks pass, its
        ``checks`` / ``errors`` / ``warnings`` are the merged values of the
        three checks in that order, and its ``score`` is their mean score.
    """
    layer_results = (
        verify_text_preservation(original_text, generated_html, area_name),
        verify_no_forbidden_content(generated_html, area_name),
        verify_structure(generated_html, area_name, has_image),
    )

    merged_checks = {}
    for layer in layer_results:
        merged_checks.update(layer.checks)

    return VerificationResult(
        passed=all(layer.passed for layer in layer_results),
        area_name=area_name,
        checks=merged_checks,
        score=sum(layer.score for layer in layer_results) / len(layer_results),
        errors=[msg for layer in layer_results for msg in layer.errors],
        warnings=[msg for layer in layer_results for msg in layer.warnings],
    )


def verify_all_areas(
    generated: dict[str, str],
    area_texts: dict[str, str],
    has_image_areas: set[str] | None = None,
) -> dict[str, VerificationResult]:
    """Verify every area that has both source text and generated HTML.

    Args:
        generated: Generated HTML keyed by "body_html", "sidebar_html" and
            "footer_html".
        area_texts: Source text keyed by area name ("body_bg", "body_core",
            "sidebar", "footer").
        has_image_areas: Names of areas that contain an image; only
            "body_core" is consulted.

    Returns:
        One result per verified area, in the order body_bg, body_core,
        sidebar, footer.  An area that has source text but whose HTML is
        missing or empty is not verified and is absent from the result; this
        is now logged as a warning instead of being skipped silently.
    """
    if has_image_areas is None:
        has_image_areas = set()

    # body_html holds the background and core areas together, so both areas
    # are verified against the same HTML.
    area_sources = (
        ("body_bg", "body_html"),
        ("body_core", "body_html"),
        ("sidebar", "sidebar_html"),
        ("footer", "footer_html"),
    )

    results = {}
    for area_name, html_key in area_sources:
        if area_name not in area_texts:
            continue
        area_html = generated.get(html_key, "")
        if not area_html:
            logger.warning(
                "[검증] %s: 생성된 HTML(%s)이 비어 있어 검증을 건너뜀",
                area_name, html_key,
            )
            continue
        results[area_name] = verify_area(
            area_texts[area_name], area_html, area_name,
            has_image=area_name == "body_core" and area_name in has_image_areas,
        )

    for name, r in results.items():
        status = "PASS" if r.passed else "FAIL"
        logger.info(
            "[검증] %s: %s (score=%s, errors=%d, warnings=%d)",
            name, status, format(r.score, ".0%"), len(r.errors), len(r.warnings),
        )
        for err in r.errors:
            logger.warning("[검증] %s 에러: %s", name, err)

    return results


# ═══════════════════════════════════════════════════════════
# 재시도 루프
# ═══════════════════════════════════════════════════════════

async def generate_with_retry(
    content: str,
    analysis: dict,
    container_specs: dict,
    preset: dict,
    images: list[dict] | None = None,
    max_retries: int = 2,
) -> tuple[dict[str, str], dict[str, VerificationResult]]:
    """Generate slide HTML, verify it, and regenerate the areas that fail.

    Flow:
      1. ``generate_slide_html()`` produces the HTML for all areas.
      2. ``validate_and_clean_html()`` sanitises it.
      3. ``verify_all_areas()`` checks it against the source text.
      4. Failed areas are regenerated with the verification errors as
         feedback, then verification runs again.
      5. Steps 3-4 repeat until everything passes or *max_retries*
         regeneration rounds have been used.

    Args:
        content: Source document text.
        analysis: Analysis dict; ``page_structure`` and ``topics`` are read.
        container_specs: Passed through to the generator.
        preset: Passed through to the generator.
        images: Optional image descriptors; an image whose ``topic_id``
            belongs to a core topic makes ``body_core`` require an image tag.
        max_retries: Number of regeneration rounds; negative values are
            treated as 0.

    Returns:
        ``(generated, verification)``: the final HTML per key and the last
        verification result per area.  The result may still contain failures.
    """
    from src.html_generator import (
        generate_slide_html,
        regenerate_area,
        _slice_mdx_sections,
        _map_sections_for_role,
        _get_definitions,
        _get_conclusion,
    )
    from src.html_validator import validate_and_clean_html

    # A negative budget would skip the loop below entirely and leave no
    # verification result to return.
    max_retries = max(0, max_retries)

    # Source text per area: the reference that verification compares against.
    sections = _slice_mdx_sections(content)
    page_struct = analysis.get("page_structure", {})
    topics = analysis.get("topics", [])
    topic_map = {t["id"]: t for t in topics}

    def get_topics_for_role(role: str) -> list[dict]:
        info = page_struct.get(role, {})
        if not isinstance(info, dict):
            return []
        return [topic_map[tid] for tid in info.get("topic_ids", []) if tid in topic_map]

    area_texts = {}
    bg_topics = get_topics_for_role("배경")
    if bg_topics:
        area_texts["body_bg"] = _map_sections_for_role(sections, bg_topics, ["혼용", "사례"])
    core_topics = get_topics_for_role("본심")
    if core_topics:
        area_texts["body_core"] = _map_sections_for_role(sections, core_topics, ["관계", "핵심기술", "DX"])
    ref_topics = get_topics_for_role("첨부")
    if ref_topics:
        area_texts["sidebar"] = _get_definitions(content)
    conclusion_topics = get_topics_for_role("결론")
    if conclusion_topics:
        area_texts["footer"] = _get_conclusion(content)

    has_image_areas = set()
    if images:
        core_topic_ids = {t["id"] for t in core_topics}
        if any(img.get("topic_id") in core_topic_ids for img in images):
            has_image_areas.add("body_core")

    # First generation pass.
    logger.info("[검증 루프] 1차 생성 시작")
    generated = await generate_slide_html(
        content=content, analysis=analysis,
        container_specs=container_specs, preset=preset, images=images,
    )
    generated = validate_and_clean_html(generated)

    # Verify / regenerate loop.
    for attempt in range(max_retries + 1):
        verification = verify_all_areas(generated, area_texts, has_image_areas)

        failed_areas = {name: r for name, r in verification.items() if not r.passed}

        if not failed_areas:
            logger.info("[검증 루프] 전체 PASS (시도 %d회)", attempt + 1)
            return generated, verification

        if attempt >= max_retries:
            logger.warning(
                "[검증 루프] %d회 재시도 후에도 실패: %s",
                max_retries, ", ".join(failed_areas),
            )
            return generated, verification

        logger.info(
            "[검증 루프] 시도 %d: 실패 영역 재생성 — %s",
            attempt + 1, ", ".join(failed_areas),
        )

        body_regenerated = False
        fragment_replaced = False
        for area_name, result in failed_areas.items():
            is_body_area = area_name in ("body_bg", "body_core")
            # body_html holds both body areas, so one regeneration refreshes
            # both.  Only the second body area is skipped; sidebar/footer
            # failures are still handled in this round (the previous `break`
            # skipped them too).
            if is_body_area and body_regenerated:
                continue

            new_html = await regenerate_area(
                area_name=area_name,
                errors=result.errors,
                content=content,
                analysis=analysis,
                container_specs=container_specs,
                preset=preset,
                images=images,
            )
            if not new_html:
                continue

            if is_body_area:
                # Replacing one body area inside the combined body_html is
                # complex, so the slide is regenerated and its body_html is
                # taken over.
                # NOTE(review): new_html is only used as a go-ahead signal
                # here and its content is discarded — confirm that the extra
                # regenerate_area() call is intended.
                regenerated = await generate_slide_html(
                    content=content, analysis=analysis,
                    container_specs=container_specs, preset=preset,
                    images=images,
                )
                regenerated = validate_and_clean_html(regenerated)
                generated["body_html"] = regenerated.get("body_html", generated["body_html"])
                body_regenerated = True
            elif area_name == "sidebar":
                generated["sidebar_html"] = new_html
                fragment_replaced = True
            elif area_name == "footer":
                generated["footer_html"] = new_html
                fragment_replaced = True

        if fragment_replaced:
            # Fragments returned by regenerate_area() were previously inserted
            # without the sanitising step applied to the first pass.
            # NOTE(review): assumes validate_and_clean_html() is idempotent on
            # HTML it has already cleaned — confirm.
            generated = validate_and_clean_html(generated)

    return generated, verification