# C.E.L_Slide_test2/src/frame_extractor.py

"""프레임별 텍스트 + 메타 추출기.

figma_to_html_agent/blocks/{frame_id}/texts.md를 파싱하여
TF-IDF 매칭용 데이터 구조를 만든다.

keywords 수동 생성 불필요 — texts.md의 원본 텍스트를 직접 사용.
"""
from __future__ import annotations

import logging
import re
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


# Lines shorter than this are treated as subtitles in column/other sections.
_SHORT_LINE_LIMIT = 20
# Row sections use a tighter cutoff for what counts as a subtitle.
_SHORT_ROW_LINE_LIMIT = 15


def _parse_sections(content: str) -> list[dict]:
    """Split texts.md content into sections delimited by ``## `` headings.

    Returns a list of ``{"heading": str, "lines": list[str]}`` dicts. Text that
    appears before the first ``## `` heading is collected under an empty
    heading.
    """
    sections: list[dict] = []
    heading = ""
    lines: list[str] = []

    for raw in content.split("\n"):
        line = raw.strip()
        if line.startswith("## "):
            # A new section begins: flush whatever was collected so far.
            if heading or lines:
                sections.append({"heading": heading, "lines": lines})
            # Slice off exactly one marker. str.lstrip("# ") would treat the
            # argument as a character set and also eat a leading '#' that
            # belongs to the heading text itself.
            heading = line[3:].strip()
            lines = []
        elif line.startswith("### "):
            # Sub-headings are kept as ordinary text lines of the section.
            lines.append(line[4:].strip())
        elif line.startswith(("# ", ">")):
            # Top-level title (frame id) and blockquote lines are skipped.
            continue
        elif line.startswith("- "):
            # Remove only the bullet marker; content such as "-5%" survives.
            lines.append(line[2:].strip())
        elif line and not line.startswith("-"):
            lines.append(line)
        # Anything else (blank lines, or lines starting with "-" that are not
        # "- " bullets, e.g. "---") is dropped.

    if heading or lines:
        sections.append({"heading": heading, "lines": lines})
    return sections


def extract_frame_meta(texts_md_path: Path) -> dict:
    """Extract frame metadata from a ``texts.md`` file.

    The file is split into ``## `` sections and every text line is classified
    as title, subtitle or body text based on keywords in the section heading
    and, where the heading is not decisive, on the line length.

    Args:
        texts_md_path: Path to the ``texts.md`` file (a ``str`` is accepted
            too). The name of its parent directory is used as the frame id.

    Returns:
        An empty dict when the path is not an existing file. Otherwise a dict
        with the keys:

        - ``frame_id``: name of the directory containing the file.
        - ``title_text``: joined lines of the last title section, stripped.
        - ``subtitle_texts``: list of short / header-like lines.
        - ``body_texts``: list of longer body lines.
        - ``all_text``: title + subtitles + bodies joined with spaces and
          passed through ``_preprocess_text`` (input for TF-IDF).
        - ``item_count``: number of subtitle lines.
        - ``rough_structure``: label returned by ``_guess_structure``.
        - ``sections``: the parsed ``{"heading", "lines"}`` dicts.
    """
    texts_md_path = Path(texts_md_path)
    # is_file() rather than exists(): a directory named texts.md cannot be read.
    if not texts_md_path.is_file():
        return {}

    content = texts_md_path.read_text(encoding="utf-8")
    frame_id = texts_md_path.parent.name

    sections = _parse_sections(content)

    title_text = ""
    subtitle_texts: list[str] = []
    body_texts: list[str] = []

    for sec in sections:
        heading = sec["heading"].lower()
        lines = sec["lines"]

        # The branch order matters: a heading matching several keyword groups
        # is handled by the first matching branch.
        if "타이틀" in heading or "제목" in heading:
            title_text = " ".join(lines)
            continue
        if "서브" in heading or "헤더" in heading or "카테고리" in heading:
            subtitle_texts.extend(lines)
            continue

        if "열" in heading or "col" in heading:
            limit = _SHORT_LINE_LIMIT
        elif "행" in heading or "row" in heading:
            limit = _SHORT_ROW_LINE_LIMIT
        elif "결론" in heading or "요약" in heading:
            # Conclusion / summary sections are body text regardless of length.
            body_texts.extend(lines)
            continue
        else:
            limit = _SHORT_LINE_LIMIT

        # Short lines count as subtitles, longer ones as body text.
        for line in lines:
            if len(line) < limit:
                subtitle_texts.append(line)
            else:
                body_texts.append(line)

    rough_structure = _guess_structure(sections, subtitle_texts)

    # Full text for TF-IDF, normalised by the preprocessing step.
    all_text = _preprocess_text(" ".join([title_text, *subtitle_texts, *body_texts]))

    return {
        "frame_id": frame_id,
        "title_text": title_text.strip(),
        "subtitle_texts": subtitle_texts,
        "body_texts": body_texts,
        "all_text": all_text,
        "item_count": len(subtitle_texts),
        "rough_structure": rough_structure,
        "sections": sections,
    }


def _guess_structure(sections: list[dict], subtitles: list[str]) -> str:
    """섹션 구조에서 대략적인 블록 유형 추정."""
    headings = [s["heading"].lower() for s in sections]
    heading_text = " ".join(headings)

    # 열 기반
    col_count = sum(1 for h in headings if "열" in h or "col" in h)
    if col_count >= 3:
        return "3col"
    if col_count >= 2:
        return "2col"

    # 행 기반
    row_count = sum(1 for h in headings if "행" in h or "row" in h)
    if row_count >= 2:
        return "rows"

    # 좌/우
    if any("좌" in h or "left" in h for h in headings):
        return "2col-compare"

    # 표
    if any("표" in h or "table" in h for h in headings):
        return "table"

    # 기본
    if len(subtitles) >= 3:
        return "list"

    return "unknown"


def _preprocess_text(text: str) -> str:
    """TF-IDF용 텍스트 전처리.

    - 표기 통일
    - 괄호/특수문자 정리
    - 중복 제거
    """
    # 표기 통일
    text = text.replace("S/W", "SW 소프트웨어")
    text = text.replace("H/W", "HW 하드웨어")
    text = re.sub(r'\bDX\b', 'DX 디지털전환', text)
    # [legacy Phase R'/Q example — INTEGRATION-AUDIT-01 §10.4]
    text = re.sub(r'\bBIM\b', 'BIM 건설정보모델링', text)

    # 괄호 내용 유지하되 괄호 제거
    text = text.replace("(", " ").replace(")", " ")
    text = text.replace("[", " ").replace("]", " ")

    # 특수문자 정리
    text = re.sub(r'[·•→←↔×+/]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def extract_all_frames(
    blocks_dir: str | Path = "figma_to_html_agent/blocks",
) -> list[dict]:
    """Collect frame metadata for every frame directory under ``blocks_dir``.

    Each sub-directory (visited in sorted order) that contains a ``texts.md``
    file is passed to ``extract_frame_meta``; empty results are skipped.

    Args:
        blocks_dir: Directory holding one sub-directory per frame.

    Returns:
        A list of metadata dicts as produced by ``extract_frame_meta``. The
        list is empty when ``blocks_dir`` does not exist (a warning is logged).
    """
    blocks_dir = Path(blocks_dir)
    if not blocks_dir.exists():
        logger.warning(f"[extractor] blocks 폴더 없음: {blocks_dir}")
        return []

    frames = []
    for entry in sorted(blocks_dir.iterdir()):
        # Only directories can be frame folders.
        if not entry.is_dir():
            continue

        candidate = entry / "texts.md"
        if not candidate.exists():
            continue

        meta = extract_frame_meta(candidate)
        if not meta:
            continue

        frames.append(meta)
        logger.debug(f"[extractor] {meta['frame_id']}: {meta['title_text']} ({meta['rough_structure']})")

    logger.info(f"[extractor] {len(frames)}개 프레임 추출 완료")
    return frames