# test/handlers/semantic_mapper.py

# -*- coding: utf-8 -*-
"""
Semantic Mapper v1.0

HWPX tools 추출 결과(template_info)에서 각 요소의 "의미"를 판별.

역할:
  - 표 분류: 헤더표 / 푸터표 / 제목블록 / 데이터표
  - 섹션 감지: 본문 텍스트에서 섹션 패턴 탐색
  - 스타일 매핑 준비: charPr→HTML태그, borderFill→CSS클래스 (Phase 2에서 구현)

입력: template_info (DocTemplateAnalyzer.analyze()), parsed (HWPX 파싱 결과)
출력: semantic_map dict → semantic_map.json으로 저장

★ 위치: template_manager.py, doc_template_analyzer.py 와 같은 디렉토리
★ 호출: template_manager.extract_and_save() 내에서 analyze() 직후
"""

import re


# ================================================================
#  메인 엔트리포인트
# ================================================================

def generate(template_info: dict, parsed: dict) -> dict:
    """Build the semantic map by combining every classification step.

    Args:
        template_info: Analyzer result; the keys read here are ``tables``,
            ``header`` and ``footer`` (the whole dict is also handed to the
            style-mapping step).
        parsed: HWPX parser result, forwarded to section detection.

    Returns:
        A dict with these keys:
            "version":        always "1.0"
            "table_roles":    role info per table, keyed by the table's index
            "body_tables":    sorted indexes of tables classified "data_table"
            "title_table":    index of the first "title_block" table, or None
            "sections":       detected section headings
            "style_mappings": style mapping structure
    """
    # Step 1: classify every table.
    table_roles = _classify_tables(
        template_info.get("tables", []),
        template_info.get("header"),
        template_info.get("footer"),
    )

    # Step 2: derive the body-table list and the title block from the roles.
    body_tables = [
        tbl_idx
        for tbl_idx, role_info in table_roles.items()
        if role_info["role"] == "data_table"
    ]
    body_tables.sort()

    title_candidates = [
        tbl_idx
        for tbl_idx, role_info in table_roles.items()
        if role_info["role"] == "title_block"
    ]
    title_table = title_candidates[0] if title_candidates else None

    # Step 3: section headings found in the parsed text.
    sections = _detect_sections(parsed)

    # Step 4: style mappings.
    style_mappings = _prepare_style_mappings(template_info)

    semantic_map = {"version": "1.0"}
    semantic_map["table_roles"] = table_roles
    semantic_map["body_tables"] = body_tables
    semantic_map["title_table"] = title_table
    semantic_map["sections"] = sections
    semantic_map["style_mappings"] = style_mappings
    return semantic_map


# ================================================================
#  표 분류
# ================================================================

def _classify_tables(tables: list, header: dict | None,
                     footer: dict | None) -> dict:
    """Assign a role to each table: header_table / footer_table / title_block / data_table.

    Classification runs in three passes over ``tables``:
      Pass 1 - header/footer text matching: a table matches when at least
               half of its distinct cell texts also occur in the header
               (checked first) or footer table. Tables without any text are
               left for the later passes.
      Pass 2 - title-block shape check on the tables still unclassified.
      Pass 3 - everything left becomes a data table.

    Args:
        tables: Table dicts; each must carry an ``index`` key.
        header: Header info dict, or None.
        footer: Footer info dict, or None.

    Returns:
        Dict mapping each table's ``index`` to its role info dict. The
        ``matched_texts`` list of header/footer entries is sorted so the
        result is identical from run to run.
    """
    # (role, match_source label, reference texts), tried in this order.
    hf_sources = (
        ("header_table", "header", _collect_hf_texts(header)),
        ("footer_table", "footer", _collect_hf_texts(footer)),
    )

    roles = {}
    classified = set()

    # -- Pass 1: header/footer matching --
    for tbl in tables:
        idx = tbl["index"]
        tbl_texts = _collect_table_texts(tbl)
        if not tbl_texts:
            continue

        for role, source, hf_texts in hf_sources:
            shared = tbl_texts & hf_texts
            if shared and len(shared) / len(tbl_texts) >= 0.5:
                roles[idx] = {
                    "role": role,
                    "match_source": source,
                    # Sets iterate in hash order, which varies between
                    # interpreter runs for strings; sort for stable output.
                    "matched_texts": sorted(shared),
                }
                classified.add(idx)
                break

    # -- Pass 2: title-block detection --
    for tbl in tables:
        idx = tbl["index"]
        if idx in classified:
            continue

        if _is_title_block(tbl):
            roles[idx] = {
                "role": "title_block",
                "title_text": _extract_longest_text(tbl),
            }
            classified.add(idx)

    # -- Pass 3: the rest are data tables --
    for tbl in tables:
        idx = tbl["index"]
        if idx in classified:
            continue

        col_headers = _detect_table_headers(tbl)
        roles[idx] = {
            "role": "data_table",
            # Row 0 is reported as the header row only when headers were found.
            "header_row": 0 if col_headers else None,
            "col_headers": col_headers,
            "row_count": tbl.get("rowCnt", 0),
            "col_count": tbl.get("colCnt", 0),
        }

    return roles


# ── 표 분류 보조 함수 ──

def _collect_hf_texts(hf_info: dict | None) -> set:
    """header/footer의 table 셀 텍스트 수집"""
    if not hf_info or not hf_info.get("table"):
        return set()
    texts = set()
    for row in hf_info["table"].get("rows", []):
        for cell in row:
            t = cell.get("text", "").strip()
            if t:
                texts.add(t)
    return texts


def _collect_table_texts(tbl: dict) -> set:
    """표의 모든 셀 텍스트 수집"""
    texts = set()
    for row in tbl.get("rows", []):
        for cell in row:
            t = cell.get("text", "").strip()
            if t:
                texts.add(t)
    return texts


def _extract_longest_text(tbl: dict) -> str:
    """표에서 가장 긴 텍스트 추출 (제목 블록용)"""
    longest = ""
    for row in tbl.get("rows", []):
        for cell in row:
            t = cell.get("text", "").strip()
            if len(t) > len(longest):
                longest = t
    return longest


def _is_title_block(tbl: dict) -> bool:
    """제목 블록 패턴 판별.

    조건 (하나라도 충족):
    A) 1행 2열, 왼쪽 열 비율 ≤ 10% (불릿아이콘 + 제목)
    B) 1행 1열, 텍스트 길이 5~100자 (제목 단독)
    """
    if tbl.get("rowCnt", 0) != 1:
        return False

    col_cnt = tbl.get("colCnt", 0)
    col_pcts = tbl.get("colWidths_pct", [])

    # 패턴 A: 좁은 왼쪽 + 넓은 오른쪽
    if col_cnt == 2 and len(col_pcts) >= 2:
        if col_pcts[0] <= 10:
            return True

    # 패턴 B: 단일 셀 제목
    if col_cnt == 1:
        rows = tbl.get("rows", [])
        if rows and rows[0]:
            text = rows[0][0].get("text", "")
            if 5 < len(text) < 100:
                return True

    return False


def _detect_table_headers(tbl: dict) -> list:
    """표 첫 행의 컬럼 헤더 텍스트 반환.

    헤더 판별: 첫 행의 모든 텍스트가 짧음 (20자 이하)
    """
    rows = tbl.get("rows", [])
    if not rows or len(rows) < 2:
        return []

    first_row = rows[0]
    headers = []
    for cell in first_row:
        t = cell.get("text", "").strip()
        headers.append(t)

    # 전부 짧은 텍스트이면 헤더행
    if headers and all(len(h) <= 20 for h in headers if h):
        non_empty = [h for h in headers if h]
        if non_empty:  # 최소 1개는 텍스트가 있어야
            return headers

    return []


# ================================================================
#  섹션 감지
# ================================================================

# Section-heading patterns as (regex, pattern_type label) pairs.
# _detect_sections tries them in list order against each paragraph and
# stops at the first one it accepts.
_SECTION_PATTERNS = [
    (r'^(\d+)\.\s+(.+)',                  "numbered"),        # number, dot, whitespace: "1. Overview"
    (r'^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ][\.\s]+(.+)',      "roman"),           # one Roman-numeral character: "Ⅰ. Overview"
    (r'^제\s*(\d+)\s*([장절항])\s*(.+)',    "korean_formal"),   # Korean formal numbering: "제1장 ..."
    (r'^[▶►▸●◆■□◎★☆]\s*(.+)',             "bullet_heading"),  # leading bullet symbol: "▶ Overview"
]


def _detect_sections(parsed: dict) -> list:
    """Scan the parsed paragraphs for section-heading patterns.

    Each paragraph is whitespace-trimmed; blank paragraphs and those longer
    than 100 characters are ignored (too long to be a heading). The first
    accepted pattern from ``_SECTION_PATTERNS`` decides the pattern type.

    Returns:
        One dict per heading, numbered from 1 in order of appearance, e.g.
        [
            {"index": 1, "title": "1. Overview", "pattern_type": "numbered"},
            ...
        ]
    """
    found = []

    for raw in _extract_paragraphs(parsed):
        candidate = raw.strip()
        if candidate == "" or len(candidate) > 100:
            continue

        for regex, kind in _SECTION_PATTERNS:
            hit = re.match(regex, candidate)
            if hit is None:
                continue
            # A leading number above 99 is not treated as a section number
            # (e.g. a year); move on to the remaining patterns.
            if kind == "numbered" and int(hit.group(1)) > 99:
                continue
            found.append({
                "index": len(found) + 1,
                "title": candidate,
                "pattern_type": kind,
            })
            break

    return found


def _extract_paragraphs(parsed: dict) -> list:
    """parsed에서 텍스트 단락 추출.

    우선순위:
    1. parsed["paragraphs"] (파서가 직접 제공)
    2. section_xml의 <hp:t> 태그에서 추출
    """
    paragraphs = parsed.get("paragraphs", [])
    if paragraphs:
        return [
            p.get("text", "") if isinstance(p, dict) else str(p)
            for p in paragraphs
        ]

    # section_xml에서 <hp:t> 추출
    section_xml = ""
    raw_xml = parsed.get("raw_xml", {})
    for key, val in raw_xml.items():
        if "section" in key.lower() and isinstance(val, str):
            section_xml = val
            break

    if not section_xml:
        section_xml = parsed.get("section_xml", "")

    if section_xml:
        return [
            t.strip()
            for t in re.findall(r'<hp:t>([^<]+)</hp:t>', section_xml)
            if t.strip()
        ]

    return []


# ================================================================
#  스타일 매핑 (Phase 2에서 확장)
# ================================================================

def _prepare_style_mappings(template_info: dict) -> dict:
    """스타일 매핑 빈 구조 생성.

    Phase 2에서 이 구조를 채움:
    - char_styles → CSS font/color rules
    - border_fills → CSS border/background rules
    - para_styles → CSS margin/alignment rules
    """
    mappings = {
        "char_pr": {},
        "border_fill": {},
        "para_pr": {},
    }

    # border_fills가 있으면 기본 매핑 생성
    border_fills = template_info.get("border_fills", {})
    for bf_id, bf_data in border_fills.items():
        # ★ 실제 키 구조 대응 (bg→background, sides→css/직접키)
        bg = bf_data.get("background", bf_data.get("bg", ""))

        # borders: css dict 또는 직접 키에서 추출
        borders = {}
        css_dict = bf_data.get("css", {})
        if css_dict:
            for prop, val in css_dict.items():
                if prop.startswith("border-") and val and val != "none":
                    borders[prop] = val
        else:
            # fallback: 직접 side 키
            for side in ("top", "bottom", "left", "right"):
                si = bf_data.get(side, {})
                if isinstance(si, dict) and si.get("type", "NONE").upper() != "NONE":
                    borders[f"border-{side}"] = (
                        f"{si.get('width','0.1mm')} "
                        f"{si.get('type','solid').lower()} "
                        f"{si.get('color','#000')}"
                    )

        mappings["border_fill"][str(bf_id)] = {
            "css_class": f"bf-{bf_id}",
            "bg": bg,
            "borders": borders,
        }

    return mappings