v8:문서유형 분석등록 및 추출_20260206

2026-02-20 11:46:52 +09:00
parent db6532b33c
commit c3e9e29205
57 changed files with 22138 additions and 1421 deletions
--- a/handlers/semantic_mapper.py
+++ b/handlers/semantic_mapper.py
@@ -0,0 +1,382 @@
+# -*- coding: utf-8 -*-
+"""
+Semantic Mapper v1.0
+
+HWPX tools 추출 결과(template_info)에서 각 요소의 "의미"를 판별.
+
+역할:
+  - 표 분류: 헤더표 / 푸터표 / 제목블록 / 데이터표
+  - 섹션 감지: 본문 텍스트에서 섹션 패턴 탐색
+  - 스타일 매핑 준비: charPr→HTML태그, borderFill→CSS클래스 (Phase 2에서 구현)
+
+입력: template_info (DocTemplateAnalyzer.analyze()), parsed (HWPX 파싱 결과)
+출력: semantic_map dict → semantic_map.json으로 저장
+
+★ 위치: template_manager.py, doc_template_analyzer.py 와 같은 디렉토리
+★ 호출: template_manager.extract_and_save() 내에서 analyze() 직후
+"""
+
+import re
+
+
+# ================================================================
+#  메인 엔트리포인트
+# ================================================================
+
+def generate(template_info: dict, parsed: dict) -> dict:
+    """Build the semantic map by running every classification step.
+
+    Args:
+        template_info: Analyzer result. The keys read here are "tables",
+            "header" and "footer" (all optional); the whole dict is also
+            handed to the style-mapping step.
+        parsed: Parser result, passed through to section detection.
+
+    Returns:
+        A dict with these keys:
+            "version":        always "1.0"
+            "table_roles":    table index -> role info dict
+            "body_tables":    sorted indexes of tables with role "data_table"
+            "title_table":    index of the first "title_block" table, or
+                              None when there is none
+            "sections":       list of detected section headings
+            "style_mappings": style-mapping structure
+    """
+    # (1) Decide the role of every table.
+    table_roles = _classify_tables(
+        template_info.get("tables", []),
+        template_info.get("header"),
+        template_info.get("footer"),
+    )
+
+    # (2) Pick out the body (data) tables and the title block.
+    data_indexes = [
+        idx for idx, info in table_roles.items()
+        if info["role"] == "data_table"
+    ]
+    data_indexes.sort()
+    title_indexes = [
+        idx for idx, info in table_roles.items()
+        if info["role"] == "title_block"
+    ]
+    first_title = title_indexes[0] if title_indexes else None
+
+    # (3) Section headings, then (4) style mappings.
+    detected_sections = _detect_sections(parsed)
+    mappings = _prepare_style_mappings(template_info)
+
+    result = {"version": "1.0"}
+    result["table_roles"] = table_roles
+    result["body_tables"] = data_indexes
+    result["title_table"] = first_title
+    result["sections"] = detected_sections
+    result["style_mappings"] = mappings
+    return result
+
+
+# ================================================================
+#  표 분류
+# ================================================================
+
+def _classify_tables(tables: list, header: dict | None,
+                     footer: dict | None) -> dict:
+    """Assign a role to every table.
+
+    Roles: header_table / footer_table / title_block / data_table.
+
+    Classification order:
+      Pass 1 - header/footer text matching. A table matches when at least
+               half of its distinct non-blank cell texts also occur in the
+               header (checked first) or footer table.
+      Pass 2 - title-block pattern (see _is_title_block).
+      Pass 3 - everything still unclassified becomes a data table.
+
+    Args:
+        tables: Table dicts; each must carry an "index" key.
+        header: Header info, or None.
+        footer: Footer info, or None.
+
+    Returns:
+        Dict mapping table index -> role info dict. Tables that share no
+        text with header/footer and have no cell text at all can still be
+        classified in pass 2 or 3.
+    """
+    # Header comes first so a table matching both is reported as a header.
+    hf_sources = (
+        ("header", _collect_hf_texts(header)),
+        ("footer", _collect_hf_texts(footer)),
+    )
+
+    roles = {}
+    classified = set()
+
+    # -- Pass 1: header/footer matching --
+    for tbl in tables:
+        idx = tbl["index"]
+        tbl_texts = _collect_table_texts(tbl)
+        if not tbl_texts:
+            continue
+
+        for source, hf_texts in hf_sources:
+            matched = tbl_texts & hf_texts
+            # tbl_texts is non-empty here, so the division is safe.
+            if matched and len(matched) / len(tbl_texts) >= 0.5:
+                roles[idx] = {
+                    "role": f"{source}_table",
+                    "match_source": source,
+                    # Sorted so the serialized map is identical across
+                    # runs; plain set iteration order is not stable.
+                    "matched_texts": sorted(matched),
+                }
+                classified.add(idx)
+                break
+
+    # -- Pass 2: title-block detection --
+    for tbl in tables:
+        idx = tbl["index"]
+        if idx in classified:
+            continue
+
+        if _is_title_block(tbl):
+            roles[idx] = {
+                "role": "title_block",
+                "title_text": _extract_longest_text(tbl),
+            }
+            classified.add(idx)
+
+    # -- Pass 3: the rest are data tables --
+    for tbl in tables:
+        idx = tbl["index"]
+        if idx in classified:
+            continue
+
+        col_headers = _detect_table_headers(tbl)
+        roles[idx] = {
+            "role": "data_table",
+            # Row 0 is the header row only when column headers were found.
+            "header_row": 0 if col_headers else None,
+            "col_headers": col_headers,
+            "row_count": tbl.get("rowCnt", 0),
+            "col_count": tbl.get("colCnt", 0),
+        }
+
+    return roles
+
+
+# ── 표 분류 보조 함수 ──
+
+def _collect_hf_texts(hf_info: dict | None) -> set:
+    """Gather the distinct non-blank cell texts of a header/footer table.
+
+    Args:
+        hf_info: Header or footer info, or None. Only its "table" entry
+            (a dict with a "rows" list of cell-dict lists) is read.
+
+    Returns:
+        Set of whitespace-stripped cell texts; empty when hf_info is
+        missing or has no table.
+    """
+    table = hf_info.get("table") if hf_info else None
+    if not table:
+        return set()
+    stripped = (
+        cell.get("text", "").strip()
+        for row in table.get("rows", [])
+        for cell in row
+    )
+    return {text for text in stripped if text}
+
+
+def _collect_table_texts(tbl: dict) -> set:
+    """Gather the distinct non-blank cell texts of a table.
+
+    Args:
+        tbl: Table dict; its "rows" entry is a list of cell-dict lists.
+
+    Returns:
+        Set of whitespace-stripped cell texts (blank cells are skipped).
+    """
+    stripped = (
+        cell.get("text", "").strip()
+        for row in tbl.get("rows", [])
+        for cell in row
+    )
+    return {text for text in stripped if text}
+
+
+def _extract_longest_text(tbl: dict) -> str:
+    """Return the longest stripped cell text in a table (title blocks).
+
+    When several cells tie for the longest text, the first one in row
+    order wins. Returns "" when the table has no cells.
+    """
+    stripped = (
+        cell.get("text", "").strip()
+        for row in tbl.get("rows", [])
+        for cell in row
+    )
+    # max() keeps the first of equally long items, like a strict ">" scan.
+    return max(stripped, key=len, default="")
+
+
+def _is_title_block(tbl: dict) -> bool:
+    """Tell whether a table looks like a title block.
+
+    A table qualifies only when "rowCnt" is 1 and one of these holds:
+      A) "colCnt" is 2 and the first entry of "colWidths_pct" is <= 10
+         (narrow bullet-icon column followed by the title column).
+      B) "colCnt" is 1 and the single cell's text, after trimming
+         surrounding whitespace, is longer than 5 and shorter than 100
+         characters (stand-alone title).
+
+    Args:
+        tbl: Table dict; reads "rowCnt", "colCnt", "colWidths_pct", "rows".
+
+    Returns:
+        True when the table matches pattern A or B, otherwise False.
+    """
+    if tbl.get("rowCnt", 0) != 1:
+        return False
+
+    col_cnt = tbl.get("colCnt", 0)
+
+    # Pattern A: narrow left column + wide right column.
+    if col_cnt == 2:
+        col_pcts = tbl.get("colWidths_pct", [])
+        if len(col_pcts) >= 2 and col_pcts[0] <= 10:
+            return True
+
+    # Pattern B: a single cell holding the title.
+    if col_cnt == 1:
+        rows = tbl.get("rows", [])
+        if rows and rows[0]:
+            # Strip first, as the other table helpers do, so padding or a
+            # whitespace-only cell cannot pass the length test; a None
+            # text is treated as empty.
+            text = (rows[0][0].get("text") or "").strip()
+            if 5 < len(text) < 100:
+                return True
+
+    return False
+
+
+def _detect_table_headers(tbl: dict) -> list:
+    """Return the first row's texts when that row looks like a header row.
+
+    The first row counts as a header row when the table has at least two
+    rows, at least one first-row cell has text, and every non-blank text
+    in it is at most 20 characters long.
+
+    Returns:
+        The stripped texts of all first-row cells (blank cells included as
+        ""), or [] when the first row is not a header row.
+    """
+    rows = tbl.get("rows", [])
+    has_body_row = bool(rows) and len(rows) >= 2
+    if not has_body_row:
+        return []
+
+    headers = [cell.get("text", "").strip() for cell in rows[0]]
+    filled = [label for label in headers if label]
+
+    # At least one label is required, and none may be longer than 20.
+    if filled and all(len(label) <= 20 for label in filled):
+        return headers
+    return []
+
+
+# ================================================================
+#  섹션 감지
+# ================================================================
+
+# Section-heading patterns as (regex, pattern_type) pairs. _detect_sections
+# tries them in this order with re.match and stops at the first hit.
+_SECTION_PATTERNS = [
+    (r'^(\d+)\.\s+(.+)',                  "numbered"),        # digits, a dot, whitespace, then text, e.g. "1. 개요" ("1. Overview")
+    (r'^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ][\.\s]+(.+)',      "roman"),           # one Roman-numeral character, then dots/whitespace, e.g. "Ⅰ. 개요"
+    (r'^제\s*(\d+)\s*([장절항])\s*(.+)',    "korean_formal"),   # "제" + number + one of 장/절/항, e.g. "제1장 개요"
+    (r'^[▶►▸●◆■□◎★☆]\s*(.+)',             "bullet_heading"),  # a leading bullet symbol, e.g. "▶ 개요"
+]
+
+
+def _detect_sections(parsed: dict) -> list:
+    """Scan the paragraph texts for section-heading patterns.
+
+    Each paragraph is stripped and tested against _SECTION_PATTERNS in
+    order; the first pattern that matches decides its type.
+
+    Returns:
+        One dict per detected heading, numbered from 1 in document order:
+        [
+            {"index": 1, "title": "▶ 개요", "pattern_type": "bullet_heading"},
+            {"index": 2, "title": "▶ 발표 구성(안)", "pattern_type": "bullet_heading"},
+            ...
+        ]
+    """
+    found = []
+
+    for raw in _extract_paragraphs(parsed):
+        candidate = raw.strip()
+        # Blank text and text longer than 100 chars is never a heading.
+        if not candidate or len(candidate) > 100:
+            continue
+
+        for pattern, kind in _SECTION_PATTERNS:
+            hit = re.match(pattern, candidate)
+            if hit is None:
+                continue
+            # A leading number of 100 or more is not a section number
+            # (years and the like), so fall through to the next pattern.
+            if kind == "numbered" and int(hit.group(1)) > 99:
+                continue
+            found.append({
+                "index": len(found) + 1,
+                "title": candidate,
+                "pattern_type": kind,
+            })
+            break
+
+    return found
+
+
+def _extract_paragraphs(parsed: dict) -> list:
+    """Return the document's paragraph texts as a list of strings.
+
+    Sources, in priority order:
+      1. parsed["paragraphs"], when non-empty. Dict items contribute their
+         "text" value ("" when the key is absent); any other item is
+         converted with str().
+      2. The text of <hp:t> elements found by regex in one section XML
+         string: the first string value in parsed["raw_xml"] whose key
+         contains "section" (case-insensitive), otherwise
+         parsed["section_xml"].
+
+    In the XML fallback each text has its XML entities decoded, is
+    stripped, and is dropped when blank. Only <hp:t> elements without
+    attributes and without child elements are picked up by the regex.
+
+    Returns:
+        List of texts; [] when neither source yields anything.
+    """
+    # Function-scope import so the module's import block stays untouched.
+    from html import unescape
+
+    paragraphs = parsed.get("paragraphs", [])
+    if paragraphs:
+        return [
+            p.get("text", "") if isinstance(p, dict) else str(p)
+            for p in paragraphs
+        ]
+
+    # raw_xml may be absent, None or some other non-dict value.
+    raw_xml = parsed.get("raw_xml")
+    if not isinstance(raw_xml, dict):
+        raw_xml = {}
+
+    # Only the first matching section is used.
+    section_xml = next(
+        (val for key, val in raw_xml.items()
+         if isinstance(key, str) and "section" in key.lower()
+         and isinstance(val, str)),
+        "",
+    )
+    if not section_xml:
+        section_xml = parsed.get("section_xml", "")
+    if not section_xml:
+        return []
+
+    # The regex captures raw XML character data, so "&amp;" and friends
+    # must be decoded to get the text the document actually shows.
+    decoded = (
+        unescape(raw).strip()
+        for raw in re.findall(r'<hp:t>([^<]+)</hp:t>', section_xml)
+    )
+    return [text for text in decoded if text]
+
+
+# ================================================================
+#  스타일 매핑 (Phase 2에서 확장)
+# ================================================================
+
+def _prepare_style_mappings(template_info: dict) -> dict:
+    """Create the style-mapping structure, pre-filling border-fill entries.
+
+    "char_pr" and "para_pr" are returned empty. "border_fill" gets one
+    entry per item of template_info["border_fills"], keyed by str(id):
+      - "css_class": "bf-<id>"
+      - "bg":        the "background" value, else "bg", else ""
+      - "borders":   CSS border properties. Taken from the entry's "css"
+                     dict when it is non-empty (only "border-*" properties
+                     whose value is truthy and not "none"); otherwise
+                     built from the per-side dicts "top", "bottom",
+                     "left", "right" whose "type" is not NONE.
+
+    Returns:
+        {"char_pr": {}, "border_fill": {...}, "para_pr": {}}
+    """
+    border_fill_map = {}
+
+    for bf_id, bf_data in template_info.get("border_fills", {}).items():
+        # Key names vary between producers: "background" wins over "bg".
+        background = bf_data.get("background", bf_data.get("bg", ""))
+
+        css_props = bf_data.get("css", {})
+        if css_props:
+            # Ready-made CSS: keep only the visible border properties.
+            border_css = {
+                prop: val
+                for prop, val in css_props.items()
+                if prop.startswith("border-") and val and val != "none"
+            }
+        else:
+            # Fallback: assemble CSS from the per-side descriptions.
+            border_css = {}
+            for side in ("top", "bottom", "left", "right"):
+                side_info = bf_data.get(side, {})
+                if not isinstance(side_info, dict):
+                    continue
+                if side_info.get("type", "NONE").upper() == "NONE":
+                    continue
+                width = side_info.get('width', '0.1mm')
+                line_type = side_info.get('type', 'solid').lower()
+                color = side_info.get('color', '#000')
+                border_css[f"border-{side}"] = f"{width} {line_type} {color}"
+
+        border_fill_map[str(bf_id)] = {
+            "css_class": f"bf-{bf_id}",
+            "bg": background,
+            "borders": border_css,
+        }
+
+    return {
+        "char_pr": {},
+        "border_fill": border_fill_map,
+        "para_pr": {},
+    }