v8:문서유형 분석등록 및 추출_20260206

2026-02-20 11:46:52 +09:00
parent db6532b33c
commit c3e9e29205
57 changed files with 22138 additions and 1421 deletions
--- a/handlers/tools/header_footer.py
+++ b/handlers/tools/header_footer.py
@@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+"""
+§8 머리말/꼬리말(HeaderFooter) 추출
+
+HWPX 실제 태그 (section0.xml):
+  <hp:headerFooter ...>
+    <!-- 내용은 section XML 내 또는 별도 header/footer 영역 -->
+  </hp:headerFooter>
+
+  머리말/꼬리말 안에 표가 있는 경우:
+  - 표의 셀에 다중행 텍스트가 포함될 수 있음
+  - 각 셀의 colSpan, rowSpan, width, borderFillIDRef 등 추출 필요
+
+secPr 내 속성:
+  <hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
+
+디폴트값 생성 안 함.
+"""
+
+import re
+
+from domain.hwpx.hwpx_utils import hwpunit_to_mm
+
+
+def extract_header(raw_xml: dict, parsed: dict = None) -> dict | None:
+    """Extract the page header structure.
+
+    Thin wrapper that delegates to ``_extract_hf`` with ``hf_type="header"``.
+
+    Args:
+        raw_xml: Passed through to ``_extract_hf`` unchanged.
+        parsed: Optional pre-parsed data, passed through unchanged.
+
+    Returns:
+        Whatever ``_extract_hf`` returns for the header: ``None`` when no
+        header content is found, otherwise a dict such as::
+
+            {
+                "exists": True,
+                "type": "table" | "text",
+                "hidden": False,        # only when a hideFirstHeader flag is found
+                "table": { ... },       # only when the header contains a table
+                "texts": ["...", ...],  # only when non-blank text is found
+            }
+
+        Optional keys are left out entirely rather than filled with defaults.
+    """
+    return _extract_hf(raw_xml, parsed, hf_type="header")
+
+
+def extract_footer(raw_xml: dict, parsed: dict = None) -> dict | None:
+    """Extract the page footer structure.
+
+    Thin wrapper that delegates to ``_extract_hf`` with ``hf_type="footer"``.
+
+    Args:
+        raw_xml: Passed through to ``_extract_hf`` unchanged.
+        parsed: Optional pre-parsed data, passed through unchanged.
+
+    Returns:
+        Whatever ``_extract_hf`` returns for the footer: ``None`` when no
+        footer content is found, otherwise the extracted structure dict.
+    """
+    return _extract_hf(raw_xml, parsed, hf_type="footer")
+
+
+def _extract_hf(raw_xml: dict, parsed: dict, hf_type: str) -> dict | None:
+    """Shared extraction logic for a header or a footer.
+
+    The content is taken from ``parsed["page_<hf_type>_xml"]`` when that value
+    is non-blank. Otherwise it is taken from the first ``<hp:headerFooter>``
+    block of the section XML whose ``type`` attribute equals ``hf_type``
+    (compared case-insensitively).
+
+    Args:
+        raw_xml: Raw XML input, forwarded to ``_get_section_xml``.
+        parsed: Optional pre-parsed data; the keys ``page_header_xml`` /
+            ``page_footer_xml`` are read here. May be ``None``.
+        hf_type: ``"header"`` or ``"footer"``.
+
+    Returns:
+        ``None`` when no non-blank header/footer content is found. Otherwise
+        a dict with ``"exists": True`` and ``"type"`` (``"table"`` or
+        ``"text"``), plus ``"hidden"``, ``"texts"`` and ``"table"`` only when
+        the corresponding data is present (no default values are generated).
+    """
+    # 1) header/footer XML handed over directly through ``parsed``
+    hf_xml = parsed.get(f"page_{hf_type}_xml", "") if parsed else None
+
+    # 2) otherwise search the section XML for a matching headerFooter block
+    section_xml = _get_section_xml(raw_xml, parsed)
+
+    # A whitespace-only value from ``parsed`` carries no content, so it must
+    # not suppress the fallback to the section XML.
+    if section_xml and not (hf_xml and hf_xml.strip()):
+        wanted = hf_type.upper()
+        for block in re.finditer(
+            r'<hp:headerFooter\b([^>]*)>(.*?)</hp:headerFooter>',
+            section_xml, re.DOTALL
+        ):
+            # the ``type`` attribute tells header and footer apart
+            type_m = re.search(r'\btype="([^"]+)"', block.group(1))
+            if type_m and type_m.group(1).upper() == wanted:
+                hf_xml = block.group(2)
+                break
+
+    if not hf_xml or not hf_xml.strip():
+        return None  # this header/footer does not exist
+
+    result = {"exists": True}
+
+    # Hidden flag: first hideFirstHeader / hideFirstFooter attribute found
+    # anywhere in the section XML.
+    if section_xml:
+        hide_key = f"hideFirst{'Header' if hf_type == 'header' else 'Footer'}"
+        hide_m = re.search(rf'\b{hide_key}="(\d+)"', section_xml)
+        if hide_m:
+            result["hidden"] = bool(int(hide_m.group(1)))
+
+    # Text runs.
+    # NOTE(review): only <hp:t> elements without attributes or child elements
+    # are matched - confirm that is sufficient for real documents.
+    texts = re.findall(r'<hp:t>([^<]*)</hp:t>', hf_xml)
+    clean_texts = [t.strip() for t in texts if t.strip()]
+    if clean_texts:
+        result["texts"] = clean_texts
+
+    # A table inside the header/footer switches the type to "table".
+    tbl_match = re.search(
+        r'<hp:tbl\b([^>]*)>(.*?)</hp:tbl>',
+        hf_xml, re.DOTALL
+    )
+    if tbl_match:
+        result["type"] = "table"
+        result["table"] = _parse_hf_table(tbl_match.group(1), tbl_match.group(2))
+    else:
+        result["type"] = "text"
+
+    return result
+
+
+def _parse_hf_table(tbl_attrs: str, tbl_inner: str) -> dict:
+    """Parse a table found inside a header/footer.
+
+    Args:
+        tbl_attrs: Attribute text of the opening ``<hp:tbl>`` tag.
+        tbl_inner: Everything between ``<hp:tbl ...>`` and ``</hp:tbl>``.
+
+    Returns:
+        A dict holding only the keys whose data was found: ``rowCnt`` and
+        ``colCnt`` (ints), ``colWidths_hu`` (the integers listed in
+        ``<hp:widthList>``), ``colWidths_pct`` (each width as a rounded
+        percentage of the total) and ``rows`` (a list of rows, each a list
+        of cell dicts produced by ``_parse_hf_cell``).
+    """
+    parsed_table = {}
+
+    # Table dimensions, read from the tag attributes.
+    for name in ("rowCnt", "colCnt"):
+        found = re.search(rf'\b{name}="(\d+)"', tbl_attrs)
+        if found is not None:
+            parsed_table[name] = int(found.group(1))
+
+    # Column widths: whitespace-separated integers.
+    width_list = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
+    if width_list is not None:
+        try:
+            hu_widths = [int(tok) for tok in width_list.group(1).split()]
+        except ValueError:
+            # best effort: a non-numeric entry drops the width info entirely
+            hu_widths = None
+        if hu_widths is not None:
+            denom = sum(hu_widths) or 1  # avoid dividing by zero
+            parsed_table["colWidths_hu"] = hu_widths
+            parsed_table["colWidths_pct"] = [
+                round(value / denom * 100) for value in hu_widths
+            ]
+
+    # Rows and their cells.
+    grid = []
+    for row_xml in re.findall(r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL):
+        grid.append([
+            _parse_hf_cell(tc.group(1), tc.group(2))
+            for tc in re.finditer(
+                r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', row_xml, re.DOTALL
+            )
+        ])
+
+    if grid:
+        parsed_table["rows"] = grid
+
+    return parsed_table
+
+
+def _parse_hf_cell(tc_attrs: str, tc_inner: str) -> dict:
+    """Parse a single table cell of a header/footer.
+
+    Args:
+        tc_attrs: Attribute text of the opening ``<hp:tc>`` tag.
+        tc_inner: Everything between ``<hp:tc ...>`` and ``</hp:tc>``.
+
+    Returns:
+        A dict holding only the keys whose data was found:
+        ``borderFillIDRef``, ``colAddr`` / ``rowAddr`` (always both or
+        neither), ``colSpan``, ``rowSpan``, ``width_hu``, ``lines`` (one
+        string per non-empty paragraph) and ``text`` (the lines joined with
+        a single space).
+    """
+    cell = {}
+
+    # borderFillIDRef
+    bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
+    if bf:
+        cell["borderFillIDRef"] = int(bf.group(1))
+
+    # cellAddr: read each attribute independently because XML attribute order
+    # is not significant; the first tag carrying both values wins.
+    for addr_tag in re.finditer(r'<hp:cellAddr\b([^>]*)>', tc_inner):
+        col = re.search(r'\bcolAddr="(\d+)"', addr_tag.group(1))
+        row = re.search(r'\browAddr="(\d+)"', addr_tag.group(1))
+        if col and row:
+            cell["colAddr"] = int(col.group(1))
+            cell["rowAddr"] = int(row.group(1))
+            break
+
+    # cellSpan
+    span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
+    if span:
+        cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
+        rs = re.search(r'\browSpan="(\d+)"', span.group(1))
+        if cs:
+            cell["colSpan"] = int(cs.group(1))
+        if rs:
+            cell["rowSpan"] = int(rs.group(1))
+
+    # cellSz (only the width is extracted)
+    sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
+    if sz:
+        w = re.search(r'\bwidth="(\d+)"', sz.group(1))
+        if w:
+            cell["width_hu"] = int(w.group(1))
+
+    # Cell text, one line per paragraph (cells may hold several paragraphs).
+    # NOTE(review): only <hp:t> elements without attributes or child elements
+    # are matched - confirm that is sufficient for real documents.
+    lines = []
+    for para in re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL):
+        runs = re.findall(r'<hp:t>([^<]*)</hp:t>', para)
+        line = " ".join(t.strip() for t in runs if t.strip())
+        if line:
+            lines.append(line)
+
+    if lines:
+        cell["text"] = " ".join(lines)
+        cell["lines"] = lines
+
+    return cell
+
+
+def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
+    """Locate the section XML text to search.
+
+    Lookup order:
+      1. a truthy ``parsed["section_xml"]``;
+      2. when ``raw_xml`` is a dict, the first string value whose key
+         contains ``"section"`` (case-insensitive);
+      3. ``raw_xml`` itself when it is a string.
+
+    Returns:
+        The section XML string, or ``None`` when none of the above applies.
+    """
+    preset = parsed.get("section_xml") if parsed else None
+    if preset:
+        return preset
+
+    if isinstance(raw_xml, str):
+        return raw_xml
+    if not isinstance(raw_xml, dict):
+        return None
+
+    # first entry whose name mentions "section" and whose content is text
+    return next(
+        (
+            body
+            for name, body in raw_xml.items()
+            if "section" in name.lower() and isinstance(body, str)
+        ),
+        None,
+    )