# test/handlers/tools/header_footer.py

# -*- coding: utf-8 -*-
"""
§8 머리말/꼬리말(HeaderFooter) 추출

HWPX 실제 태그 (section0.xml):
  <hp:headerFooter ...>
    <!-- 내용은 section XML 내 또는 별도 header/footer 영역 -->
  </hp:headerFooter>

  머리말/꼬리말 안에 표가 있는 경우:
  - 표의 셀에 다중행 텍스트가 포함될 수 있음
  - 각 셀의 colSpan, rowSpan, width, borderFillIDRef 등 추출 필요

secPr 내 속성:
  <hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>

디폴트값 생성 안 함.
"""

import re

from domain.hwpx.hwpx_utils import hwpunit_to_mm


def extract_header(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page header structure.

    Delegates to ``_extract_hf`` with the ``"header"`` kind.

    Args:
        raw_xml: Raw XML input forwarded unchanged to ``_extract_hf``.
        parsed: Optional pre-parsed data forwarded unchanged to
            ``_extract_hf``.

    Returns:
        ``None`` when no header content is found. Otherwise a dict such as::

            {
                "exists": True,
                "type": "table" | "text",
                "hidden": bool,          # only if hideFirstHeader was found
                "table": { ... },        # only when a table is present
                "texts": ["...", ...],   # only when non-blank text exists
            }
    """
    return _extract_hf(raw_xml, parsed, hf_type="header")


def extract_footer(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the page footer structure.

    Delegates to ``_extract_hf`` with the ``"footer"`` kind; arguments are
    forwarded unchanged and the result is returned as-is (``None`` when no
    footer content is found).
    """
    return _extract_hf(raw_xml, parsed, hf_type="footer")


def _extract_hf(raw_xml: dict, parsed: dict, hf_type: str) -> dict | None:
    """Shared extraction logic for headers and footers.

    Args:
        raw_xml: Raw XML input, passed to ``_get_section_xml``.
        parsed: Optional pre-parsed data. Its ``page_<hf_type>_xml`` entry,
            when non-empty, is used as the header/footer XML.
        hf_type: ``"header"`` or ``"footer"``.

    Returns:
        ``None`` when no (non-blank) header/footer XML is available.
        Otherwise a dict with ``"exists"``, ``"type"`` (``"table"`` or
        ``"text"``) and, only when found, ``"hidden"``, ``"texts"`` and
        ``"table"``.
    """
    # Source 1: header/footer XML supplied directly through `parsed`.
    block_xml = parsed.get(f"page_{hf_type}_xml", "") if parsed else None

    # Source 2: a headerFooter block inside the section XML.
    section_xml = _get_section_xml(raw_xml, parsed)

    if section_xml and not block_xml:
        wanted = hf_type.upper()
        candidates = re.findall(
            r'<hp:headerFooter\b([^>]*)>(.*?)</hp:headerFooter>',
            section_xml, re.DOTALL
        )
        for attr_text, body in candidates:
            # The type attribute (HEADER / FOOTER) tells the two apart.
            kind = re.search(r'\btype="([^"]+)"', attr_text)
            if kind is not None and kind.group(1).upper() == wanted:
                block_xml = body
                break

    if not (block_xml and block_xml.strip()):
        return None  # no such header/footer

    summary = {"exists": True}

    # "Hide on first page" flag, read from the section XML when available.
    if section_xml:
        hide_attr = f"hideFirst{'Header' if hf_type == 'header' else 'Footer'}"
        flag = re.search(rf'\b{hide_attr}="(\d+)"', section_xml)
        if flag is not None:
            summary["hidden"] = int(flag.group(1)) != 0

    # Plain text runs, with blank entries dropped.
    stripped = (t.strip() for t in re.findall(r'<hp:t>([^<]*)</hp:t>', block_xml))
    found_texts = [t for t in stripped if t]
    if found_texts:
        summary["texts"] = found_texts

    # The first table (if any) decides the reported type.
    table_m = re.search(
        r'<hp:tbl\b([^>]*)>(.*?)</hp:tbl>',
        block_xml, re.DOTALL
    )
    if table_m is None:
        summary["type"] = "text"
        return summary

    summary["type"] = "table"
    summary["table"] = _parse_hf_table(*table_m.groups())
    return summary


def _parse_hf_table(tbl_attrs: str, tbl_inner: str) -> dict:
    """머리말/꼬리말 내 표 파싱"""
    table = {}

    # rowCnt, colCnt
    for attr in ["rowCnt", "colCnt"]:
        m = re.search(rf'\b{attr}="(\d+)"', tbl_attrs)
        if m:
            table[attr] = int(m.group(1))

    # 열 너비
    wl = re.search(r'<hp:widthList>([^<]+)</hp:widthList>', tbl_inner)
    if wl:
        try:
            widths = [int(w) for w in wl.group(1).strip().split()]
            table["colWidths_hu"] = widths
            total = sum(widths) or 1
            table["colWidths_pct"] = [round(w / total * 100) for w in widths]
        except ValueError:
            pass

    # 행/셀
    rows = []
    tr_blocks = re.findall(r'<hp:tr\b[^>]*>(.*?)</hp:tr>', tbl_inner, re.DOTALL)
    for tr in tr_blocks:
        cells = []
        tc_blocks = re.finditer(
            r'<hp:tc\b([^>]*)>(.*?)</hp:tc>', tr, re.DOTALL
        )
        for tc in tc_blocks:
            cell = _parse_hf_cell(tc.group(1), tc.group(2))
            cells.append(cell)
        rows.append(cells)

    if rows:
        table["rows"] = rows

    return table


def _parse_hf_cell(tc_attrs: str, tc_inner: str) -> dict:
    """머리말/꼬리말 셀 파싱"""
    cell = {}

    # borderFillIDRef
    bf = re.search(r'\bborderFillIDRef="(\d+)"', tc_attrs)
    if bf:
        cell["borderFillIDRef"] = int(bf.group(1))

    # cellAddr
    addr = re.search(
        r'<hp:cellAddr\b[^>]*\bcolAddr="(\d+)"[^>]*\browAddr="(\d+)"',
        tc_inner
    )
    if addr:
        cell["colAddr"] = int(addr.group(1))
        cell["rowAddr"] = int(addr.group(2))

    # cellSpan
    span = re.search(r'<hp:cellSpan\b([^/]*)/?>', tc_inner)
    if span:
        cs = re.search(r'\bcolSpan="(\d+)"', span.group(1))
        rs = re.search(r'\browSpan="(\d+)"', span.group(1))
        if cs:
            cell["colSpan"] = int(cs.group(1))
        if rs:
            cell["rowSpan"] = int(rs.group(1))

    # cellSz
    sz = re.search(r'<hp:cellSz\b([^/]*)/?>', tc_inner)
    if sz:
        w = re.search(r'\bwidth="(\d+)"', sz.group(1))
        if w:
            cell["width_hu"] = int(w.group(1))

    # 셀 텍스트 (다중행)
    paras = re.findall(r'<hp:p\b[^>]*>(.*?)</hp:p>', tc_inner, re.DOTALL)
    lines = []
    for p in paras:
        p_texts = re.findall(r'<hp:t>([^<]*)</hp:t>', p)
        line = " ".join(t.strip() for t in p_texts if t.strip())
        if line:
            lines.append(line)

    if lines:
        cell["text"] = " ".join(lines)
        cell["lines"] = lines

    return cell


def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    if parsed and parsed.get("section_xml"):
        return parsed["section_xml"]
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "section" in name.lower() and isinstance(content, str):
                return content
    return raw_xml if isinstance(raw_xml, str) else None