test/handlers/tools/page_setup.py

# -*- coding: utf-8 -*-
"""
§7 용지 설정 추출 (pagePr + margin)

HWPX 실제 태그:
  <hp:pagePr landscape="WIDELY" width="59528" height="84188" gutterType="LEFT_ONLY">
  <hp:margin header="4251" footer="4251" gutter="0"
             left="5669" right="5669" top="2834" bottom="2834"/>

디폴트값 생성 안 함. 추출 실패 시 None 반환.
"""

import re

from domain.hwpx.hwpx_utils import hwpunit_to_mm, mm_format, detect_paper_size


# Opening-tag matchers: each captures only the attribute text of ONE tag
# (up to the first '>'), so values are never read from a neighbouring element
# and both `<tag .../>` and `<tag ...></tag>` forms are accepted.
_PAGE_PR_RE = re.compile(r'<hp:pagePr\b([^>]*)')
_MARGIN_RE = re.compile(r'<hp:margin\b([^>]*)')
# name="value" or name='value', with optional whitespace around '='
# (both quote styles and the whitespace are permitted by XML).
_ATTR_RE = re.compile(r'''([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')''')
# Margin attributes copied into the result, in output order.
_MARGIN_KEYS = ("top", "bottom", "left", "right", "header", "footer", "gutter")


def _parse_attrs(attrs_text: str) -> dict[str, str]:
    """Parse the attribute text of a single start tag into a name -> value dict.

    If a name occurs more than once, the first occurrence wins.
    """
    attrs: dict[str, str] = {}
    for m in _ATTR_RE.finditer(attrs_text):
        value = m.group(2) if m.group(2) is not None else m.group(3)
        attrs.setdefault(m.group(1), value)
    return attrs


def _to_uint(value: str | None) -> int | None:
    """Return *value* as an int if it consists solely of digits, else None."""
    if value is not None and re.fullmatch(r'\d+', value):
        return int(value)
    return None


def extract(raw_xml: dict | str, parsed: dict | None = None) -> dict | None:
    """Extract paper size and page margins from ``<hp:pagePr>`` / ``<hp:margin>``.

    No default values are synthesised: anything that cannot be read from the
    XML is simply absent from the result.

    Args:
        raw_xml: Either a mapping of archive member name -> XML text, or the
            section XML text itself (resolved by ``_get_section_xml``).
        parsed: Optional dict; a truthy ``parsed["section_xml"]`` takes
            precedence over ``raw_xml``.

    Returns:
        ``None`` when no section XML is available or no ``<hp:pagePr>`` tag
        carries both an integer ``width`` and an integer ``height``.
        Otherwise a dict of the form::

            {
                "paper": {"name": <detect_paper_size(width, height)>,
                          "width_mm": <float, rounded to 1 decimal>,
                          "height_mm": <float, rounded to 1 decimal>,
                          "landscape": <bool>},
                "margins": {<key>: <mm_format(value)>, ...}
            }

        ``"margins"`` is present only if an ``<hp:margin>`` tag was found and
        at least one of top/bottom/left/right/header/footer/gutter has an
        integer value; only the attributes that are present are included.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    # ── Paper size ────────────────────────────────
    # Use the first pagePr tag that has BOTH dimensions, and read width,
    # height and landscape from that same tag. Attribute order is irrelevant
    # because the tag is parsed into a dict.
    for page_tag in _PAGE_PR_RE.finditer(section_xml):
        page_attrs = _parse_attrs(page_tag.group(1))
        w_hu = _to_uint(page_attrs.get("width"))
        h_hu = _to_uint(page_attrs.get("height"))
        if w_hu is not None and h_hu is not None:
            break
    else:
        return None

    # NOTE(review): "WIDELY" is reported as landscape=True, but the sample tag
    # in this module's docstring pairs landscape="WIDELY" with width=59528 /
    # height=84188 (taller than wide). Confirm the intended mapping against
    # the OWPML specification; behaviour is deliberately left unchanged here.
    is_landscape = page_attrs.get("landscape") == "WIDELY"

    result = {
        "paper": {
            "name": detect_paper_size(w_hu, h_hu),
            "width_mm": round(hwpunit_to_mm(w_hu), 1),
            "height_mm": round(hwpunit_to_mm(h_hu), 1),
            "landscape": is_landscape,
        }
    }

    # ── Margins ───────────────────────────────────
    # Prefer the first <hp:margin> that follows the selected pagePr tag; fall
    # back to a whole-document search so input where the margin tag appears
    # earlier is still handled.
    margin_tag = (
        _MARGIN_RE.search(section_xml, page_tag.end())
        or _MARGIN_RE.search(section_xml)
    )
    if not margin_tag:
        return result  # paper size may exist without any margin information

    margin_attrs = _parse_attrs(margin_tag.group(1))
    margins = {}
    for key in _MARGIN_KEYS:
        value_hu = _to_uint(margin_attrs.get(key))
        if value_hu is not None:
            margins[key] = mm_format(value_hu)

    if margins:
        result["margins"] = margins

    return result


def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    """section XML 문자열을 가져온다."""
    # parsed에서 직접 제공
    if parsed and parsed.get("section_xml"):
        return parsed["section_xml"]

    # raw_xml dict에서 section 파일 찾기
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "section" in name.lower() and isinstance(content, str):
                return content

    # raw_xml이 문자열이면 그대로
    if isinstance(raw_xml, str):
        return raw_xml

    return None