# test/handlers/tools/para_style.py

# -*- coding: utf-8 -*-
"""
§5 Paragraph shape (ParaShape) extraction.

Actual HWPX tags as they appear in header.xml:
  <hh:paraPr id="0" tabPrIDRef="1" condense="0" ...>
    <hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
    <hh:heading type="NONE" idRef="0" level="0"/>
    <hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD"
                     widowOrphan="0" keepWithNext="0" keepLines="0"
                     pageBreakBefore="0" lineWrap="BREAK"/>
    <hp:case ...>
      <hh:margin>
        <hc:intent value="-1310" unit="HWPUNIT"/>
        <hc:left value="0" unit="HWPUNIT"/>
        <hc:right value="0" unit="HWPUNIT"/>
        <hc:prev value="0" unit="HWPUNIT"/>
        <hc:next value="0" unit="HWPUNIT"/>
      </hh:margin>
      <hh:lineSpacing type="PERCENT" value="130" unit="HWPUNIT"/>
    </hp:case>
    <hh:border borderFillIDRef="2" .../>
  </hh:paraPr>

No default values are generated: output is filled only from values that are
actually present in the XML.
"""

import re

from domain.hwpx.hwpx_utils import hwpunit_to_mm


def _first_tag_attrs(xml: str, tag: str) -> str | None:
    """Return the raw attribute text of the first ``<tag ...>`` found in *xml*.

    The returned text may end with ``/`` when the tag is self-closing.
    Returns None when the tag does not occur.
    """
    found = re.search(rf'<{re.escape(tag)}\b([^>]*)>', xml)
    return found.group(1) if found else None


def _uint_attr(attrs: str, name: str) -> int | None:
    """Return attribute *name* from *attrs* as an int.

    Only unsigned decimal values are accepted; returns None when the
    attribute is absent or not an unsigned integer.
    """
    found = re.search(rf'\b{re.escape(name)}="(\d+)"', attrs)
    return int(found.group(1)) if found else None


def extract(raw_xml: dict, parsed: dict | None = None) -> list | None:
    """Extract the full list of §5 paragraph properties (``hh:paraPr``).

    Attributes are looked up independently of the order in which they are
    written inside a tag, because XML attribute order is not significant.
    No default values are generated: a key is only emitted when the
    corresponding value was found in the XML.

    Args:
        raw_xml: Passed to ``_get_header_xml`` to locate the header XML text.
        parsed: Optional dict, also passed to ``_get_header_xml``.

    Returns:
        None when no header XML or no ``hh:paraPr`` element is found,
        otherwise a list with one dict per ``hh:paraPr``:

        [
            {
                "id": 0,
                "align": "JUSTIFY",
                "verticalAlign": "BASELINE",
                "heading": {"type": "NONE", "idRef": 0, "level": 0},
                "breakSetting": {
                    "widowOrphan": False, "keepWithNext": False,
                    "keepLines": False, "pageBreakBefore": False,
                    "lineWrap": "BREAK",
                    "breakLatinWord": "KEEP_WORD",
                    "breakNonLatinWord": "KEEP_WORD"
                },
                "margin": {
                    "indent_hu": -1310, "left_hu": 0, "right_hu": 0,
                    "before_hu": 0, "after_hu": 0,
                },
                "lineSpacing": {"type": "PERCENT", "value": 130},
                "borderFillIDRef": 2,
                "tabPrIDRef": 1,
            },
            ...
        ]

        Entries inside "breakSetting" are None for attributes that are
        missing from the ``hh:breakSetting`` tag.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # The lazy attribute match plus the "/>" alternative lets a self-closing
    # <hh:paraPr .../> produce an empty body instead of running on to the
    # closing tag of the *next* paraPr and swallowing it.
    blocks = re.findall(
        r'<hh:paraPr\b([^>]*?)(?:/>|>(.*?)</hh:paraPr>)',
        header_xml, re.DOTALL
    )

    if not blocks:
        return None

    result = []
    for attrs_str, inner in blocks:
        item = {}

        # id
        para_id = _uint_attr(attrs_str, "id")
        if para_id is not None:
            item["id"] = para_id

        # tabPrIDRef
        tab_ref = _uint_attr(attrs_str, "tabPrIDRef")
        if tab_ref is not None:
            item["tabPrIDRef"] = tab_ref

        # align (horizontal / vertical are searched separately)
        al = re.search(r'<hh:align\b[^>]*\bhorizontal="([^"]+)"', inner)
        if al:
            item["align"] = al.group(1)

        val = re.search(r'<hh:align\b[^>]*\bvertical="([^"]+)"', inner)
        if val:
            item["verticalAlign"] = val.group(1)

        # heading: emitted only when type, idRef and level are all present,
        # in whatever order they appear inside the tag
        heading_attrs = _first_tag_attrs(inner, "hh:heading")
        if heading_attrs is not None:
            h_type = _str_attr(heading_attrs, "type")
            h_ref = _uint_attr(heading_attrs, "idRef")
            h_level = _uint_attr(heading_attrs, "level")
            if h_type is not None and h_ref is not None and h_level is not None:
                item["heading"] = {
                    "type": h_type,
                    "idRef": h_ref,
                    "level": h_level,
                }

        # breakSetting: missing attributes are reported as None
        bstr = _first_tag_attrs(inner, "hh:breakSetting")
        if bstr is not None:
            item["breakSetting"] = {
                "widowOrphan": _bool_attr(bstr, "widowOrphan"),
                "keepWithNext": _bool_attr(bstr, "keepWithNext"),
                "keepLines": _bool_attr(bstr, "keepLines"),
                "pageBreakBefore": _bool_attr(bstr, "pageBreakBefore"),
                "lineWrap": _str_attr(bstr, "lineWrap"),
                "breakLatinWord": _str_attr(bstr, "breakLatinWord"),
                "breakNonLatinWord": _str_attr(bstr, "breakNonLatinWord"),
            }

        # margin: prefer the <hp:case> whose required-namespace mentions
        # HwpUnitChar; otherwise fall back to the whole paraPr body
        case_block = re.search(
            r'<hp:case\b[^>]*required-namespace="[^"]*HwpUnitChar[^"]*"[^>]*>'
            r'(.*?)</hp:case>',
            inner, re.DOTALL
        )
        margin_src = case_block.group(1) if case_block else inner

        margin = {}
        # Element names are taken verbatim from the XML ("intent" is the tag
        # name that feeds the "indent_hu" key).
        for tag, key in [
            ("intent", "indent_hu"),
            ("left", "left_hu"),
            ("right", "right_hu"),
            ("prev", "before_hu"),
            ("next", "after_hu"),
        ]:
            m = re.search(
                rf'<hc:{tag}\b[^>]*\bvalue="(-?\d+)"', margin_src
            )
            if m:
                margin[key] = int(m.group(1))

        if margin:
            item["margin"] = margin

        # lineSpacing: read from the same source as the margins; emitted only
        # when both type and value are present, in any attribute order
        ls_attrs = _first_tag_attrs(margin_src, "hh:lineSpacing")
        if ls_attrs is not None:
            ls_type = _str_attr(ls_attrs, "type")
            ls_value = _uint_attr(ls_attrs, "value")
            if ls_type is not None and ls_value is not None:
                item["lineSpacing"] = {
                    "type": ls_type,
                    "value": ls_value,
                }

        # borderFillIDRef
        bf = re.search(r'<hh:border\b[^>]*\bborderFillIDRef="(\d+)"', inner)
        if bf:
            item["borderFillIDRef"] = int(bf.group(1))

        result.append(item)

    # blocks is non-empty here, so result holds at least one item
    return result


def _bool_attr(s: str, name: str) -> bool | None:
    m = re.search(rf'\b{name}="(\d+)"', s)
    return bool(int(m.group(1))) if m else None


def _str_attr(s: str, name: str) -> str | None:
    m = re.search(rf'\b{name}="([^"]+)"', s)
    return m.group(1) if m else None


def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    if parsed and parsed.get("header_xml"):
        return parsed["header_xml"]
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "header" in name.lower() and isinstance(content, str):
                return content
    return raw_xml if isinstance(raw_xml, str) else None