test/handlers/tools/char_style.py

# -*- coding: utf-8 -*-
"""
§4 Character shape (CharShape) extraction

Actual HWPX tags (header.xml):
  <hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none"
             useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="2">
    <hh:fontRef hangul="7" latin="6" hanja="6" .../>
    <hh:ratio hangul="100" latin="100" .../>
    <hh:spacing hangul="0" latin="0" .../>
    <hh:relSz hangul="100" latin="100" .../>
    <hh:offset hangul="0" latin="0" .../>
    <hh:bold/>                              <!-- bold when present -->
    <hh:italic/>                            <!-- italic when present -->
    <hh:underline type="NONE" shape="SOLID" color="#000000"/>
    <hh:strikeout shape="NONE" color="#000000"/>
  </hh:charPr>

No default values are synthesized: attributes and child elements that are
missing from the XML are omitted from the output. The only exceptions are
"bold" and "italic", which are always emitted because they are derived from
tag presence.
"""

import re

from domain.hwpx.hwpx_utils import charsize_to_pt


def extract(raw_xml: dict, parsed: dict | None = None) -> list | None:
    """Extract every §4 ``hh:charPr`` (character shape) entry from the header XML.

    The header XML is located via ``_get_header_xml(raw_xml, parsed)`` and
    scanned with regular expressions (no XML parser is involved).

    Only values actually present in the XML are emitted; missing attributes
    or child elements simply leave their key out of the item. ``bold`` and
    ``italic`` are the exception: they are always present because they are
    derived from the existence of ``<hh:bold/>`` / ``<hh:italic/>``.
    A ``shadeColor`` of ``"none"`` is treated as absent.

    Args:
        raw_xml: Raw XML source, forwarded to ``_get_header_xml``.
        parsed: Optional pre-parsed data, forwarded to ``_get_header_xml``.

    Returns:
        ``None`` when no header XML or no ``charPr`` block is found,
        otherwise a list shaped like::

            [
                {
                    "id": 0,
                    "height_pt": 10.0,
                    "textColor": "#000000",
                    "bold": False,
                    "italic": False,
                    "underline": "NONE",
                    "strikeout": "NONE",
                    "fontRef": {"hangul": 7, "latin": 6, ...},
                    "ratio": {"hangul": 100, "latin": 100, ...},
                    "spacing": {"hangul": 0, "latin": 0, ...},
                    "borderFillIDRef": 2,
                },
                ...
            ]
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # Match only charPr elements that have a body. The (?<!/) lookbehind
    # rejects a self-closing <hh:charPr .../>; without it such a tag would be
    # taken as an opening tag, swallow the next element up to its
    # </hh:charPr>, and report that element's children under the wrong id.
    blocks = re.findall(
        r'<hh:charPr\b([^>]*)(?<!/)>(.*?)</hh:charPr>',
        header_xml, re.DOTALL
    )
    if not blocks:
        return None

    result = []
    for attrs_str, inner in blocks:
        item = {}

        # --- attributes on the <hh:charPr> tag itself ---
        id_m = re.search(r'\bid="(\d+)"', attrs_str)
        if id_m:
            item["id"] = int(id_m.group(1))

        height_m = re.search(r'\bheight="(\d+)"', attrs_str)
        if height_m:
            # Raw height is converted to points by the project helper.
            item["height_pt"] = charsize_to_pt(int(height_m.group(1)))

        color_m = re.search(r'\btextColor="([^"]+)"', attrs_str)
        if color_m:
            item["textColor"] = color_m.group(1)

        shade_m = re.search(r'\bshadeColor="([^"]+)"', attrs_str)
        if shade_m and shade_m.group(1) != "none":
            item["shadeColor"] = shade_m.group(1)

        bf_m = re.search(r'\bborderFillIDRef="(\d+)"', attrs_str)
        if bf_m:
            item["borderFillIDRef"] = int(bf_m.group(1))

        # --- bold / italic: decided purely by whether the tag exists ---
        item["bold"] = bool(re.search(r'<hh:bold\s*/?>', inner))
        item["italic"] = bool(re.search(r'<hh:italic\s*/?>', inner))

        # --- per-language integer tables (same shape for all three tags) ---
        for tag in ("fontRef", "ratio", "spacing"):
            lang_m = re.search(rf'<hh:{tag}\b([^/]*)/>', inner)
            if lang_m:
                item[tag] = _parse_lang_attrs(lang_m.group(1))

        # --- underline: reported by its "type" attribute ---
        ul = re.search(r'<hh:underline\b[^>]*\btype="([^"]+)"', inner)
        if ul:
            item["underline"] = ul.group(1)

        # --- strikeout: reported by its "shape" attribute ---
        so = re.search(r'<hh:strikeout\b[^>]*\bshape="([^"]+)"', inner)
        if so:
            item["strikeout"] = so.group(1)

        result.append(item)

    # blocks is non-empty here, so result holds at least one item.
    return result


def _parse_lang_attrs(attrs_str: str) -> dict:
    """hangul="7" latin="6" ... → {"hangul": 7, "latin": 6, ...}"""
    pairs = re.findall(r'(\w+)="(-?\d+)"', attrs_str)
    return {k: int(v) for k, v in pairs}


def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    if parsed and parsed.get("header_xml"):
        return parsed["header_xml"]
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "header" in name.lower() and isinstance(content, str):
                return content
    return raw_xml if isinstance(raw_xml, str) else None