# test/handlers/tools/font.py

# -*- coding: utf-8 -*-
"""
§3 글꼴(FaceName) 추출

HWPX 실제 태그 (header.xml):
  <hh:fontface lang="HANGUL" fontCnt="9">
    <hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
    <hh:font id="1" face="맑은 고딕" type="TTF" isEmbedded="0">
  </hh:fontface>
  <hh:fontface lang="LATIN" fontCnt="9">
    <hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
  </hh:fontface>

디폴트값 생성 안 함. 추출 실패 시 None 반환.
"""

import html
import re


# One <hh:fontface ...>...</hh:fontface> element: group 1 is the raw attribute
# text of the start tag, group 2 the element content. The (?<!/) guard rejects
# self-closing <hh:fontface .../> tags so that the lazy content group cannot
# run on into the *next* fontface element and steal its fonts.
_FONTFACE_RE = re.compile(
    r'<hh:fontface\b([^>]*)(?<!/)>(.*?)</hh:fontface>', re.DOTALL
)

# Start tag of a single <hh:font ...> element; group 1 is its attribute text.
# The \b keeps this from matching <hh:fontface ...>.
_FONT_TAG_RE = re.compile(r'<hh:font\b([^>]*)>')

# name="value" or name='value' pairs inside a start tag.
_ATTR_RE = re.compile(
    r'([\w:.-]+)\s*=\s*(?:"([^"]*)"'
    r"|'([^']*)')"
)


def _parse_attrs(attr_text: str) -> dict[str, str]:
    """Return the attributes of one start tag as a name -> value mapping.

    Values are taken from either quoting style and have XML entity and
    character references (``&amp;``, ``&#44;`` ...) decoded.
    """
    attrs = {}
    for match in _ATTR_RE.finditer(attr_text):
        name, double_quoted, single_quoted = match.groups()
        raw_value = double_quoted if double_quoted is not None else single_quoted
        attrs[name] = html.unescape(raw_value)
    return attrs


def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the per-language font definitions from the fontface elements.

    Each ``<hh:fontface lang="...">`` element is scanned for ``<hh:font>``
    start tags. A font is kept only when it carries a decimal ``id`` and
    non-empty ``face`` and ``type`` attributes; the attributes may appear in
    any order and use either quoting style. No default values are ever
    generated.

    Args:
        raw_xml: Passed through to ``_get_header_xml`` to locate the header
            XML text.
        parsed: Optional mapping, also passed to ``_get_header_xml``.

    Returns:
        A dict keyed by the ``lang`` attribute, for example::

            {
                "HANGUL": [{"id": 0, "face": "돋움", "type": "TTF"}, ...],
                "LATIN":  [{"id": 0, "face": "돋움", "type": "TTF"}, ...],
            }

        or ``None`` when no header XML is available or no font could be
        extracted.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = {}
    for fontface_attrs, block_content in _FONTFACE_RE.findall(header_xml):
        lang = _parse_attrs(fontface_attrs).get("lang")
        if not lang:
            # A fontface without a usable lang cannot be keyed; skip it.
            continue

        fonts = []
        for font_tag in _FONT_TAG_RE.finditer(block_content):
            attrs = _parse_attrs(font_tag.group(1))
            font_id = attrs.get("id", "")
            face = attrs.get("face")
            font_type = attrs.get("type")
            # Same acceptance rule as before: numeric id, non-empty face/type.
            if font_id.isdecimal() and face and font_type:
                fonts.append({
                    "id": int(font_id),
                    "face": face,
                    "type": font_type,
                })

        # Languages with no extractable font are omitted rather than
        # reported as an empty list.
        if fonts:
            result[lang] = fonts

    return result or None


def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
    """header.xml 문자열을 가져온다."""
    if parsed and parsed.get("header_xml"):
        return parsed["header_xml"]

    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "header" in name.lower() and isinstance(content, str):
                return content

    if isinstance(raw_xml, str):
        return raw_xml

    return None