# -*- coding: utf-8 -*-
"""
§4 Character shape (CharShape) extraction.

Actual HWPX tags (header.xml) that this module parses:

    <hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none"
               borderFillIDRef="2" ...>
        <hh:fontRef hangul="7" latin="6" .../>
        <hh:ratio hangul="100" latin="100" .../>
        <hh:spacing hangul="0" latin="0" .../>
        <hh:bold/>
        <hh:italic/>
        <hh:underline type="NONE" .../>
        <hh:strikeout shape="NONE" .../>
    </hh:charPr>

No default values are generated: an attribute or child element that is
absent from the XML is left out of the result. The only exceptions are
"bold" and "italic", which are presence flags and are always reported.
"""

import re

from domain.hwpx.hwpx_utils import charsize_to_pt

# Non-self-closing <charPr ...>...</charPr> blocks. The namespace prefix is
# optional so a differently-prefixed header still parses; the negative
# lookbehind rejects self-closing <charPr .../> elements, which would
# otherwise swallow everything up to the next closing tag.
_CHAR_PR_RE = re.compile(
    r'<(?:\w+:)?charPr\b([^>]*)(?<!/)>(.*?)</(?:\w+:)?charPr>',
    re.DOTALL,
)

# Attributes on the <charPr> start tag.
_ID_RE = re.compile(r'\bid="(\d+)"')
_HEIGHT_RE = re.compile(r'\bheight="(\d+)"')
_TEXT_COLOR_RE = re.compile(r'\btextColor="([^"]+)"')
_SHADE_COLOR_RE = re.compile(r'\bshadeColor="([^"]+)"')
_BORDER_FILL_RE = re.compile(r'\bborderFillIDRef="(\d+)"')

# Child elements of <charPr>.
_BOLD_RE = re.compile(r'<(?:\w+:)?bold\b')
_ITALIC_RE = re.compile(r'<(?:\w+:)?italic\b')
_UNDERLINE_RE = re.compile(r'<(?:\w+:)?underline\b[^>]*\btype="([^"]+)"')
_STRIKEOUT_RE = re.compile(r'<(?:\w+:)?strikeout\b[^>]*\bshape="([^"]+)"')

# Child elements whose attributes are per-language integers
# (hangul, latin, ...), keyed by the name used in the result dict.
_LANG_ELEMENT_RES = {
    "fontRef": re.compile(r'<(?:\w+:)?fontRef\b([^>]*)>'),
    "ratio": re.compile(r'<(?:\w+:)?ratio\b([^>]*)>'),
    "spacing": re.compile(r'<(?:\w+:)?spacing\b([^>]*)>'),
}

_LANG_ATTR_RE = re.compile(r'(\w+)="(-?\d+)"')


def extract(raw_xml: dict, parsed: dict | None = None) -> list | None:
    """Extract the full list of §4 charPr entries from header.xml.

    Args:
        raw_xml: Mapping of file name to XML text (the first entry whose
            name contains "header" is used), or the header XML string itself.
        parsed: Optional pre-parsed data; a truthy ``parsed["header_xml"]``
            takes precedence over ``raw_xml``.

    Returns:
        None when no header XML or no charPr block is found, otherwise a
        list shaped like:
        [
            {
                "id": 0,
                "height_pt": 10.0,
                "textColor": "#000000",
                "bold": False,
                "italic": False,
                "underline": "NONE",
                "strikeout": "NONE",
                "fontRef": {"hangul": 7, "latin": 6, ...},
                "ratio": {"hangul": 100, "latin": 100, ...},
                "spacing": {"hangul": 0, "latin": 0, ...},
                "borderFillIDRef": 2,
            },
            ...
        ]
        Keys other than "bold" and "italic" appear only when the matching
        attribute or element exists; "shadeColor" is included only when
        its value is not "none".
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = []
    for attrs_str, inner in _CHAR_PR_RE.findall(header_xml):
        item = {}

        # Attributes of the <charPr> start tag.
        id_m = _ID_RE.search(attrs_str)
        if id_m:
            item["id"] = int(id_m.group(1))

        height_m = _HEIGHT_RE.search(attrs_str)
        if height_m:
            item["height_pt"] = charsize_to_pt(int(height_m.group(1)))

        color_m = _TEXT_COLOR_RE.search(attrs_str)
        if color_m:
            item["textColor"] = color_m.group(1)

        shade_m = _SHADE_COLOR_RE.search(attrs_str)
        if shade_m and shade_m.group(1) != "none":
            item["shadeColor"] = shade_m.group(1)

        bf_m = _BORDER_FILL_RE.search(attrs_str)
        if bf_m:
            item["borderFillIDRef"] = int(bf_m.group(1))

        # bold / italic are decided purely by whether the tag is present.
        item["bold"] = bool(_BOLD_RE.search(inner))
        item["italic"] = bool(_ITALIC_RE.search(inner))

        # fontRef / ratio / spacing: per-language integer attributes.
        for key, pattern in _LANG_ELEMENT_RES.items():
            lang_m = pattern.search(inner)
            if lang_m:
                item[key] = _parse_lang_attrs(lang_m.group(1))

        underline_m = _UNDERLINE_RE.search(inner)
        if underline_m:
            item["underline"] = underline_m.group(1)

        strikeout_m = _STRIKEOUT_RE.search(inner)
        if strikeout_m:
            item["strikeout"] = strikeout_m.group(1)

        result.append(item)

    return result or None


def _parse_lang_attrs(attrs_str: str) -> dict:
    """Parse 'hangul="7" latin="6" ...' into {"hangul": 7, "latin": 6, ...}.

    Only attributes whose value is an optionally negative integer are kept.
    """
    return {name: int(value) for name, value in _LANG_ATTR_RE.findall(attrs_str)}


def _get_header_xml(raw_xml: dict, parsed: dict | None = None) -> str | None:
    """Locate the header XML text.

    Order of preference: a truthy ``parsed["header_xml"]``; then the first
    string value in ``raw_xml`` whose key contains "header"
    (case-insensitive); then ``raw_xml`` itself when it is already a
    string. Returns None otherwise.
    """
    if parsed and parsed.get("header_xml"):
        return parsed["header_xml"]
    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "header" in name.lower() and isinstance(content, str):
                return content
    return raw_xml if isinstance(raw_xml, str) else None