# -*- coding: utf-8 -*- """ §5 문단 모양(ParaShape) 추출 HWPX 실제 태그 (header.xml): 디폴트값 생성 안 함. """ import re from domain.hwpx.hwpx_utils import hwpunit_to_mm def extract(raw_xml: dict, parsed: dict = None) -> list | None: """§5 paraPr 전체 목록 추출. Returns: [ { "id": 0, "align": "JUSTIFY", "verticalAlign": "BASELINE", "heading": {"type": "NONE", "idRef": 0, "level": 0}, "breakSetting": { "widowOrphan": False, "keepWithNext": False, "keepLines": False, "pageBreakBefore": False, "lineWrap": "BREAK", "breakLatinWord": "KEEP_WORD", "breakNonLatinWord": "KEEP_WORD" }, "margin": { "indent_hu": -1310, "left_hu": 0, "right_hu": 0, "before_hu": 0, "after_hu": 0, }, "lineSpacing": {"type": "PERCENT", "value": 130}, "borderFillIDRef": 2, "tabPrIDRef": 1, }, ... ] """ header_xml = _get_header_xml(raw_xml, parsed) if not header_xml: return None blocks = re.findall( r']*)>(.*?)', header_xml, re.DOTALL ) if not blocks: return None result = [] for attrs_str, inner in blocks: item = {} # id id_m = re.search(r'\bid="(\d+)"', attrs_str) if id_m: item["id"] = int(id_m.group(1)) # tabPrIDRef tab_m = re.search(r'\btabPrIDRef="(\d+)"', attrs_str) if tab_m: item["tabPrIDRef"] = int(tab_m.group(1)) # align al = re.search(r']*\bhorizontal="([^"]+)"', inner) if al: item["align"] = al.group(1) val = re.search(r']*\bvertical="([^"]+)"', inner) if val: item["verticalAlign"] = val.group(1) # heading hd = re.search( r']*\btype="([^"]+)"[^>]*' r'\bidRef="(\d+)"[^>]*\blevel="(\d+)"', inner ) if hd: item["heading"] = { "type": hd.group(1), "idRef": int(hd.group(2)), "level": int(hd.group(3)), } # breakSetting bs = re.search(r'', inner) if bs: bstr = bs.group(1) item["breakSetting"] = { "widowOrphan": _bool_attr(bstr, "widowOrphan"), "keepWithNext": _bool_attr(bstr, "keepWithNext"), "keepLines": _bool_attr(bstr, "keepLines"), "pageBreakBefore": _bool_attr(bstr, "pageBreakBefore"), "lineWrap": _str_attr(bstr, "lineWrap"), "breakLatinWord": _str_attr(bstr, "breakLatinWord"), "breakNonLatinWord": _str_attr(bstr, "breakNonLatinWord"), } # margin (hp:case 블록 내 첫 번째 사용 — HwpUnitChar case 우선) case_block = re.search( r']*required-namespace="[^"]*HwpUnitChar[^"]*"[^>]*>' r'(.*?)', inner, re.DOTALL ) margin_src = case_block.group(1) if case_block else inner margin = {} for tag, key in [ ("intent", "indent_hu"), ("left", "left_hu"), ("right", "right_hu"), ("prev", "before_hu"), ("next", "after_hu"), ]: m = re.search( rf']*\bvalue="(-?\d+)"', margin_src ) if m: margin[key] = int(m.group(1)) if margin: item["margin"] = margin # lineSpacing ls = re.search( r']*\btype="([^"]+)"[^>]*\bvalue="(\d+)"', margin_src ) if ls: item["lineSpacing"] = { "type": ls.group(1), "value": int(ls.group(2)), } # borderFillIDRef bf = re.search(r']*\bborderFillIDRef="(\d+)"', inner) if bf: item["borderFillIDRef"] = int(bf.group(1)) result.append(item) return result if result else None def _bool_attr(s: str, name: str) -> bool | None: m = re.search(rf'\b{name}="(\d+)"', s) return bool(int(m.group(1))) if m else None def _str_attr(s: str, name: str) -> str | None: m = re.search(rf'\b{name}="([^"]+)"', s) return m.group(1) if m else None def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None: if parsed and parsed.get("header_xml"): return parsed["header_xml"] if isinstance(raw_xml, dict): for name, content in raw_xml.items(): if "header" in name.lower() and isinstance(content, str): return content return raw_xml if isinstance(raw_xml, str) else None