# -*- coding: utf-8 -*- """ 번호매기기(Numbering) / 글머리표(Bullet) 추출 HWPX 실제 태그 (header.xml): ^1. ^2. 디폴트값 생성 안 함. """ import re def extract(raw_xml: dict, parsed: dict = None) -> dict | None: """번호매기기 + 글머리표 정의 추출. Returns: { "numberings": [ { "id": 1, "start": 0, "levels": [ {"level": 1, "numFormat": "DIGIT", "pattern": "^1.", "align": "LEFT"}, {"level": 2, "numFormat": "HANGUL_SYLLABLE", "pattern": "^2."}, ... ] } ], "bullets": [ {"id": 1, "char": "-", "useImage": False} ] } """ header_xml = _get_header_xml(raw_xml, parsed) if not header_xml: return None result = {} # ── 번호매기기 ── numbering_blocks = re.findall( r']*)>(.*?)', header_xml, re.DOTALL ) if numbering_blocks: nums = [] for attrs, inner in numbering_blocks: num = {} id_m = re.search(r'\bid="(\d+)"', attrs) if id_m: num["id"] = int(id_m.group(1)) start_m = re.search(r'\bstart="(\d+)"', attrs) if start_m: num["start"] = int(start_m.group(1)) # paraHead 레벨들 levels = [] heads = re.finditer( r']*)>([^<]*)', inner ) for h in heads: h_attrs = h.group(1) h_pattern = h.group(2).strip() level = {} lv = re.search(r'\blevel="(\d+)"', h_attrs) if lv: level["level"] = int(lv.group(1)) fmt = re.search(r'\bnumFormat="([^"]+)"', h_attrs) if fmt: level["numFormat"] = fmt.group(1) al = re.search(r'\balign="([^"]+)"', h_attrs) if al: level["align"] = al.group(1) if h_pattern: level["pattern"] = h_pattern if level: levels.append(level) if levels: num["levels"] = levels nums.append(num) if nums: result["numberings"] = nums # ── 글머리표 ── bullet_blocks = re.findall( r']*)>(.*?)', header_xml, re.DOTALL ) if bullet_blocks: bullets = [] for attrs, inner in bullet_blocks: bullet = {} id_m = re.search(r'\bid="(\d+)"', attrs) if id_m: bullet["id"] = int(id_m.group(1)) char_m = re.search(r'\bchar="([^"]*)"', attrs) if char_m: bullet["char"] = char_m.group(1) img_m = re.search(r'\buseImage="(\d+)"', attrs) if img_m: bullet["useImage"] = bool(int(img_m.group(1))) bullets.append(bullet) if bullets: result["bullets"] = bullets return result if result else None def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None: if parsed and parsed.get("header_xml"): return parsed["header_xml"] if isinstance(raw_xml, dict): for name, content in raw_xml.items(): if "header" in name.lower() and isinstance(content, str): return content return raw_xml if isinstance(raw_xml, str) else None