# -*- coding: utf-8 -*-
"""Extract image / drawing objects (ShapeObject) from HWPX section XML.

The extractor scans the section XML (e.g. ``section0.xml``) for picture
blocks and reads, for each block:

* the ``binaryItemIDRef`` attribute of the image element,
* the ``width`` / ``height`` attributes of the current-size element,
* the ``x`` / ``y`` attributes of the offset element.

No default values are generated: a key is present in an item only when
the corresponding attribute was actually found in the XML.

NOTE(review): the element names used in the patterns below (``hp:pic``,
``hc:img``, ``hp:curSz``, ``hp:offset``) were reconstructed after the
angle-bracket portions of this file were lost -- confirm against a real
HWPX ``section0.xml``.
"""

import re

from domain.hwpx.hwpx_utils import hwpunit_to_mm

# One picture block: group(1) = attributes of the opening tag,
# group(2) = everything between the opening and closing tags.
_PIC_BLOCK_RE = re.compile(r'<hp:pic\b([^>]*)>(.*?)</hp:pic>', re.DOTALL)

# Image reference. Both the ``hc:`` and ``hp:`` prefixes are accepted
# because the original prefix could not be recovered -- TODO confirm.
_IMG_REF_RE = re.compile(
    r'<(?:hc|hp):img\b[^>]*\bbinaryItemIDRef="([^"]+)"'
)

# Current size; ``width`` must appear before ``height`` in the tag.
_CUR_SIZE_RE = re.compile(
    r'<hp:curSz\b[^>]*\bwidth="(\d+)"[^>]*\bheight="(\d+)"'
)

# Offset; ``x`` must appear before ``y``; values may be negative.
_OFFSET_RE = re.compile(
    r'<hp:offset\b[^>]*\bx="(-?\d+)"[^>]*\by="(-?\d+)"'
)


def extract(raw_xml: dict, parsed: dict | None = None) -> list | None:
    """Extract image / drawing objects from the section XML.

    Args:
        raw_xml: Mapping of archive entry name to XML text (a plain XML
            string is also accepted); see ``_get_section_xml``.
        parsed: Optional mapping; a truthy ``"section_xml"`` entry takes
            precedence over ``raw_xml``.

    Returns:
        A list with one dict per picture block, or ``None`` when no
        section XML is available or no picture block is found. Example::

            [
                {
                    "type": "image",
                    "binaryItemRef": "image1.JPG",
                    "width_hu": 28346,
                    "height_hu": 14173,
                    "width_mm": 100.0,
                    "height_mm": 50.0,
                    "offset": {"x": 0, "y": 0},
                },
                ...
            ]

        Every key except ``"type"`` is optional and appears only when
        the matching attribute exists in the block.
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    result = []

    for pic_match in _PIC_BLOCK_RE.finditer(section_xml):
        pic_inner = pic_match.group(2)
        item = {"type": "image"}

        # Reference to the binary item (the embedded image).
        img = _IMG_REF_RE.search(pic_inner)
        if img:
            item["binaryItemRef"] = img.group(1)

        # Current size, kept both as raw values and converted to
        # millimetres rounded to one decimal place.
        cur_size = _CUR_SIZE_RE.search(pic_inner)
        if cur_size:
            width, height = int(cur_size.group(1)), int(cur_size.group(2))
            item["width_hu"] = width
            item["height_hu"] = height
            item["width_mm"] = round(hwpunit_to_mm(width), 1)
            item["height_mm"] = round(hwpunit_to_mm(height), 1)

        # Offset of the object.
        offset = _OFFSET_RE.search(pic_inner)
        if offset:
            item["offset"] = {
                "x": int(offset.group(1)),
                "y": int(offset.group(2)),
            }

        # Appended even when only "type" is set: no defaults are invented.
        result.append(item)

    return result if result else None


def _get_section_xml(raw_xml: dict, parsed: dict | None = None) -> str | None:
    """Locate the section XML text to scan.

    Lookup order:

    1. ``parsed["section_xml"]`` when ``parsed`` is given and that entry
       is truthy.
    2. The first ``str`` value in ``raw_xml`` whose key contains
       ``"section"`` (case-insensitive), when ``raw_xml`` is a dict.
    3. ``raw_xml`` itself when it is a ``str``.

    Returns ``None`` when none of the above applies.
    """
    if parsed and parsed.get("section_xml"):
        return parsed["section_xml"]

    if isinstance(raw_xml, dict):
        for name, content in raw_xml.items():
            if "section" in name.lower() and isinstance(content, str):
                return content

    return raw_xml if isinstance(raw_xml, str) else None