v8:문서유형 분석등록 및 추출_20260206
This commit is contained in:
98
handlers/tools/image.py
Normal file
98
handlers/tools/image.py
Normal file
@@ -0,0 +1,98 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
Extract image / drawing objects (ShapeObject).

Actual HWPX tags (section0.xml):
  <hp:pic id="..." zOrder="..." ...>
    <hp:offset x="0" y="0"/>
    <hp:orgSz width="..." height="..."/>
    <hp:curSz width="..." height="..."/>
    <hp:imgRect>
      <hp:pt x="..." y="..."/>  <!-- 4 corner points -->
    </hp:imgRect>
    <hp:imgClip .../>
    <hp:img binaryItemIDRef="image1.JPG" .../>
  </hp:pic>

Or, for a drawing object:
  <hp:container id="..." ...>
    <hp:offset x="..." y="..."/>
    ...
  </hp:container>

No default values are generated.
"""
|
||||
|
||||
import re
|
||||
|
||||
from domain.hwpx.hwpx_utils import hwpunit_to_mm
|
||||
|
||||
|
||||
# Patterns are compiled once at import time instead of on every <hp:pic> block.
_PIC_BLOCK_RE = re.compile(r'<hp:pic\b[^>]*>(.*?)</hp:pic>', re.DOTALL)
_IMG_TAG_RE = re.compile(r'<hp:img\b([^>]*)>')
_CUR_SZ_TAG_RE = re.compile(r'<hp:curSz\b([^>]*)>')
_OFFSET_TAG_RE = re.compile(r'<hp:offset\b([^>]*)>')
_ATTR_RE = re.compile(r'([\w:.-]+)\s*=\s*"([^"]*)"')
_NON_EMPTY_RE = re.compile(r'[^"]+')
_UNSIGNED_INT_RE = re.compile(r'\d+')
_SIGNED_INT_RE = re.compile(r'-?\d+')


def extract(raw_xml: dict, parsed: dict | None = None) -> list | None:
    """Extract image objects (``<hp:pic>`` blocks) from the section XML.

    Only ``<hp:pic>`` blocks are scanned. For each block the first usable
    ``<hp:img>``, ``<hp:curSz>`` and ``<hp:offset>`` tags are read. Attributes
    are looked up by name, so their order inside a tag does not matter.
    A key is emitted only when its source attribute(s) are present and
    well-formed; no default values are generated.

    Args:
        raw_xml: Mapping of names to XML text (a plain XML string is also
            accepted by ``_get_section_xml``).
        parsed: Optional dict; a truthy ``"section_xml"`` entry is preferred
            over ``raw_xml``.

    Returns:
        ``None`` when no section XML is available or no ``<hp:pic>`` block is
        found; otherwise a list of dicts such as::

            [
                {
                    "type": "image",
                    "binaryItemRef": "image1.JPG",
                    "width_hu": 28346, "height_hu": 14173,
                    "width_mm": 100.0, "height_mm": 50.0,
                    "offset": {"x": 0, "y": 0},
                },
                ...
            ]
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    result = []
    for pic_match in _PIC_BLOCK_RE.finditer(section_xml):
        pic_inner = pic_match.group(1)
        item = {"type": "image"}

        # Reference to the embedded binary item (the image file name).
        img_ref = _find_attrs(
            pic_inner, _IMG_TAG_RE, ("binaryItemIDRef",), _NON_EMPTY_RE
        )
        if img_ref:
            item["binaryItemRef"] = img_ref[0]

        # Current size; the millimetre values come from hwpunit_to_mm and are
        # rounded to one decimal place.
        cur_size = _find_attrs(
            pic_inner, _CUR_SZ_TAG_RE, ("width", "height"), _UNSIGNED_INT_RE
        )
        if cur_size:
            width, height = int(cur_size[0]), int(cur_size[1])
            item["width_hu"] = width
            item["height_hu"] = height
            item["width_mm"] = round(hwpunit_to_mm(width), 1)
            item["height_mm"] = round(hwpunit_to_mm(height), 1)

        # Offset; may be negative.
        offset = _find_attrs(
            pic_inner, _OFFSET_TAG_RE, ("x", "y"), _SIGNED_INT_RE
        )
        if offset:
            item["offset"] = {"x": int(offset[0]), "y": int(offset[1])}

        result.append(item)

    return result if result else None


def _find_attrs(fragment, tag_re, names, value_re):
    """Return the values of ``names`` from the first tag that has them all.

    Scans every tag in ``fragment`` matched by ``tag_re`` (group 1 must hold
    the raw attribute text) and returns a tuple of attribute values, in the
    order of ``names``, from the first tag where every requested attribute is
    present and fully matches ``value_re``. Returns ``None`` if no tag
    qualifies.
    """
    for tag_match in tag_re.finditer(fragment):
        attrs = dict(_ATTR_RE.findall(tag_match.group(1)))
        # A missing attribute becomes "" and fails the value check below.
        values = tuple(attrs.get(name, "") for name in names)
        if all(value_re.fullmatch(value) for value in values):
            return values
    return None
|
||||
|
||||
|
||||
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
||||
if parsed and parsed.get("section_xml"):
|
||||
return parsed["section_xml"]
|
||||
if isinstance(raw_xml, dict):
|
||||
for name, content in raw_xml.items():
|
||||
if "section" in name.lower() and isinstance(content, str):
|
||||
return content
|
||||
return raw_xml if isinstance(raw_xml, str) else None
|
||||
Reference in New Issue
Block a user