Files
test/handlers/tools/style_def.py

68 lines
2.0 KiB
Python

# -*- coding: utf-8 -*-
"""
스타일 정의(Style) 추출
HWPX 실제 태그 (header.xml):
<hh:styles itemCnt="12">
<hh:style id="0" type="PARA" name="바탕글" engName="Normal"
paraPrIDRef="3" charPrIDRef="0" nextStyleIDRef="0"
langID="1042" lockForm="0"/>
<hh:style id="1" type="PARA" name="머리말" engName="Header"
paraPrIDRef="2" charPrIDRef="3" nextStyleIDRef="1" .../>
</hh:styles>
charPrIDRef → charPr(글자모양), paraPrIDRef → paraPr(문단모양) 연결.
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
"""스타일 정의 추출.
Returns:
[
{
"id": 0, "type": "PARA",
"name": "바탕글", "engName": "Normal",
"paraPrIDRef": 3, "charPrIDRef": 0,
"nextStyleIDRef": 0,
},
...
]
"""
header_xml = _get_header_xml(raw_xml, parsed)
if not header_xml:
return None
styles = re.findall(r'<hh:style\b([^/]*)/>', header_xml)
if not styles:
return None
result = []
for s in styles:
item = {}
for attr in ["id", "paraPrIDRef", "charPrIDRef", "nextStyleIDRef"]:
m = re.search(rf'\b{attr}="(\d+)"', s)
if m:
item[attr] = int(m.group(1))
for attr in ["type", "name", "engName"]:
m = re.search(rf'\b{attr}="([^"]*)"', s)
if m:
item[attr] = m.group(1)
result.append(item)
return result if result else None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None