68 lines
2.0 KiB
Python
68 lines
2.0 KiB
Python
# -*- coding: utf-8 -*-
"""
Extract style definitions (Style).

Actual HWPX tags (header.xml):
    <hh:styles itemCnt="12">
      <hh:style id="0" type="PARA" name="바탕글" engName="Normal"
                paraPrIDRef="3" charPrIDRef="0" nextStyleIDRef="0"
                langID="1042" lockForm="0"/>
      <hh:style id="1" type="PARA" name="머리말" engName="Header"
                paraPrIDRef="2" charPrIDRef="3" nextStyleIDRef="1" .../>
    </hh:styles>

charPrIDRef links to charPr (character shape); paraPrIDRef links to
paraPr (paragraph shape).
No default values are generated.
"""
|
|
|
|
import re
|
|
|
|
|
|
# One <hh:style ...> start tag or empty-element tag. Quoted attribute values
# are consumed as a unit, so a "/" or ">" inside a value (e.g. name="A/B")
# cannot end the tag early. The \b keeps <hh:styles> from matching.
_STYLE_TAG_RE = re.compile(r'<hh:style\b((?:"[^"]*"|\'[^\']*\'|[^>"\'])*)>')

# A single name="value" / name='value' pair inside a tag.
_STYLE_ATTR_RE = re.compile(r'([\w:.\-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')')

# Attributes copied into the result, in output order: integers first, then
# strings.
_STYLE_INT_ATTRS = ("id", "paraPrIDRef", "charPrIDRef", "nextStyleIDRef")
_STYLE_STR_ATTRS = ("type", "name", "engName")


def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract style definitions from the HWPX header XML.

    Every ``<hh:style ...>`` element found in the header XML becomes one
    dict. Only attributes actually present on the element are copied; no
    default values are generated. Attribute values are returned as written
    in the XML (entity references are not decoded).

    Args:
        raw_xml: Forwarded to ``_get_header_xml`` to locate the header XML.
        parsed: Forwarded to ``_get_header_xml``; optional.

    Returns:
        A list such as::

            [
                {
                    "id": 0, "type": "PARA",
                    "name": "바탕글", "engName": "Normal",
                    "paraPrIDRef": 3, "charPrIDRef": 0,
                    "nextStyleIDRef": 0,
                },
                ...
            ]

        or ``None`` when no header XML is available or it contains no
        ``<hh:style>`` elements.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = []
    for tag in _STYLE_TAG_RE.finditer(header_xml):
        # Parse all attributes once; keep the first occurrence of a name.
        attrs = {}
        for pair in _STYLE_ATTR_RE.finditer(tag.group(1)):
            double_quoted, single_quoted = pair.group(2), pair.group(3)
            value = double_quoted if double_quoted is not None else single_quoted
            attrs.setdefault(pair.group(1), value)

        item = {}
        for key in _STYLE_INT_ATTRS:
            value = attrs.get(key)
            # Non-numeric values are skipped rather than guessed at.
            if value is not None and value.isdecimal():
                item[key] = int(value)
        for key in _STYLE_STR_ATTRS:
            if key in attrs:
                item[key] = attrs[key]

        result.append(item)

    return result or None
|
|
|
|
|
|
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("header_xml"):
|
|
return parsed["header_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "header" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |