# -*- coding: utf-8 -*-
"""
스타일 정의(Style) 추출
HWPX 실제 태그 (header.xml):
charPrIDRef → charPr(글자모양), paraPrIDRef → paraPr(문단모양) 연결.
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract style definitions from the HWPX header XML.

    Each ``<hh:style ...>`` element becomes one dict that holds only the
    attributes actually present on the tag; no default values are invented.

    Args:
        raw_xml: Mapping of file name to XML text. The first string entry
            whose name contains "header" is used; a plain XML string is
            also accepted.
        parsed: Optional dict; a non-empty ``"header_xml"`` entry takes
            precedence over ``raw_xml``.

    Returns:
        A list of style dicts, for example::

            [
                {
                    "id": 0, "type": "PARA",
                    "name": "바탕글", "engName": "Normal",
                    "paraPrIDRef": 3, "charPrIDRef": 0,
                    "nextStyleIDRef": 0,
                },
                ...
            ]

        or ``None`` when no header XML or no style element is found.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # The word boundary after "style" keeps the container element
    # <hh:styles itemCnt="..."> from being picked up as a style entry.
    style_tags = re.findall(r'<hh:style\b[^>]*>', header_xml)
    if not style_tags:
        return None

    int_attrs = ("id", "paraPrIDRef", "charPrIDRef", "nextStyleIDRef")
    str_attrs = ("type", "name", "engName")

    result = []
    for tag in style_tags:
        # Parse every attribute once so lookups are by exact attribute name
        # rather than by a suffix match inside a longer name.
        attrs = dict(re.findall(r'([\w:.-]+)\s*=\s*"([^"]*)"', tag))

        item = {}
        for key in int_attrs:
            value = attrs.get(key)
            # Non-numeric values are skipped, never defaulted.
            if value is not None and value.isdecimal():
                item[key] = int(value)
        for key in str_attrs:
            if key in attrs:
                item[key] = attrs[key]
        result.append(item)

    return result if result else None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None