185 lines
5.7 KiB
Python
185 lines
5.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§5 문단 모양(ParaShape) 추출
|
|
|
|
HWPX 실제 태그 (header.xml):
|
|
<hh:paraPr id="0" tabPrIDRef="1" condense="0" ...>
|
|
<hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
|
|
<hh:heading type="NONE" idRef="0" level="0"/>
|
|
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD"
|
|
widowOrphan="0" keepWithNext="0" keepLines="0"
|
|
pageBreakBefore="0" lineWrap="BREAK"/>
|
|
<hp:case ...>
|
|
<hh:margin>
|
|
<hc:intent value="-1310" unit="HWPUNIT"/>
|
|
<hc:left value="0" unit="HWPUNIT"/>
|
|
<hc:right value="0" unit="HWPUNIT"/>
|
|
<hc:prev value="0" unit="HWPUNIT"/>
|
|
<hc:next value="0" unit="HWPUNIT"/>
|
|
</hh:margin>
|
|
<hh:lineSpacing type="PERCENT" value="130" unit="HWPUNIT"/>
|
|
</hp:case>
|
|
<hh:border borderFillIDRef="2" .../>
|
|
</hh:paraPr>
|
|
|
|
디폴트값 생성 안 함.
|
|
"""
|
|
|
|
import re
|
|
|
|
from domain.hwpx.hwpx_utils import hwpunit_to_mm
|
|
|
|
|
|
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every §5 paragraph shape (``hh:paraPr``) from the HWPX header.

    Only values actually present in the XML are emitted; no default values
    are synthesized. Attribute lookups do not depend on the order in which
    attributes are written inside a tag.

    Args:
        raw_xml: Mapping of archive member name to XML text (a bare XML
            string is also accepted); forwarded to ``_get_header_xml``.
        parsed: Optional dict whose truthy ``"header_xml"`` entry is used
            in preference to ``raw_xml``.

    Returns:
        ``None`` when no header XML or no ``hh:paraPr`` element is found,
        otherwise a list with one dict per element, e.g.::

            [
                {
                    "id": 0,
                    "tabPrIDRef": 1,
                    "align": "JUSTIFY",
                    "verticalAlign": "BASELINE",
                    "heading": {"type": "NONE", "idRef": 0, "level": 0},
                    "breakSetting": {
                        "widowOrphan": False, "keepWithNext": False,
                        "keepLines": False, "pageBreakBefore": False,
                        "lineWrap": "BREAK",
                        "breakLatinWord": "KEEP_WORD",
                        "breakNonLatinWord": "KEEP_WORD",
                    },
                    "margin": {
                        "indent_hu": -1310, "left_hu": 0, "right_hu": 0,
                        "before_hu": 0, "after_hu": 0,
                    },
                    "lineSpacing": {"type": "PERCENT", "value": 130},
                    "borderFillIDRef": 2,
                },
                ...
            ]
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # The lazy attribute group plus the "/>" alternative keeps a self-closing
    # <hh:paraPr .../> from swallowing the body of the element that follows.
    # re.findall yields '' for the body group in the self-closing case.
    blocks = re.findall(
        r'<hh:paraPr\b([^>]*?)(?:/>|>(.*?)</hh:paraPr>)',
        header_xml, re.DOTALL
    )
    if not blocks:
        return None

    return [_parse_para_pr(attrs_str, inner) for attrs_str, inner in blocks]


def _parse_para_pr(attrs_str: str, inner: str) -> dict:
    """Build the result dict for one ``hh:paraPr`` from its attributes and body."""
    item = {}

    # Attributes on the <hh:paraPr> tag itself.
    para_id = _int_attr(attrs_str, "id")
    if para_id is not None:
        item["id"] = para_id

    tab_ref = _int_attr(attrs_str, "tabPrIDRef")
    if tab_ref is not None:
        item["tabPrIDRef"] = tab_ref

    # align: horizontal and vertical live on the same tag.
    align_attrs = _tag_attrs(inner, "hh:align")
    if align_attrs is not None:
        horizontal = _str_attr(align_attrs, "horizontal")
        if horizontal is not None:
            item["align"] = horizontal
        vertical = _str_attr(align_attrs, "vertical")
        if vertical is not None:
            item["verticalAlign"] = vertical

    # heading: emitted only when all three attributes are present, in any
    # order (XML attribute order carries no meaning).
    heading_attrs = _tag_attrs(inner, "hh:heading")
    if heading_attrs is not None:
        heading_type = _str_attr(heading_attrs, "type")
        heading_ref = _int_attr(heading_attrs, "idRef")
        heading_level = _int_attr(heading_attrs, "level")
        if None not in (heading_type, heading_ref, heading_level):
            item["heading"] = {
                "type": heading_type,
                "idRef": heading_ref,
                "level": heading_level,
            }

    # breakSetting: individual fields are None when the attribute is absent.
    break_attrs = _tag_attrs(inner, "hh:breakSetting")
    if break_attrs is not None:
        item["breakSetting"] = {
            "widowOrphan": _bool_attr(break_attrs, "widowOrphan"),
            "keepWithNext": _bool_attr(break_attrs, "keepWithNext"),
            "keepLines": _bool_attr(break_attrs, "keepLines"),
            "pageBreakBefore": _bool_attr(break_attrs, "pageBreakBefore"),
            "lineWrap": _str_attr(break_attrs, "lineWrap"),
            "breakLatinWord": _str_attr(break_attrs, "breakLatinWord"),
            "breakNonLatinWord": _str_attr(break_attrs, "breakNonLatinWord"),
        }

    # margin / lineSpacing: prefer the hp:case block whose required-namespace
    # mentions HwpUnitChar; otherwise search the whole paraPr body.
    case_block = re.search(
        r'<hp:case\b[^>]*required-namespace="[^"]*HwpUnitChar[^"]*"[^>]*>'
        r'(.*?)</hp:case>',
        inner, re.DOTALL
    )
    margin_src = case_block.group(1) if case_block else inner

    margin = {}
    for tag, key in (
        # "intent" is the tag name as it appears in header.xml.
        ("intent", "indent_hu"),
        ("left", "left_hu"),
        ("right", "right_hu"),
        ("prev", "before_hu"),
        ("next", "after_hu"),
    ):
        value = _int_attr(_tag_attrs(margin_src, f"hc:{tag}"), "value", signed=True)
        if value is not None:
            margin[key] = value
    if margin:
        item["margin"] = margin

    # lineSpacing: needs both type and value, in any order.
    spacing_attrs = _tag_attrs(margin_src, "hh:lineSpacing")
    if spacing_attrs is not None:
        spacing_type = _str_attr(spacing_attrs, "type")
        spacing_value = _int_attr(spacing_attrs, "value")
        if spacing_type is not None and spacing_value is not None:
            item["lineSpacing"] = {
                "type": spacing_type,
                "value": spacing_value,
            }

    # borderFillIDRef
    border_ref = _int_attr(_tag_attrs(inner, "hh:border"), "borderFillIDRef")
    if border_ref is not None:
        item["borderFillIDRef"] = border_ref

    return item


def _tag_attrs(src: str, tag: str) -> str | None:
    """Return the raw attribute text of the first ``<tag ...>`` in *src*, or None."""
    m = re.search(rf'<{tag}\b([^>]*)>', src)
    return m.group(1) if m else None


def _int_attr(s: str | None, name: str, signed: bool = False) -> int | None:
    """Return integer attribute *name* from attribute text *s*, or None if absent."""
    if s is None:
        return None
    sign = "-?" if signed else ""
    m = re.search(rf'\b{name}="({sign}\d+)"', s)
    return int(m.group(1)) if m else None
|
|
|
|
|
|
def _bool_attr(s: str, name: str) -> bool | None:
|
|
m = re.search(rf'\b{name}="(\d+)"', s)
|
|
return bool(int(m.group(1))) if m else None
|
|
|
|
|
|
def _str_attr(s: str, name: str) -> str | None:
|
|
m = re.search(rf'\b{name}="([^"]+)"', s)
|
|
return m.group(1) if m else None
|
|
|
|
|
|
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("header_xml"):
|
|
return parsed["header_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "header" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |