Files
test/handlers/tools/para_style.py

185 lines
5.7 KiB
Python

# -*- coding: utf-8 -*-
"""
§5 Paragraph shape (ParaShape) extraction.
Actual HWPX tags (header.xml):
<hh:paraPr id="0" tabPrIDRef="1" condense="0" ...>
<hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD"
widowOrphan="0" keepWithNext="0" keepLines="0"
pageBreakBefore="0" lineWrap="BREAK"/>
<hp:case ...>
<hh:margin>
<hc:intent value="-1310" unit="HWPUNIT"/>
<hc:left value="0" unit="HWPUNIT"/>
<hc:right value="0" unit="HWPUNIT"/>
<hc:prev value="0" unit="HWPUNIT"/>
<hc:next value="0" unit="HWPUNIT"/>
</hh:margin>
<hh:lineSpacing type="PERCENT" value="130" unit="HWPUNIT"/>
</hp:case>
<hh:border borderFillIDRef="2" .../>
</hh:paraPr>
No default values are generated.
"""
import re
from domain.hwpx.hwpx_utils import hwpunit_to_mm
# Matches both <hh:paraPr ...>...</hh:paraPr> and the self-closing
# <hh:paraPr .../> form, so a self-closing element can no longer swallow
# the element that follows it.
_PARA_PR_RE = re.compile(
    r'<hh:paraPr\b([^>]*?)(?:/>|>(.*?)</hh:paraPr>)', re.DOTALL
)
# The hp:case alternative whose required-namespace mentions HwpUnitChar.
_HWPUNITCHAR_CASE_RE = re.compile(
    r'<hp:case\b[^>]*required-namespace="[^"]*HwpUnitChar[^"]*"[^>]*>'
    r'(.*?)</hp:case>',
    re.DOTALL,
)
# (output key, compiled pattern) for each margin child element.
# "intent" is the tag name as it appears in the HWPX sample in this file.
_MARGIN_RES = tuple(
    (key, re.compile(rf'<hc:{tag}\b[^>]*\bvalue="(-?\d+)"'))
    for tag, key in (
        ("intent", "indent_hu"),
        ("left", "left_hu"),
        ("right", "right_hu"),
        ("prev", "before_hu"),
        ("next", "after_hu"),
    )
)


def _first_complete_tag(src: str, tag: str, attr_specs) -> list[str] | None:
    """Return the values of the first ``<tag ...>`` carrying every attribute.

    ``attr_specs`` is an iterable of ``(attribute name, value regex)`` pairs.
    Attributes are looked up independently of each other, so their order
    inside the tag does not matter. Returns ``None`` when no tag in *src*
    has all of the requested attributes.
    """
    for attrs in re.findall(rf'<{re.escape(tag)}\b([^>]*)>', src):
        values = []
        for name, value_re in attr_specs:
            m = re.search(rf'\b{re.escape(name)}="({value_re})"', attrs)
            if m is None:
                break
            values.append(m.group(1))
        else:
            return values
    return None


def extract(
    raw_xml: dict | str, parsed: dict | None = None
) -> list[dict] | None:
    """Extract the full list of §5 paraPr (paragraph shape) entries.

    The header XML text is obtained through ``_get_header_xml(raw_xml,
    parsed)``. Every ``<hh:paraPr>`` element found in it produces one dict.
    A key is only emitted when the corresponding tag/attribute was found;
    no default values are generated.

    Returns:
        ``None`` when there is no header XML or no paraPr element,
        otherwise a list shaped like::

            [
                {
                    "id": 0,
                    "tabPrIDRef": 1,
                    "align": "JUSTIFY",
                    "verticalAlign": "BASELINE",
                    "heading": {"type": "NONE", "idRef": 0, "level": 0},
                    "breakSetting": {
                        "widowOrphan": False, "keepWithNext": False,
                        "keepLines": False, "pageBreakBefore": False,
                        "lineWrap": "BREAK",
                        "breakLatinWord": "KEEP_WORD",
                        "breakNonLatinWord": "KEEP_WORD",
                    },
                    "margin": {
                        "indent_hu": -1310, "left_hu": 0, "right_hu": 0,
                        "before_hu": 0, "after_hu": 0,
                    },
                    "lineSpacing": {"type": "PERCENT", "value": 130},
                    "borderFillIDRef": 2,
                },
                ...
            ]
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = []
    for block in _PARA_PR_RE.finditer(header_xml):
        attrs_str = block.group(1)
        # A self-closing <hh:paraPr/> has no body.
        inner = block.group(2) or ""
        item = {}

        # id
        id_m = re.search(r'\bid="(\d+)"', attrs_str)
        if id_m:
            item["id"] = int(id_m.group(1))

        # tabPrIDRef
        tab_m = re.search(r'\btabPrIDRef="(\d+)"', attrs_str)
        if tab_m:
            item["tabPrIDRef"] = int(tab_m.group(1))

        # align
        h_align = re.search(
            r'<hh:align\b[^>]*\bhorizontal="([^"]+)"', inner
        )
        if h_align:
            item["align"] = h_align.group(1)
        v_align = re.search(r'<hh:align\b[^>]*\bvertical="([^"]+)"', inner)
        if v_align:
            item["verticalAlign"] = v_align.group(1)

        # heading: all three attributes are required, in any order.
        heading = _first_complete_tag(
            inner,
            "hh:heading",
            (("type", r'[^"]+'), ("idRef", r'\d+'), ("level", r'\d+')),
        )
        if heading:
            h_type, h_id_ref, h_level = heading
            item["heading"] = {
                "type": h_type,
                "idRef": int(h_id_ref),
                "level": int(h_level),
            }

        # breakSetting: capture up to the tag's own '>' so the attribute
        # string never leaks into a following tag.
        bs = re.search(r'<hh:breakSetting\b([^>]*)>', inner)
        if bs:
            bstr = bs.group(1)
            item["breakSetting"] = {
                "widowOrphan": _bool_attr(bstr, "widowOrphan"),
                "keepWithNext": _bool_attr(bstr, "keepWithNext"),
                "keepLines": _bool_attr(bstr, "keepLines"),
                "pageBreakBefore": _bool_attr(bstr, "pageBreakBefore"),
                "lineWrap": _str_attr(bstr, "lineWrap"),
                "breakLatinWord": _str_attr(bstr, "breakLatinWord"),
                "breakNonLatinWord": _str_attr(bstr, "breakNonLatinWord"),
            }

        # margin / lineSpacing: prefer the hp:case block for HwpUnitChar,
        # otherwise search the whole paraPr body.
        case_block = _HWPUNITCHAR_CASE_RE.search(inner)
        margin_src = case_block.group(1) if case_block else inner

        margin = {}
        for key, pattern in _MARGIN_RES:
            m = pattern.search(margin_src)
            if m:
                margin[key] = int(m.group(1))
        if margin:
            item["margin"] = margin

        # lineSpacing: type and value are both required, in any order.
        spacing = _first_complete_tag(
            margin_src,
            "hh:lineSpacing",
            (("type", r'[^"]+'), ("value", r'\d+')),
        )
        if spacing:
            ls_type, ls_value = spacing
            item["lineSpacing"] = {"type": ls_type, "value": int(ls_value)}

        # borderFillIDRef
        bf = re.search(
            r'<hh:border\b[^>]*\bborderFillIDRef="(\d+)"', inner
        )
        if bf:
            item["borderFillIDRef"] = int(bf.group(1))

        result.append(item)

    return result or None
def _bool_attr(s: str, name: str) -> bool | None:
m = re.search(rf'\b{name}="(\d+)"', s)
return bool(int(m.group(1))) if m else None
def _str_attr(s: str, name: str) -> str | None:
m = re.search(rf'\b{name}="([^"]+)"', s)
return m.group(1) if m else None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None