Files
test/handlers/tools/section.py

120 lines
3.6 KiB
Python

# -*- coding: utf-8 -*-
"""
§9 구역 정의(Section) 추출
HWPX 실제 태그 (section0.xml):
<hp:secPr id="" textDirection="HORIZONTAL" spaceColumns="1134"
tabStop="8000" tabStopVal="4000" tabStopUnit="HWPUNIT"
outlineShapeIDRef="1" ...>
<hp:grid lineGrid="0" charGrid="0" .../>
<hp:startNum pageStartsOn="BOTH" page="0" .../>
<hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
<hp:pagePr landscape="WIDELY" width="59528" height="84188" ...>
<hp:margin header="4251" footer="4251" left="5669" right="5669"
top="2834" bottom="2834"/>
<hp:pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="-"/>
</hp:secPr>
디폴트값 생성 안 함.
"""
import re
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
"""§9 구역 속성 추출.
Returns:
{
"textDirection": "HORIZONTAL",
"hideFirstHeader": False,
"hideFirstFooter": False,
"pageNum": {"pos": "BOTTOM_CENTER", "formatType": "DIGIT",
"sideChar": "-"},
"startNum": {"page": 0},
"colDef": None,
}
"""
section_xml = _get_section_xml(raw_xml, parsed)
if not section_xml:
return None
sec_match = re.search(
r'<hp:secPr\b([^>]*)>(.*?)</hp:secPr>',
section_xml, re.DOTALL
)
if not sec_match:
return None
attrs_str = sec_match.group(1)
inner = sec_match.group(2)
result = {}
# textDirection
td = re.search(r'\btextDirection="([^"]+)"', attrs_str)
if td:
result["textDirection"] = td.group(1)
# visibility
vis = re.search(r'<hp:visibility\b([^/]*)/?>', inner)
if vis:
v = vis.group(1)
for attr in ["hideFirstHeader", "hideFirstFooter",
"hideFirstMasterPage", "hideFirstPageNum",
"hideFirstEmptyLine"]:
m = re.search(rf'\b{attr}="(\d+)"', v)
if m:
result[attr] = bool(int(m.group(1)))
# startNum
sn = re.search(r'<hp:startNum\b([^/]*)/?>', inner)
if sn:
sns = sn.group(1)
start = {}
pso = re.search(r'\bpageStartsOn="([^"]+)"', sns)
if pso:
start["pageStartsOn"] = pso.group(1)
pg = re.search(r'\bpage="(\d+)"', sns)
if pg:
start["page"] = int(pg.group(1))
if start:
result["startNum"] = start
# pageNum
pn = re.search(r'<hp:pageNum\b([^/]*)/?>', inner)
if pn:
pns = pn.group(1)
pagenum = {}
for attr in ["pos", "formatType", "sideChar"]:
m = re.search(rf'\b{attr}="([^"]*)"', pns)
if m:
pagenum[attr] = m.group(1)
if pagenum:
result["pageNum"] = pagenum
# colDef (단 설정)
cd = re.search(r'<hp:colDef\b([^>]*)>(.*?)</hp:colDef>', inner, re.DOTALL)
if cd:
cds = cd.group(1)
coldef = {}
cnt = re.search(r'\bcount="(\d+)"', cds)
if cnt:
coldef["count"] = int(cnt.group(1))
layout = re.search(r'\blayout="([^"]+)"', cds)
if layout:
coldef["layout"] = layout.group(1)
if coldef:
result["colDef"] = coldef
return result if result else None
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("section_xml"):
return parsed["section_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "section" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None