120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§9 Section definition (Section) extraction.

Actual HWPX tags (section0.xml):
  <hp:secPr id="" textDirection="HORIZONTAL" spaceColumns="1134"
            tabStop="8000" tabStopVal="4000" tabStopUnit="HWPUNIT"
            outlineShapeIDRef="1" ...>
    <hp:grid lineGrid="0" charGrid="0" .../>
    <hp:startNum pageStartsOn="BOTH" page="0" .../>
    <hp:visibility hideFirstHeader="0" hideFirstFooter="0" .../>
    <hp:pagePr landscape="WIDELY" width="59528" height="84188" ...>
      <hp:margin header="4251" footer="4251" left="5669" right="5669"
                 top="2834" bottom="2834"/>
    </hp:pagePr>
    <hp:pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="-"/>
  </hp:secPr>

No default values are generated.
|
|
"""
|
|
|
|
import re
|
|
|
|
|
|
def extract(raw_xml: dict, parsed: dict | None = None) -> dict | None:
    """Extract §9 section properties from the first ``<hp:secPr>`` element.

    The section XML is obtained via ``_get_section_xml(raw_xml, parsed)``.
    Only attributes that are actually present in the XML are reported; no
    default values are synthesised, so every key below is optional.

    Args:
        raw_xml: Passed through to ``_get_section_xml`` to locate the
            section XML text.
        parsed: Optional dict, also passed through to ``_get_section_xml``.

    Returns:
        ``None`` when no section XML or no ``<hp:secPr>...</hp:secPr>``
        element is found, or when none of the known attributes are present.
        Otherwise a dict shaped like::

            {
                "textDirection": "HORIZONTAL",
                "hideFirstHeader": False,
                "hideFirstFooter": False,
                "startNum": {"pageStartsOn": "BOTH", "page": 0},
                "pageNum": {"pos": "BOTTOM_CENTER", "formatType": "DIGIT",
                            "sideChar": "-"},
                "colDef": {"count": 2, "layout": "LEFT"},
            }
    """
    section_xml = _get_section_xml(raw_xml, parsed)
    if not section_xml:
        return None

    sec_match = re.search(
        r'<hp:secPr\b' + _TAG_ATTRS + r'>(.*?)</hp:secPr>',
        section_xml, re.DOTALL
    )
    if not sec_match:
        return None
    sec_attrs, inner = sec_match.groups()

    result = {}

    # textDirection lives on the <hp:secPr> start tag itself.
    text_direction = _attr_value(sec_attrs, "textDirection", r'[^"]+')
    if text_direction is not None:
        result["textDirection"] = text_direction

    # visibility: "0"/"1"-style numeric flags are reported as booleans.
    vis_attrs = _tag_attrs(inner, "visibility")
    if vis_attrs is not None:
        for name in ("hideFirstHeader", "hideFirstFooter",
                     "hideFirstMasterPage", "hideFirstPageNum",
                     "hideFirstEmptyLine"):
            flag = _attr_value(vis_attrs, name, r'\d+')
            if flag is not None:
                result[name] = bool(int(flag))

    # startNum
    start_attrs = _tag_attrs(inner, "startNum")
    if start_attrs is not None:
        start = {}
        starts_on = _attr_value(start_attrs, "pageStartsOn", r'[^"]+')
        if starts_on is not None:
            start["pageStartsOn"] = starts_on
        page = _attr_value(start_attrs, "page", r'\d+')
        if page is not None:
            start["page"] = int(page)
        if start:
            result["startNum"] = start

    # pageNum: values may be empty strings and may contain "/" (sideChar).
    page_num_attrs = _tag_attrs(inner, "pageNum")
    if page_num_attrs is not None:
        page_num = {}
        for name in ("pos", "formatType", "sideChar"):
            value = _attr_value(page_num_attrs, name, r'[^"]*')
            if value is not None:
                page_num[name] = value
        if page_num:
            result["pageNum"] = page_num

    # colDef (column settings): only the start tag is needed, so both
    # <hp:colDef ...>...</hp:colDef> and self-closing <hp:colDef .../> work.
    col_attrs = _tag_attrs(inner, "colDef")
    if col_attrs is not None:
        coldef = {}
        count = _attr_value(col_attrs, "count", r'\d+')
        if count is not None:
            coldef["count"] = int(count)
        layout = _attr_value(col_attrs, "layout", r'[^"]+')
        if layout is not None:
            coldef["layout"] = layout
        if coldef:
            result["colDef"] = coldef

    return result or None


# Regex group capturing the attribute text of an XML start tag. Quoted values
# are consumed as a unit so "/" or ">" inside a value cannot end the tag early.
_TAG_ATTRS = r'''((?:[^>"']|"[^"]*"|'[^']*')*)'''


def _tag_attrs(xml: str, tag: str) -> str | None:
    """Return the raw attribute text of the first ``<hp:{tag}>``, or None."""
    found = re.search(rf'<hp:{tag}\b' + _TAG_ATTRS + '>', xml)
    return found.group(1) if found else None


def _attr_value(attrs: str, name: str, value_pattern: str) -> str | None:
    """Return the double-quoted value of ``name`` if it matches the pattern."""
    found = re.search(rf'\b{name}\s*=\s*"({value_pattern})"', attrs)
    return found.group(1) if found else None
|
|
|
|
|
|
def _get_section_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("section_xml"):
|
|
return parsed["section_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "section" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |