136 lines
4.1 KiB
Python
136 lines
4.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
번호매기기(Numbering) / 글머리표(Bullet) 추출
|
|
|
|
HWPX 실제 태그 (header.xml):
|
|
<hh:numbering id="1" start="0">
|
|
<hh:paraHead start="1" level="1" align="LEFT" useInstWidth="1"
|
|
autoIndent="1" widthAdjust="0" textOffsetType="PERCENT"
|
|
textOffset="50" numFormat="DIGIT" charPrIDRef="4294967295"
|
|
checkable="0">^1.</hh:paraHead>
|
|
<hh:paraHead start="1" level="2" ... numFormat="HANGUL_SYLLABLE">^2.</hh:paraHead>
|
|
</hh:numbering>
|
|
|
|
<hh:bullet id="1" char="-" useImage="0">
|
|
<hh:paraHead level="0" align="LEFT" .../>
|
|
</hh:bullet>
|
|
|
|
디폴트값 생성 안 함.
|
|
"""
|
|
|
|
import re
|
|
|
|
|
|
# Each pattern accepts both the paired form (<tag ...>body</tag>) and the
# self-closing form (<tag .../>). Group 1 is the raw attribute text; group 2
# is the element body and is None for a self-closing element.
_NUMBERING_RE = re.compile(
    r'<hh:numbering\b([^>]*?)(?:/>|>(.*?)</hh:numbering>)', re.DOTALL
)
_PARA_HEAD_RE = re.compile(
    r'<hh:paraHead\b([^>]*?)(?:/>|>([^<]*)</hh:paraHead>)'
)
_BULLET_RE = re.compile(
    r'<hh:bullet\b([^>]*?)(?:/>|>(.*?)</hh:bullet>)', re.DOTALL
)


def extract(raw_xml: dict | str, parsed: dict | None = None) -> dict | None:
    """Extract numbering and bullet definitions from the HWPX header XML.

    The header text is obtained through ``_get_header_xml(raw_xml, parsed)``
    and scanned with regular expressions. Only attributes that are actually
    present in the XML are emitted; no default values are filled in.
    Attribute values and patterns are returned exactly as written in the XML
    (no entity decoding is performed).

    Args:
        raw_xml: Forwarded to ``_get_header_xml``.
        parsed: Forwarded to ``_get_header_xml``.

    Returns:
        ``None`` when no header XML is available or when it contains neither
        numbering nor bullet elements. Otherwise a dict holding whichever of
        these keys has at least one entry::

            {
                "numberings": [
                    {
                        "id": 1, "start": 0,
                        "levels": [
                            {"level": 1, "numFormat": "DIGIT",
                             "align": "LEFT", "pattern": "^1."},
                            {"level": 2, "numFormat": "HANGUL_SYLLABLE",
                             "pattern": "^2."},
                            ...
                        ]
                    }
                ],
                "bullets": [
                    {"id": 1, "char": "-", "useImage": False}
                ]
            }

        A self-closing ``<hh:paraHead .../>`` inside a numbering yields a
        level entry without a ``"pattern"`` key.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = {}

    numberings = _parse_numberings(header_xml)
    if numberings:
        result["numberings"] = numberings

    bullets = _parse_bullets(header_xml)
    if bullets:
        result["bullets"] = bullets

    return result or None


def _int_attr(attrs: str, name: str) -> int | None:
    """Return attribute *name* as an int, or None if absent or non-numeric."""
    found = re.search(rf'\b{name}="(\d+)"', attrs)
    return int(found.group(1)) if found else None


def _str_attr(attrs: str, name: str, *, allow_empty: bool = False) -> str | None:
    """Return attribute *name* as text, or None if it is absent.

    An empty value (``name=""``) counts as absent unless *allow_empty* is set.
    """
    quantifier = "*" if allow_empty else "+"
    found = re.search(rf'\b{name}="([^"]{quantifier})"', attrs)
    return found.group(1) if found else None


def _parse_levels(inner: str) -> list[dict]:
    """Return one dict per <hh:paraHead> found in a numbering body."""
    levels = []
    for head in _PARA_HEAD_RE.finditer(inner):
        head_attrs = head.group(1)
        # group(2) is None for a self-closing paraHead (no pattern text).
        pattern = (head.group(2) or "").strip()

        level = {}
        if (depth := _int_attr(head_attrs, "level")) is not None:
            level["level"] = depth
        if (num_format := _str_attr(head_attrs, "numFormat")) is not None:
            level["numFormat"] = num_format
        if (align := _str_attr(head_attrs, "align")) is not None:
            level["align"] = align
        if pattern:
            level["pattern"] = pattern

        # A paraHead that yielded nothing carries no information; skip it.
        if level:
            levels.append(level)
    return levels


def _parse_numberings(header_xml: str) -> list[dict]:
    """Return one dict per <hh:numbering> element, in document order."""
    numberings = []
    for block in _NUMBERING_RE.finditer(header_xml):
        attrs = block.group(1)
        inner = block.group(2) or ""

        numbering = {}
        if (num_id := _int_attr(attrs, "id")) is not None:
            numbering["id"] = num_id
        if (start := _int_attr(attrs, "start")) is not None:
            numbering["start"] = start

        levels = _parse_levels(inner)
        if levels:
            numbering["levels"] = levels

        # Every numbering element produces an entry, even if it is empty.
        numberings.append(numbering)
    return numberings


def _parse_bullets(header_xml: str) -> list[dict]:
    """Return one dict per <hh:bullet> element, in document order."""
    bullets = []
    for block in _BULLET_RE.finditer(header_xml):
        attrs = block.group(1)

        bullet = {}
        if (bullet_id := _int_attr(attrs, "id")) is not None:
            bullet["id"] = bullet_id
        # char="" is kept as an empty string rather than dropped.
        if (char := _str_attr(attrs, "char", allow_empty=True)) is not None:
            bullet["char"] = char
        if (use_image := _int_attr(attrs, "useImage")) is not None:
            bullet["useImage"] = bool(use_image)

        # Every bullet element produces an entry, even if it is empty.
        bullets.append(bullet)
    return bullets
|
|
|
|
|
|
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("header_xml"):
|
|
return parsed["header_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "header" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |