Files
test/handlers/tools/numbering.py

136 lines
4.1 KiB
Python

# -*- coding: utf-8 -*-
"""
Numbering / bullet definition extraction.

Actual HWPX tags (header.xml):
    <hh:numbering id="1" start="0">
        <hh:paraHead start="1" level="1" align="LEFT" useInstWidth="1"
                     autoIndent="1" widthAdjust="0" textOffsetType="PERCENT"
                     textOffset="50" numFormat="DIGIT" charPrIDRef="4294967295"
                     checkable="0">^1.</hh:paraHead>
        <hh:paraHead start="1" level="2" ... numFormat="HANGUL_SYLLABLE">^2.</hh:paraHead>
    </hh:numbering>
    <hh:bullet id="1" char="-" useImage="0">
        <hh:paraHead level="0" align="LEFT" .../>
    </hh:bullet>

No default values are generated.
"""
import re
def _find_attr(attrs: str, name: str, value_re: str) -> str | None:
    """Return the raw value of attribute ``name`` from a tag's attribute text.

    ``value_re`` constrains the accepted value (e.g. digits only). ``None`` is
    returned when the attribute is absent or its value does not fit.
    """
    hit = re.search(rf'\b{name}="({value_re})"', attrs)
    return hit.group(1) if hit else None


def _iter_elements(xml: str, tag: str, body_re: str):
    """Yield ``(attrs, body)`` for every ``<hh:tag>`` element found in ``xml``.

    Both ``<hh:tag ...>body</hh:tag>`` and the empty-element form
    ``<hh:tag .../>`` are recognised; the latter yields an empty body.
    Without the empty-element branch a self-closing tag would either be
    skipped or swallow the following sibling up to its closing tag.
    """
    element_re = rf'<hh:{tag}\b([^>]*?)(?:/>|>({body_re})</hh:{tag}>)'
    for hit in re.finditer(element_re, xml, re.DOTALL):
        # group(2) is None when the empty-element branch matched.
        yield hit.group(1), hit.group(2) or ""


def _parse_levels(inner: str) -> list[dict]:
    """Collect one dict per ``<hh:paraHead>`` found inside a numbering body."""
    levels = []
    # The body pattern excludes "<", so a paraHead with child elements is
    # not matched; only plain-text (or empty) heads are collected.
    for attrs, text in _iter_elements(inner, "paraHead", r'[^<]*'):
        level = {}
        if (lv := _find_attr(attrs, "level", r'\d+')) is not None:
            level["level"] = int(lv)
        if (fmt := _find_attr(attrs, "numFormat", r'[^"]+')) is not None:
            level["numFormat"] = fmt
        if (align := _find_attr(attrs, "align", r'[^"]+')) is not None:
            level["align"] = align
        if pattern := text.strip():
            level["pattern"] = pattern
        if level:
            levels.append(level)
    return levels


def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract numbering and bullet definitions from the header XML.

    The header text is obtained through ``_get_header_xml(raw_xml, parsed)``.
    Only attributes that are actually present in the XML are emitted; no
    default values are filled in.

    Returns:
        ``None`` when no header XML is available or nothing was found,
        otherwise a dict with the keys that had matches::

            {
                "numberings": [
                    {
                        "id": 1, "start": 0,
                        "levels": [
                            {"level": 1, "numFormat": "DIGIT",
                             "align": "LEFT", "pattern": "^1."},
                            {"level": 2, "numFormat": "HANGUL_SYLLABLE",
                             "pattern": "^2."},
                            ...
                        ]
                    }
                ],
                "bullets": [
                    {"id": 1, "char": "-", "useImage": False}
                ]
            }

        Empty-element ``<hh:paraHead .../>`` levels are included too; they
        simply carry no ``"pattern"`` key.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    result = {}

    # -- numberings --
    numberings = []
    for attrs, inner in _iter_elements(header_xml, "numbering", r'.*?'):
        num = {}
        if (num_id := _find_attr(attrs, "id", r'\d+')) is not None:
            num["id"] = int(num_id)
        if (start := _find_attr(attrs, "start", r'\d+')) is not None:
            num["start"] = int(start)
        if levels := _parse_levels(inner):
            num["levels"] = levels
        numberings.append(num)
    if numberings:
        result["numberings"] = numberings

    # -- bullets --
    bullets = []
    # Only the opening-tag attributes are used; the bullet body is ignored.
    for attrs, _body in _iter_elements(header_xml, "bullet", r'.*?'):
        bullet = {}
        if (bullet_id := _find_attr(attrs, "id", r'\d+')) is not None:
            bullet["id"] = int(bullet_id)
        # An empty char="" is kept as an empty string, hence the "*" quantifier.
        if (char := _find_attr(attrs, "char", r'[^"]*')) is not None:
            bullet["char"] = char
        if (use_image := _find_attr(attrs, "useImage", r'\d+')) is not None:
            bullet["useImage"] = bool(int(use_image))
        bullets.append(bullet)
    if bullets:
        result["bullets"] = bullets

    return result or None
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
if parsed and parsed.get("header_xml"):
return parsed["header_xml"]
if isinstance(raw_xml, dict):
for name, content in raw_xml.items():
if "header" in name.lower() and isinstance(content, str):
return content
return raw_xml if isinstance(raw_xml, str) else None