133 lines
4.1 KiB
Python
133 lines
4.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§4 Character shape (CharShape) extraction.

Actual HWPX tags (header.xml):
  <hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none"
             useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="2">
    <hh:fontRef hangul="7" latin="6" hanja="6" .../>
    <hh:ratio hangul="100" latin="100" .../>
    <hh:spacing hangul="0" latin="0" .../>
    <hh:relSz hangul="100" latin="100" .../>
    <hh:offset hangul="0" latin="0" .../>
    <hh:bold/>                       <!-- bold when present -->
    <hh:italic/>                     <!-- italic when present -->
    <hh:underline type="NONE" shape="SOLID" color="#000000"/>
    <hh:strikeout shape="NONE" color="#000000"/>
  </hh:charPr>

No default values are generated.
|
|
"""
|
|
|
|
import re
|
|
|
|
from domain.hwpx.hwpx_utils import charsize_to_pt
|
|
|
|
|
|
def extract(raw_xml: dict, parsed: dict = None) -> list | None:
    """Extract every §4 character-shape (``hh:charPr``) entry from header.xml.

    The header text is obtained from ``_get_header_xml(raw_xml, parsed)``.
    Each ``<hh:charPr ...> ... </hh:charPr>`` element found in it becomes one
    dict.  Only values actually present in the XML are emitted; the exceptions
    are ``bold`` and ``italic``, which are always present because they are
    derived from whether the corresponding child tag exists.

    Args:
        raw_xml: Forwarded unchanged to ``_get_header_xml``.
        parsed: Forwarded unchanged to ``_get_header_xml``.

    Returns:
        A list of dicts in the order the elements appear, e.g.::

            [
                {
                    "id": 0,
                    "height_pt": 10.0,
                    "textColor": "#000000",
                    "borderFillIDRef": 2,
                    "bold": False,
                    "italic": False,
                    "fontRef": {"hangul": 7, "latin": 6, ...},
                    "ratio": {"hangul": 100, "latin": 100, ...},
                    "spacing": {"hangul": 0, "latin": 0, ...},
                    "underline": "NONE",
                    "strikeout": "NONE",
                },
                ...
            ]

        ``None`` when no header text is available or it contains no
        ``hh:charPr`` block.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # Only blocks with a body are wanted.  The (?<!/) lookbehind rejects a
    # self-closing <hh:charPr .../>; without it the lazy body would run on to
    # the *next* element's </hh:charPr> and merge two entries into one.
    blocks = re.findall(
        r'<hh:charPr\b([^>]*)(?<!/)>(.*?)</hh:charPr>',
        header_xml, re.DOTALL,
    )
    if not blocks:
        return None

    return [_parse_char_pr(attrs_str, inner) for attrs_str, inner in blocks]


def _parse_char_pr(attrs_str: str, inner: str) -> dict:
    """Build the result dict for one charPr from its attribute text and body."""
    item = {}

    # Attributes on the <hh:charPr> opening tag.
    if m := re.search(r'\bid="(\d+)"', attrs_str):
        item["id"] = int(m.group(1))

    if m := re.search(r'\bheight="(\d+)"', attrs_str):
        item["height_pt"] = charsize_to_pt(int(m.group(1)))

    if m := re.search(r'\btextColor="([^"]+)"', attrs_str):
        item["textColor"] = m.group(1)

    # shadeColor="none" means "no shading", so it is left out entirely.
    m = re.search(r'\bshadeColor="([^"]+)"', attrs_str)
    if m and m.group(1) != "none":
        item["shadeColor"] = m.group(1)

    if m := re.search(r'\bborderFillIDRef="(\d+)"', attrs_str):
        item["borderFillIDRef"] = int(m.group(1))

    # bold / italic are flagged purely by the presence of the child tag.
    item["bold"] = bool(re.search(r'<hh:bold\s*/?>', inner))
    item["italic"] = bool(re.search(r'<hh:italic\s*/?>', inner))

    # Per-language integer tables; each is stored under its own tag name.
    for tag in ("fontRef", "ratio", "spacing"):
        if m := re.search(rf'<hh:{tag}\b([^/]*)/>', inner):
            item[tag] = _parse_lang_attrs(m.group(1))

    if m := re.search(r'<hh:underline\b[^>]*\btype="([^"]+)"', inner):
        item["underline"] = m.group(1)

    if m := re.search(r'<hh:strikeout\b[^>]*\bshape="([^"]+)"', inner):
        item["strikeout"] = m.group(1)

    return item
|
|
|
|
|
|
def _parse_lang_attrs(attrs_str: str) -> dict:
|
|
"""hangul="7" latin="6" ... → {"hangul": 7, "latin": 6, ...}"""
|
|
pairs = re.findall(r'(\w+)="(-?\d+)"', attrs_str)
|
|
return {k: int(v) for k, v in pairs}
|
|
|
|
|
|
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
if parsed and parsed.get("header_xml"):
|
|
return parsed["header_xml"]
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "header" in name.lower() and isinstance(content, str):
|
|
return content
|
|
return raw_xml if isinstance(raw_xml, str) else None |