82 lines
2.2 KiB
Python
82 lines
2.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
§3 글꼴(FaceName) 추출
|
|
|
|
HWPX 실제 태그 (header.xml):
|
|
<hh:fontface lang="HANGUL" fontCnt="9">
|
|
<hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
|
|
<hh:font id="1" face="맑은 고딕" type="TTF" isEmbedded="0">
|
|
</hh:fontface>
|
|
<hh:fontface lang="LATIN" fontCnt="9">
|
|
<hh:font id="0" face="돋움" type="TTF" isEmbedded="0">
|
|
</hh:fontface>
|
|
|
|
디폴트값 생성 안 함. 추출 실패 시 None 반환.
|
|
"""
|
|
|
|
import re
|
|
|
|
|
|
def extract(raw_xml: dict, parsed: dict = None) -> dict | None:
    """Extract the per-language font (FaceName) definitions from header.xml.

    Every ``<hh:fontface lang="...">`` block in the header XML is scanned for
    ``<hh:font>`` start tags, and the fonts found are grouped under the
    block's ``lang`` value.

    Args:
        raw_xml: Forwarded to ``_get_header_xml`` to locate the header XML
            text.
        parsed: Optional, also forwarded to ``_get_header_xml``.

    Returns:
        A dict keyed by the ``lang`` attribute, e.g.::

            {
                "HANGUL": [{"id": 0, "face": "돋움", "type": "TTF"}, ...],
                "LATIN": [{"id": 0, "face": "돋움", "type": "TTF"}, ...],
                "HANJA": [...],
                ...
            }

        ``None`` when the header XML is unavailable, contains no fontface
        block, or no block yields a usable font. No default values are
        generated.
    """
    header_xml = _get_header_xml(raw_xml, parsed)
    if not header_xml:
        return None

    # Split the header into (lang, inner content) pairs, one per fontface.
    fontface_blocks = re.findall(
        r'<hh:fontface\b[^>]*\blang="([^"]+)"[^>]*>(.*?)</hh:fontface>',
        header_xml, re.DOTALL
    )
    if not fontface_blocks:
        return None

    # XML attribute order is not significant, so capture the whole attribute
    # list of each <hh:font> start tag and look the attributes up by name
    # instead of relying on an id -> face -> type ordering.
    font_tag_re = re.compile(r'<hh:font\b([^>]*)>')
    attr_re = re.compile(r'([\w:.-]+)\s*=\s*"([^"]*)"')

    result = {}
    for lang, block_content in fontface_blocks:
        fonts = []
        for tag in font_tag_re.finditer(block_content):
            attrs = dict(attr_re.findall(tag.group(1)))
            font_id = attrs.get("id", "")
            face = attrs.get("face", "")
            font_type = attrs.get("type", "")
            # A font is only usable with a numeric id and non-empty
            # face/type; anything else is skipped rather than defaulted.
            if not (font_id.isdecimal() and face and font_type):
                continue
            fonts.append({
                "id": int(font_id),
                "face": face,
                "type": font_type,
            })

        if fonts:
            # A later block with the same lang replaces an earlier one.
            result[lang] = fonts

    return result if result else None
|
|
|
|
|
|
def _get_header_xml(raw_xml: dict, parsed: dict = None) -> str | None:
|
|
"""header.xml 문자열을 가져온다."""
|
|
if parsed and parsed.get("header_xml"):
|
|
return parsed["header_xml"]
|
|
|
|
if isinstance(raw_xml, dict):
|
|
for name, content in raw_xml.items():
|
|
if "header" in name.lower() and isinstance(content, str):
|
|
return content
|
|
|
|
if isinstance(raw_xml, str):
|
|
return raw_xml
|
|
|
|
return None |