Upload hwpx_generator.py

This commit is contained in:
2026-03-19 09:02:25 +09:00
parent fd8cc4d5cb
commit 62e9cd54d0

View File

@@ -0,0 +1,468 @@
"""
HWPX
깃린
StyleAnalyzer 寃곌낵瑜 諛
ㅽ⑸ HWPX
import os
import zipfile
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from dataclasses import dataclass
from pathlib import Path
from style_analyzer import StyleAnalyzer, StyledElement
from hwp_style_mapping import HwpStyleMapper, HwpStyle, ROLE_TO_STYLE_NAME
@dataclass
class HwpxConfig:
    """Settings for a generated HWPX document."""

    # A4 paper width (hwpunit, 1/7200 inch)
    paper_width: int = 59528
    # A4 paper height (same unit as paper_width)
    paper_height: int = 84188
    # 10pt (hwpunit)
    default_font_size: int = 1000
class HwpxGenerator:
"""HWPX
깃린"""
def __init__(self, config: Optional[HwpxConfig] = None):
    """Prepare a generator instance.

    Args:
        config: Document settings to use. A default ``HwpxConfig`` is
            created when no value (or a falsy one) is supplied.
    """
    if config:
        self.config = config
    else:
        self.config = HwpxConfig()
    # Resolves element roles to concrete style attributes.
    self.mapper = HwpStyleMapper()
    # Roles present in the document being generated; filled by generate().
    self.used_styles: set = set()
def generate(self, elements: List[StyledElement], output_path: str) -> str:
    """Build an HWPX file from a list of StyledElement objects.

    The package parts are written to a private staging directory and then
    zipped into ``output_path``. The staging directory is removed again
    whether or not generation succeeds.

    Args:
        elements: Styled elements; each one's ``role`` is read here and the
            whole list is handed on to ``_create_content``.
        output_path: Destination path of the .hwpx file.

    Returns:
        ``output_path``, unchanged.
    """
    # Function-scope import, in the same way the original imported shutil.
    import tempfile

    # The elements are traversed here and again when the section body is
    # written, so a one-shot iterable would be exhausted the second time.
    elements = list(elements)

    # Collect the roles in use; the header declares one entry per role.
    self.used_styles = {e.role for e in elements}

    # The previous staging directory was "<output>.temp" created with
    # parents=True, which also created the output's parent directories.
    # Keep that side effect explicitly.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # A fresh, uniquely named directory instead of the fixed sibling
    # "<output>.temp": an unrelated directory of that name would have had
    # its contents packaged and then been deleted, and a file of that name
    # made mkdir fail.
    with tempfile.TemporaryDirectory(prefix="hwpx_") as staging:
        temp_dir = Path(staging)

        # Write the individual package parts.
        self._create_mimetype(temp_dir)
        self._create_meta_inf(temp_dir)
        self._create_version(temp_dir)
        self._create_header(temp_dir)
        self._create_content(temp_dir, elements)
        self._create_settings(temp_dir)

        # Compress the staged files into the final archive.
        self._create_hwpx(temp_dir, output_path)

    return output_path
def _create_mimetype(self, temp_dir: Path):
"""mimetype
"""
mimetype_path = temp_dir / "mimetype"
mimetype_path.write_text("application/hwp+zip")
def _create_meta_inf(self, temp_dir: Path):
"""META-INF/manifest.xml
"""
meta_dir = temp_dir / "META-INF"
meta_dir.mkdir(exist_ok=True)
manifest = """<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
<manifest:file-entry manifest:full-path="/" manifest:media-type="application/hwp+zip"/>
<manifest:file-entry manifest:full-path="version.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="Contents/header.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="Contents/section0.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="settings.xml" manifest:media-type="application/xml"/>
</manifest:manifest>"""
(meta_dir / "manifest.xml").write_text(manifest, encoding='utf-8')
def _create_version(self, temp_dir: Path):
"""version.xml
"""
version = """<?xml version="1.0" encoding="UTF-8"?>
<hh:HWPMLVersion xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head" version="1.1"/>"""
(temp_dir / "version.xml").write_text(version, encoding='utf-8')
def _create_header(self, temp_dir: Path):
"""Contents/header.xml
(ㅽы )"""
contents_dir = temp_dir / "Contents"
contents_dir.mkdir(exist_ok=True)
# ㅽ
char_props_xml = self._generate_char_properties()
para_props_xml = self._generate_para_properties()
styles_xml = self._generate_styles_xml()
header = f"""<?xml version="1.0" encoding="UTF-8"?>
<hh:head xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core"
xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
version="1.5" secCnt="1">
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
<hh:refList>
<hh:fontfaces itemCnt="7">
<hh:fontface lang="HANGUL" fontCnt="2">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="蹂몄шщ" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="LATIN" fontCnt="2">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="蹂몄шщ" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="HANJA" fontCnt="2">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="蹂몄шщ" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="JAPANESE" fontCnt="1">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="OTHER" fontCnt="1">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="SYMBOL" fontCnt="1">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="USER" fontCnt="1">
<hh:font id="0" face="留 怨" type="TTF" isEmbedded="0"/>
</hh:fontface>
</hh:fontfaces>
<hh:borderFills itemCnt="2">
<hh:borderFill id="1" threeD="0" shadow="0" centerLine="NONE">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
</hh:borderFill>
<hh:borderFill id="2" threeD="0" shadow="0" centerLine="NONE">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
<hc:fillBrush><hc:winBrush faceColor="none" hatchColor="#000000" alpha="0"/></hc:fillBrush>
</hh:borderFill>
</hh:borderFills>
{char_props_xml}
{para_props_xml}
{styles_xml}
</hh:refList>
<hh:compatibleDocument targetProgram="HWP201X"/>
<hh:docOption>
<hh:linkinfo path="" pageInherit="1" footnoteInherit="0"/>
</hh:docOption>
</hh:head>"""
(contents_dir / "header.xml").write_text(header, encoding='utf-8')
def _generate_char_properties(self) -> str:
"""
XML
"""
lines = [f' <hh:charProperties itemCnt="{len(self.used_styles) + 1}">']
# 湲곕낯 湲
(id=0)
lines.append(''' <hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">
<hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:underline type="NONE" shape="SOLID" color="#000000"/>
<hh:strikeout shape="NONE" color="#000000"/>
<hh:outline type="NONE"/>
<hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>
</hh:charPr>''')
# 湲
for idx, role in enumerate(sorted(self.used_styles), start=1):
style = self.mapper.get_style(role)
height = int(style.font_size * 100) # pt hwpunit
color = style.font_color.lstrip('#')
font_id = "1" if style.font_bold else "0" # 援듦 硫 蹂몄шщ
lines.append(f''' <hh:charPr id="{idx}" height="{height}" textColor="#{color}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">
<hh:fontRef hangul="{font_id}" latin="{font_id}" hanja="{font_id}" japanese="{font_id}" other="{font_id}" symbol="{font_id}" user="{font_id}"/>
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:underline type="NONE" shape="SOLID" color="#000000"/>
<hh:strikeout shape="NONE" color="#000000"/>
<hh:outline type="NONE"/>
<hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>
</hh:charPr>''')
lines.append(' </hh:charProperties>')
return '\n'.join(lines)
def _generate_para_properties(self) -> str:
"""臾몃
XML
"""
lines = [f' <hh:paraProperties itemCnt="{len(self.used_styles) + 1}">']
# 湲곕낯 臾몃
(id=0)
lines.append(''' <hh:paraPr id="0" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">
<hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>
<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>
<hp:switch xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph">
<hp:case hp:required-namespace="http://www.hancom.co.kr/hwpml/2016/HwpUnitChar">
<hh:margin><hc:intent value="0" unit="HWPUNIT"/><hc:left value="0" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="0" unit="HWPUNIT"/><hc:next value="0" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="160" unit="HWPUNIT"/>
</hp:case>
<hp:default>
<hh:margin><hc:intent value="0" unit="HWPUNIT"/><hc:left value="0" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="0" unit="HWPUNIT"/><hc:next value="0" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="160" unit="HWPUNIT"/>
</hp:default>
</hp:switch>
<hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
</hh:paraPr>''')
# 臾몃
align_map = {"left": "LEFT", "center": "CENTER", "right": "RIGHT", "justify": "JUSTIFY"}
for idx, role in enumerate(sorted(self.used_styles), start=1):
style = self.mapper.get_style(role)
align_val = align_map.get(style.align, "JUSTIFY")
line_spacing = int(style.line_spacing)
left_margin = int(style.indent_left * 100)
indent = int(style.indent_first * 100)
space_before = int(style.space_before * 100)
space_after = int(style.space_after * 100)
lines.append(f''' <hh:paraPr id="{idx}" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">
<hh:align horizontal="{align_val}" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>
<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>
<hp:switch xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph">
<hp:case hp:required-namespace="http://www.hancom.co.kr/hwpml/2016/HwpUnitChar">
<hh:margin><hc:intent value="{indent}" unit="HWPUNIT"/><hc:left value="{left_margin}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{space_before}" unit="HWPUNIT"/><hc:next value="{space_after}" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="{line_spacing}" unit="HWPUNIT"/>
</hp:case>
<hp:default>
<hh:margin><hc:intent value="{indent}" unit="HWPUNIT"/><hc:left value="{left_margin}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{space_before}" unit="HWPUNIT"/><hc:next value="{space_after}" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="{line_spacing}" unit="HWPUNIT"/>
</hp:default>
</hp:switch>
<hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
</hh:paraPr>''')
lines.append(' </hh:paraProperties>')
return '\n'.join(lines)
def _generate_styles_xml(self) -> str:
"""
(charPrIDRef, paraPrIDRef 李몄)"""
lines = [f' <hh:styles itemCnt="{len(self.used_styles) + 1}">']
# 湲곕낯 ㅽ湲)
lines.append(' <hh:style id="0" type="PARA" name="諛湲" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langID="1042" lockForm="0"/>')
# ㅽ
(蹂몃Ц + ㅽ XML
paragraphs = []
current_table = None
# ㅽ깆 留ㅽ
role_to_idx = {role: idx for idx, role in enumerate(sorted(self.used_styles), start=1)}
for elem in elements:
style = self.mapper.get_style(elem.role)
style_idx = role_to_idx.get(elem.role, 0)
# 뱀
if elem.role in ["TH", "TD", "TABLE_CAPTION", "TABLE", "FIGURE"]:
continue # /洹몃┝ 蹂
# 몃
para_xml = self._create_paragraph(elem.text, style, style_idx)
paragraphs.append(para_xml)
section = f"""<?xml version="1.0" encoding="UTF-8"?>
<hs:sec xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core">
{"".join(paragraphs)}
</hs:sec>"""
(contents_dir / "section0.xml").write_text(section, encoding='utf-8')
def _create_paragraph(self, text: str, style: HwpStyle, style_idx: int) -> str:
    """Render one ``<hp:p>`` paragraph element.

    Args:
        text: Raw paragraph text; it is passed through ``_escape_xml``
            before being embedded.
        style: Style resolved for the element's role. Not read here.
        style_idx: Id written to the paragraph's ``paraPrIDRef`` and
            ``styleIDRef`` and to the run's ``charPrIDRef``.

    Returns:
        The paragraph XML, starting with a newline.
    """
    escaped = self._escape_xml(text)
    ref = style_idx
    return f'''
<hp:p xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
paraPrIDRef="{ref}" styleIDRef="{ref}" pageBreak="0" columnBreak="0" merged="0">
<hp:run charPrIDRef="{ref}">
<hp:t>{escaped}</hp:t>
</hp:run>
</hp:p>'''
def _escape_xml(self, text: str) -> str:
"""XML 뱀몄ㅼ
"""
return (text
.replace("&", "&amp;")
.replace("<", "&lt;")
.replace(">", "&gt;")
.replace('"', "&quot;")
.replace("'", "&apos;"))
def _create_settings(self, temp_dir: Path):
"""settings.xml
"""
settings = """<?xml version="1.0" encoding="UTF-8"?>
<hs:settings xmlns:hs="http://www.hancom.co.kr/hwpml/2011/settings">
<hs:viewSetting>
<hs:viewType val="printView"/>
<hs:zoom val="100"/>
</hs:viewSetting>
</hs:settings>"""
(temp_dir / "settings.xml").write_text(settings, encoding='utf-8')
def _create_hwpx(self, temp_dir: Path, output_path: str):
"""HWPX
(ZIP )"""
with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
# mimetype 異 泥 踰吏몃
mimetype_path = temp_dir / "mimetype"
zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED)
# 몄
for root, dirs, files in os.walk(temp_dir):
for file in files:
if file == "mimetype":
continue
file_path = Path(root) / file
arcname = file_path.relative_to(temp_dir)
zf.write(file_path, arcname)
def convert_html_to_hwpx(html: str, output_path: str) -> str:
"""
HTML HWPX 蹂몄
output_path:
Returns:
"""
# 1. HTML 遺
猷: {len(elements)}媛 ")
for role, count in analyzer.get_role_summary().items():
print(f" {role}: {count}")
# 2. HWPX
generator = HwpxGenerator()
result_path = generator.generate(elements, output_path)
print(f"
猷: {result_path}")
return result_path
if __name__ == "__main__":
    # Manual smoke test: convert a small sample HTML document to HWPX.
    # NOTE(review): the Korean text in the sample below appears mis-encoded
    # (mojibake) in this copy of the file, and the first <h2> is closed with
    # </h1> — confirm both against the original source.
    test_html = """
<html>
<body>
<div class="box-cover">
<h1>嫄댁
ㅒ룻紐 DX ㅻТ吏移 </h1>
<h2> /UAV쨌GIS쨌吏 /吏諛⑤ 湲곕
1 </h1>
</div>
<h1>1. </h1>
<p> 蹂닿
嫄댁
ㅻТ 吏移
怨듯⑸.</p>
<h2>1.1 諛곌꼍</h2>
<p>理洹 濡怨 GIS 湲곗
臾닿 ш .</p>
<h3>1.1.1 湲곗 </h3>
<p>1) <strong>
</strong></p>
<p>
湲곗 諛⑹鍮
깆ш .</p>
<p>(1) <strong>RTK </strong></p>
<p>ㅼ媛
蹂댁 湲곕μ
媛異濡 .</p>
<ul>
<li>怨諛 GPS </li>
<li>
쇳곕
</li>
</ul>
</body>
</html>
"""
    # NOTE(review): hard-coded absolute output path; presumably only valid
    # in the author's environment — verify before running elsewhere.
    output = "/home/claude/test_output.hwpx"
    convert_html_to_hwpx(test_html, output)