# Files
# test/converters/hwpx_generator.py
# 2026-02-20 11:34:02 +09:00
#
# 431 lines
# 20 KiB
# Python

"""
HWPX 파일 생성기
StyleAnalyzer 결과를 받아 스타일이 적용된 HWPX 파일 생성
"""
import os
import zipfile
import xml.etree.ElementTree as ET
from typing import List, Dict, Optional
from dataclasses import dataclass
from pathlib import Path
from style_analyzer import StyleAnalyzer, StyledElement
from hwp_style_mapping import HwpStyleMapper, HwpStyle, ROLE_TO_STYLE_NAME
@dataclass
class HwpxConfig:
    """Settings for HWPX generation.

    Lengths are expressed in hwpunit (1/7200 inch), so 100 hwpunit is 1 pt.
    """

    paper_width: int = 59528  # A4 width (hwpunit, 1/7200 inch)
    paper_height: int = 84188  # A4 height
    margin_left: int = 8504
    margin_right: int = 8504
    margin_top: int = 5668
    margin_bottom: int = 4252
    default_font: str = "함초롬바탕"
    default_font_size: int = 1000  # 10pt (hwpunit)
class HwpxGenerator:
    """Build an HWPX package (a ZIP archive of XML parts) from styled elements.

    Every distinct element role becomes one character property, one paragraph
    property and one paragraph style in ``Contents/header.xml``. Paragraphs in
    ``Contents/section0.xml`` refer to them by numeric id. Id 0 is the
    built-in default style; roles receive ids 1..N in sorted role order.
    """

    # Roles that need table/figure markup, which is not implemented here.
    # Elements with these roles are left out of the section body.
    _UNSUPPORTED_ROLES = frozenset({"TH", "TD", "TABLE_CAPTION", "TABLE", "FIGURE"})

    def __init__(self, config: Optional["HwpxConfig"] = None):
        """Create a generator.

        Args:
            config: Generation settings; a default ``HwpxConfig`` is used when
                omitted. NOTE(review): no method of this class reads it yet.
        """
        self.config = config or HwpxConfig()
        self.mapper = HwpStyleMapper()
        self.used_styles: set = set()

    def generate(self, elements: List["StyledElement"], output_path: str) -> str:
        """Generate an HWPX file from a list of styled elements.

        Args:
            elements: Elements classified by StyleAnalyzer; each must expose
                ``role`` and ``text``.
            output_path: Destination file path (.hwpx). Missing parent
                directories are created.

        Returns:
            ``output_path``, unchanged.
        """
        import tempfile

        # Materialise once: the input is walked twice (roles, then content).
        elements = list(elements)
        self.used_styles = {e.role for e in elements}

        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        # Stage the parts in a private scratch directory. Reusing a fixed
        # sibling path would pack stale files from an existing directory into
        # the archive and then delete that directory.
        with tempfile.TemporaryDirectory(prefix="hwpx_") as staging:
            temp_dir = Path(staging)
            self._create_mimetype(temp_dir)
            self._create_meta_inf(temp_dir)
            self._create_version(temp_dir)
            self._create_header(temp_dir)
            self._create_content(temp_dir, elements)
            self._create_settings(temp_dir)
            self._create_hwpx(temp_dir, output_path)
        return output_path

    def _role_ids(self) -> Dict[str, int]:
        """Map each used role to its style id (1..N in sorted role order).

        Single source of truth for the ids shared by charPr, paraPr, style
        and the paragraphs that reference them.
        """
        return {role: idx for idx, role in enumerate(sorted(self.used_styles), start=1)}

    def _create_mimetype(self, temp_dir: Path):
        """Write the ``mimetype`` part."""
        (temp_dir / "mimetype").write_text("application/hwp+zip", encoding='utf-8')

    def _create_meta_inf(self, temp_dir: Path):
        """Write ``META-INF/manifest.xml`` listing the package parts."""
        meta_dir = temp_dir / "META-INF"
        meta_dir.mkdir(exist_ok=True)
        manifest = """<?xml version="1.0" encoding="UTF-8"?>
<manifest:manifest xmlns:manifest="urn:oasis:names:tc:opendocument:xmlns:manifest:1.0">
<manifest:file-entry manifest:full-path="/" manifest:media-type="application/hwp+zip"/>
<manifest:file-entry manifest:full-path="version.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="Contents/header.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="Contents/section0.xml" manifest:media-type="application/xml"/>
<manifest:file-entry manifest:full-path="settings.xml" manifest:media-type="application/xml"/>
</manifest:manifest>"""
        (meta_dir / "manifest.xml").write_text(manifest, encoding='utf-8')

    def _create_version(self, temp_dir: Path):
        """Write ``version.xml``."""
        version = """<?xml version="1.0" encoding="UTF-8"?>
<hh:HWPMLVersion xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head" version="1.1"/>"""
        (temp_dir / "version.xml").write_text(version, encoding='utf-8')

    def _create_header(self, temp_dir: Path):
        """Write ``Contents/header.xml`` (fonts, border fills, style definitions)."""
        contents_dir = temp_dir / "Contents"
        contents_dir.mkdir(exist_ok=True)
        # Per-role property and style fragments spliced into the template.
        char_props_xml = self._generate_char_properties()
        para_props_xml = self._generate_para_properties()
        styles_xml = self._generate_styles_xml()
        header = f"""<?xml version="1.0" encoding="UTF-8"?>
<hh:head xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core"
xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
version="1.5" secCnt="1">
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
<hh:refList>
<hh:fontfaces itemCnt="7">
<hh:fontface lang="HANGUL" fontCnt="2">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="LATIN" fontCnt="2">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="HANJA" fontCnt="2">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
<hh:font id="1" face="함초롬돋움" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="JAPANESE" fontCnt="1">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="OTHER" fontCnt="1">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="SYMBOL" fontCnt="1">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
</hh:fontface>
<hh:fontface lang="USER" fontCnt="1">
<hh:font id="0" face="맑은 고딕" type="TTF" isEmbedded="0"/>
</hh:fontface>
</hh:fontfaces>
<hh:borderFills itemCnt="2">
<hh:borderFill id="1" threeD="0" shadow="0" centerLine="NONE">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
</hh:borderFill>
<hh:borderFill id="2" threeD="0" shadow="0" centerLine="NONE">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:rightBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:topBorder type="NONE" width="0.1 mm" color="#000000"/>
<hh:bottomBorder type="NONE" width="0.1 mm" color="#000000"/>
<hc:fillBrush><hc:winBrush faceColor="none" hatchColor="#000000" alpha="0"/></hc:fillBrush>
</hh:borderFill>
</hh:borderFills>
{char_props_xml}
{para_props_xml}
{styles_xml}
</hh:refList>
<hh:compatibleDocument targetProgram="HWP201X"/>
<hh:docOption>
<hh:linkinfo path="" pageInherit="1" footnoteInherit="0"/>
</hh:docOption>
</hh:head>"""
        (contents_dir / "header.xml").write_text(header, encoding='utf-8')

    def _generate_char_properties(self) -> str:
        """Return the ``<hh:charProperties>`` fragment.

        Contains the default entry (id 0) plus one ``charPr`` per used role.
        """
        lines = [f' <hh:charProperties itemCnt="{len(self.used_styles) + 1}">']
        # Default character property (id=0).
        lines.append(''' <hh:charPr id="0" height="1000" textColor="#000000" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">
<hh:fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:underline type="NONE" shape="SOLID" color="#000000"/>
<hh:strikeout shape="NONE" color="#000000"/>
<hh:outline type="NONE"/>
<hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>
</hh:charPr>''')
        # One character property per role.
        for role, idx in self._role_ids().items():
            style = self.mapper.get_style(role)
            # pt -> hwpunit. round(), not int(): int(1.15 * 100) is 114
            # because of binary floating-point representation.
            height = round(style.font_size * 100)
            color = style.font_color.lstrip('#')
            # Bold text switches to font id 1. The header declares id 1 only
            # for HANGUL/LATIN/HANJA, so the other scripts stay on id 0 to
            # avoid referencing a font that does not exist.
            font_id = "1" if style.font_bold else "0"
            lines.append(f''' <hh:charPr id="{idx}" height="{height}" textColor="#{color}" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="1">
<hh:fontRef hangul="{font_id}" latin="{font_id}" hanja="{font_id}" japanese="0" other="0" symbol="0" user="0"/>
<hh:ratio hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:spacing hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:relSz hangul="100" latin="100" hanja="100" japanese="100" other="100" symbol="100" user="100"/>
<hh:offset hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
<hh:underline type="NONE" shape="SOLID" color="#000000"/>
<hh:strikeout shape="NONE" color="#000000"/>
<hh:outline type="NONE"/>
<hh:shadow type="NONE" color="#B2B2B2" offsetX="10" offsetY="10"/>
</hh:charPr>''')
        lines.append(' </hh:charProperties>')
        return '\n'.join(lines)

    def _generate_para_properties(self) -> str:
        """Return the ``<hh:paraProperties>`` fragment.

        Contains the default entry (id 0) plus one ``paraPr`` per used role.
        """
        lines = [f' <hh:paraProperties itemCnt="{len(self.used_styles) + 1}">']
        # Default paragraph property (id=0).
        lines.append(''' <hh:paraPr id="0" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">
<hh:align horizontal="JUSTIFY" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>
<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>
<hp:switch xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph">
<hp:case hp:required-namespace="http://www.hancom.co.kr/hwpml/2016/HwpUnitChar">
<hh:margin><hc:intent value="0" unit="HWPUNIT"/><hc:left value="0" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="0" unit="HWPUNIT"/><hc:next value="0" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="160" unit="HWPUNIT"/>
</hp:case>
<hp:default>
<hh:margin><hc:intent value="0" unit="HWPUNIT"/><hc:left value="0" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="0" unit="HWPUNIT"/><hc:next value="0" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="160" unit="HWPUNIT"/>
</hp:default>
</hp:switch>
<hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
</hh:paraPr>''')
        # One paragraph property per role; unknown alignments fall back to JUSTIFY.
        align_map = {"left": "LEFT", "center": "CENTER", "right": "RIGHT", "justify": "JUSTIFY"}
        for role, idx in self._role_ids().items():
            style = self.mapper.get_style(role)
            align_val = align_map.get(style.align, "JUSTIFY")
            line_spacing = int(style.line_spacing)
            # Scaled by 100 like font sizes; round() avoids float truncation.
            left_margin = round(style.indent_left * 100)
            indent = round(style.indent_first * 100)
            space_before = round(style.space_before * 100)
            space_after = round(style.space_after * 100)
            lines.append(f''' <hh:paraPr id="{idx}" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="0" suppressLineNumbers="0" checked="0">
<hh:align horizontal="{align_val}" vertical="BASELINE"/>
<hh:heading type="NONE" idRef="0" level="0"/>
<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="KEEP_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>
<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>
<hp:switch xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph">
<hp:case hp:required-namespace="http://www.hancom.co.kr/hwpml/2016/HwpUnitChar">
<hh:margin><hc:intent value="{indent}" unit="HWPUNIT"/><hc:left value="{left_margin}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{space_before}" unit="HWPUNIT"/><hc:next value="{space_after}" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="{line_spacing}" unit="HWPUNIT"/>
</hp:case>
<hp:default>
<hh:margin><hc:intent value="{indent}" unit="HWPUNIT"/><hc:left value="{left_margin}" unit="HWPUNIT"/><hc:right value="0" unit="HWPUNIT"/><hc:prev value="{space_before}" unit="HWPUNIT"/><hc:next value="{space_after}" unit="HWPUNIT"/></hh:margin>
<hh:lineSpacing type="PERCENT" value="{line_spacing}" unit="HWPUNIT"/>
</hp:default>
</hp:switch>
<hh:border borderFillIDRef="1" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>
</hh:paraPr>''')
        lines.append(' </hh:paraProperties>')
        return '\n'.join(lines)

    def _generate_styles_xml(self) -> str:
        """Return the ``<hh:styles>`` fragment.

        Each role's style points at the charPr/paraPr with the same id.
        """
        lines = [f' <hh:styles itemCnt="{len(self.used_styles) + 1}">']
        # Default style (id=0, "바탕글" / Normal).
        lines.append(' <hh:style id="0" type="PARA" name="바탕글" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langID="1042" lockForm="0"/>')
        for role, idx in self._role_ids().items():
            style = self.mapper.get_style(role)
            # The name is an attribute value: '&' and '"' must be escaped as
            # well as '<' and '>', otherwise the header is not well-formed.
            style_name = self._escape_xml(style.name)
            lines.append(f' <hh:style id="{idx}" type="PARA" name="{style_name}" engName="" paraPrIDRef="{idx}" charPrIDRef="{idx}" nextStyleIDRef="{idx}" langID="1042" lockForm="0"/>')
        lines.append(' </hh:styles>')
        return '\n'.join(lines)

    def _create_content(self, temp_dir: Path, elements: List["StyledElement"]):
        """Write ``Contents/section0.xml`` with one paragraph per element.

        Elements whose role is a table/figure role are skipped. A role that is
        missing from ``used_styles`` falls back to style id 0.
        """
        contents_dir = temp_dir / "Contents"
        # Do not rely on _create_header having created the directory first.
        contents_dir.mkdir(exist_ok=True)
        role_to_idx = self._role_ids()
        paragraphs = []
        for elem in elements:
            if elem.role in self._UNSUPPORTED_ROLES:
                continue  # tables/figures need dedicated markup
            style = self.mapper.get_style(elem.role)
            style_idx = role_to_idx.get(elem.role, 0)
            paragraphs.append(self._create_paragraph(elem.text, style, style_idx))
        body = "".join(paragraphs)
        section = f"""<?xml version="1.0" encoding="UTF-8"?>
<hs:sec xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core">
{body}
</hs:sec>"""
        (contents_dir / "section0.xml").write_text(section, encoding='utf-8')

    def _create_paragraph(self, text: str, style: "HwpStyle", style_idx: int) -> str:
        """Return the XML for one paragraph holding a single text run.

        Args:
            text: Paragraph text; ``None`` is treated as empty.
            style: Style resolved for the element. Currently unused, since
                all formatting is carried by ``style_idx``.
            style_idx: Id used for paraPrIDRef, styleIDRef and charPrIDRef.
        """
        text = self._escape_xml(text or "")
        return f'''
<hp:p xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
paraPrIDRef="{style_idx}" styleIDRef="{style_idx}" pageBreak="0" columnBreak="0" merged="0">
<hp:run charPrIDRef="{style_idx}">
<hp:t>{text}</hp:t>
</hp:run>
</hp:p>'''

    def _escape_xml(self, text: str) -> str:
        """Escape the five XML special characters in ``text``."""
        # '&' must be replaced first so that the entities produced by the
        # later replacements are not escaped a second time.
        return (text
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace('"', "&quot;")
                .replace("'", "&apos;"))

    def _create_settings(self, temp_dir: Path):
        """Write ``settings.xml`` (print view, 100 % zoom)."""
        settings = """<?xml version="1.0" encoding="UTF-8"?>
<hs:settings xmlns:hs="http://www.hancom.co.kr/hwpml/2011/settings">
<hs:viewSetting>
<hs:viewType val="printView"/>
<hs:zoom val="100"/>
</hs:viewSetting>
</hs:settings>"""
        (temp_dir / "settings.xml").write_text(settings, encoding='utf-8')

    def _create_hwpx(self, temp_dir: Path, output_path: str):
        """Zip the staged parts in ``temp_dir`` into ``output_path``."""
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
            # mimetype goes first and is stored uncompressed.
            zf.write(temp_dir / "mimetype", "mimetype", compress_type=zipfile.ZIP_STORED)
            # Sorted so the member order does not depend on the file system.
            for file_path in sorted(temp_dir.rglob("*")):
                if not file_path.is_file() or file_path.name == "mimetype":
                    continue
                zf.write(file_path, file_path.relative_to(temp_dir).as_posix())
def convert_html_to_hwpx(html: str, output_path: str) -> str:
    """Convert an HTML string to an HWPX file.

    Progress and a per-role element count are printed to stdout.

    Args:
        html: HTML source text.
        output_path: Destination file path.

    Returns:
        The path returned by ``HwpxGenerator.generate``.
    """
    # 1. Analyse the HTML and classify each element by role.
    analyzer = StyleAnalyzer()
    elements = analyzer.analyze(html)
    print(f"📊 분석 완료: {len(elements)}개 요소")
    for role, count in analyzer.get_role_summary().items():
        print(f" {role}: {count}")
    # 2. Build the HWPX package.
    generator = HwpxGenerator()
    result_path = generator.generate(elements, output_path)
    print(f"✅ 생성 완료: {result_path}")
    return result_path
if __name__ == "__main__":
    # Smoke test: convert a small sample document.
    import sys

    test_html = """
<html>
<body>
<div class="box-cover">
<h1>건설·토목 측량 DX 실무지침</h1>
<h2>드론/UAV·GIS·지형/지반 모델 기반</h2>
<p>2024년 1월</p>
</div>
<h1>1. 개요</h1>
<p>본 보고서는 건설 및 토목 분야의 측량 디지털 전환에 대한 실무 지침을 제공합니다.</p>
<h2>1.1 배경</h2>
<p>최근 드론과 GIS 기술의 발전으로 측량 업무가 크게 변화하고 있습니다.</p>
<h3>1.1.1 기술 동향</h3>
<p>1) <strong>드론 측량의 발전</strong></p>
<p>드론을 활용한 측량은 기존 방식 대비 효율성이 크게 향상되었습니다.</p>
<p>(1) <strong>RTK 드론</strong></p>
<p>실시간 보정 기능을 갖춘 RTK 드론이 보급되고 있습니다.</p>
<ul>
<li>고정밀 GPS 수신기 내장</li>
<li>센티미터 단위 정확도</li>
</ul>
</body>
</html>
"""
    # The default path only exists on the original author's machine, so an
    # output path may be passed as the first command-line argument. The old
    # default is kept so existing invocations behave the same.
    output = sys.argv[1] if len(sys.argv) > 1 else "/home/claude/test_output.hwpx"
    convert_html_to_hwpx(test_html, output)