# File: test/converters/html_to_hwp.py
# Note: the repository viewer reports that this file contains Unicode
# characters that might be confused with other characters. If that is
# intentional, the warning can be safely ignored.
# -*- coding: utf-8 -*-
"""
HTML → HWP 변환기 v11
✅ 이미지: sizeoption=0 (원본 크기) 또는 width/height 지정
✅ 페이지번호: ctrl 코드 방식으로 수정
✅ 나머지는 v10 유지
pip install pyhwpx beautifulsoup4 pillow
"""
from pyhwpx import Hwp
from bs4 import BeautifulSoup, NavigableString
import os, re
# Optional PIL import (used only to read image dimensions).
# HAS_PIL records whether Pillow is importable so callers can skip the
# size-aware image path when it is not.
try:
    from PIL import Image
except ImportError:
    HAS_PIL = False
    print("[알림] PIL 없음 - 이미지 원본 크기로 삽입")
else:
    HAS_PIL = True
class Config:
    """Page-layout constants used by the converter.

    The margin and header/footer values are converted with the converter's
    millimetre helper before being applied, so they are millimetres.
    """

    # Page margins (mm).
    MARGIN_LEFT = 20
    MARGIN_RIGHT = 20
    MARGIN_TOP = 20
    MARGIN_BOTTOM = 15

    # Header / footer lengths (mm).
    HEADER_LEN = 10
    FOOTER_LEN = 10

    # Maximum image width (mm).
    MAX_IMAGE_WIDTH = 150
class StyleParser:
    """Resolve default text styles for HTML elements and parse CSS values.

    ``class_styles`` maps both tag names (``h1``, ``p``, ``td`` ...) and CSS
    class names (``toc-lvl-1`` ...) to a small dict of CSS-like properties.
    """

    # A plain decimal number: "11", "9.5" or ".5". Unlike ``[\d.]+`` this
    # never matches a lone "." or "1.5.2", which ``float`` cannot parse.
    _NUMBER_RE = re.compile(r'(\d+(?:\.\d+)?|\.\d+)')
    _HEX6_RE = re.compile(r'#[0-9a-f]{6}')
    _HEX3_RE = re.compile(r'#[0-9a-f]{3}')
    _RGB_RE = re.compile(r'rgb[a]?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)')
    _BOLD_WEIGHTS = ('bold', '700', '800', '900')

    def __init__(self):
        self.class_styles = {
            'h1': {'font-size': '20pt', 'color': '#008000'},
            'h2': {'font-size': '16pt', 'color': '#03581d'},
            'h3': {'font-size': '13pt', 'color': '#228B22'},
            'p': {'font-size': '11pt', 'color': '#333333'},
            'li': {'font-size': '11pt', 'color': '#333333'},
            'th': {'font-size': '9pt', 'color': '#006400'},
            'td': {'font-size': '9.5pt', 'color': '#333333'},
            'toc-lvl-1': {'font-size': '13pt', 'font-weight': '900', 'color': '#006400'},
            'toc-lvl-2': {'font-size': '11pt', 'color': '#333333'},
            'toc-lvl-3': {'font-size': '10pt', 'color': '#666666'},
        }

    def get_element_style(self, elem):
        """Return the merged style dict for *elem*.

        The tag-name entry is applied first, then each CSS class entry in
        order, so class styles override tag styles.
        """
        style = {}
        tag = getattr(elem, 'name', None)
        if tag and tag in self.class_styles:
            style.update(self.class_styles[tag])
        classes = elem.get('class', []) if hasattr(elem, 'get') else []
        if isinstance(classes, str):
            # A string-valued class attribute must be split into names;
            # iterating it directly would walk individual characters.
            classes = classes.split()
        for cls in classes or []:
            if cls in self.class_styles:
                style.update(self.class_styles[cls])
        return style

    def parse_size(self, s):
        """Return the first number found in *s* as a float, or 11 if none.

        The unit suffix is ignored; the number is returned as-is.
        """
        m = self._NUMBER_RE.search(str(s)) if s else None
        return float(m.group(1)) if m else 11

    def parse_color(self, c):
        """Normalise a CSS colour to an upper-case ``#RRGGBB`` string.

        Accepts ``#rrggbb``, the ``#rgb`` shorthand and ``rgb()/rgba()``
        with comma-separated integer channels. Anything else (including an
        empty value) yields ``'#000000'``.
        """
        if not c:
            return '#000000'
        value = str(c).strip().lower()
        if self._HEX6_RE.fullmatch(value):
            return value.upper()
        if self._HEX3_RE.fullmatch(value):
            # "#abc" is shorthand for "#aabbcc".
            return '#' + ''.join(ch * 2 for ch in value[1:]).upper()
        m = self._RGB_RE.search(value)
        if m:
            # Clamp so an out-of-range channel cannot produce a 3-digit hex
            # component and a malformed colour string.
            r, g, b = (min(int(part), 255) for part in m.groups())
            return f'#{r:02X}{g:02X}{b:02X}'
        return '#000000'

    def is_bold(self, style):
        """Return True when ``style['font-weight']`` denotes bold (bold/700/800/900)."""
        weight = str(style.get('font-weight', '')).strip().lower()
        return weight in self._BOLD_WEIGHTS
class HtmlToHwpConverter:
    """Build an HWP document from an HTML report by driving pyhwpx.

    The converter walks the parsed HTML tree and replays each supported
    element (headings, paragraphs, lists, tables, images, highlight boxes)
    as a sequence of actions on ``self.hwp``.

    Attributes:
        hwp: The pyhwpx ``Hwp`` object every action is sent to.
        cfg: ``Config`` instance holding page margins and the image limit.
        sp: ``StyleParser`` used to resolve font size, colour and weight.
        base_path: Directory of the HTML file; relative image paths are
            resolved against it.
        is_first_h1: True until an ``<h1>`` has been emitted for the current
            part; decides between creating a header and starting a section.
        image_count: Number of images processed so far.
    """

    # Alignment keyword -> HWP action name, used by _set_para.
    _ALIGN_ACTIONS = {
        'left': 'ParagraphShapeAlignLeft',
        'center': 'ParagraphShapeAlignCenter',
        'right': 'ParagraphShapeAlignRight',
        'justify': 'ParagraphShapeAlignJustify',
    }

    def __init__(self, visible=True):
        """Create the ``Hwp`` instance and reset conversion state."""
        self.hwp = Hwp(visible=visible)
        self.cfg = Config()
        self.sp = StyleParser()
        self.base_path = ""
        self.is_first_h1 = True
        self.image_count = 0

    # ------------------------------------------------------------------
    # Unit / colour helpers
    # ------------------------------------------------------------------
    def _mm(self, mm):
        """Convert millimetres to HWP units."""
        return self.hwp.MiliToHwpUnit(mm)

    def _pt(self, pt):
        """Convert points to HWP units."""
        return self.hwp.PointToHwpUnit(pt)

    def _rgb(self, c):
        """Convert a ``#RRGGBB`` string to an HWP colour value.

        Strings shorter than six hex digits, or containing non-hex
        characters, fall back to black instead of raising.
        """
        digits = str(c).lstrip('#')
        red = green = blue = 0
        if len(digits) >= 6:
            try:
                red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
            except ValueError:
                red = green = blue = 0
        return self.hwp.RGBColor(red, green, blue)

    # ------------------------------------------------------------------
    # Page, header and footer
    # ------------------------------------------------------------------
    def _setup_page(self):
        """Apply the margins and header/footer lengths from ``self.cfg``.

        Best effort: a failure is reported and the conversion continues
        with the document's current page settings.
        """
        try:
            self.hwp.HAction.GetDefault("PageSetup", self.hwp.HParameterSet.HSecDef.HSet)
            sec = self.hwp.HParameterSet.HSecDef
            sec.PageDef.LeftMargin = self._mm(self.cfg.MARGIN_LEFT)
            sec.PageDef.RightMargin = self._mm(self.cfg.MARGIN_RIGHT)
            sec.PageDef.TopMargin = self._mm(self.cfg.MARGIN_TOP)
            sec.PageDef.BottomMargin = self._mm(self.cfg.MARGIN_BOTTOM)
            sec.PageDef.HeaderLen = self._mm(self.cfg.HEADER_LEN)
            sec.PageDef.FooterLen = self._mm(self.cfg.FOOTER_LEN)
            self.hwp.HAction.Execute("PageSetup", sec.HSet)
        except Exception as e:
            print(f" [경고] 페이지 설정: {e}")

    def _open_header_footer(self, ctrl_type, style=0):
        """Run the "HeaderFooter" action.

        Args:
            ctrl_type: Value for "HeaderFooterCtrlType"; this class uses 0
                for the header and 1 for the footer.
            style: Value for "HeaderFooterStyle"; 0 everywhere except
                ``_update_header``, which passes 2 to edit an existing one.
        """
        pset = self.hwp.HParameterSet.HHeaderFooter
        self.hwp.HAction.GetDefault("HeaderFooter", pset.HSet)
        pset.HSet.SetItem("HeaderFooterStyle", style)
        pset.HSet.SetItem("HeaderFooterCtrlType", ctrl_type)
        self.hwp.HAction.Execute("HeaderFooter", pset.HSet)

    def _create_header(self, right_text=""):
        """Create a header and write *right_text* right-aligned at 9pt."""
        print(f" → 머리말 생성: {right_text if right_text else '(초기화)'}")
        try:
            self._open_header_footer(0)
            self.hwp.HAction.Run("ParagraphShapeAlignRight")
            self._set_font(9, False, '#333333')
            if right_text:
                self.hwp.insert_text(right_text)
            self.hwp.HAction.Run("CloseEx")
        except Exception as e:
            print(f" [경고] 머리말: {e}")

    def _create_footer(self, left_text=""):
        """Create the footer text and place the page number.

        The footer holds *left_text* left-aligned at 8pt; the page number is
        positioned at the bottom right via the "PageNumPos" action. Handled
        best effort, like the header methods.
        """
        print(f" → 꼬리말: {left_text}")
        try:
            # 1. Open the footer.
            self._open_header_footer(1)
            # 2. Left-aligned title at 8pt.
            self.hwp.HAction.Run("ParagraphShapeAlignLeft")
            self._set_font(8, False, '#666666')
            self.hwp.insert_text(left_text)
            # 3. Close the footer.
            self.hwp.HAction.Run("CloseEx")
            # 4. Page number at the bottom right.
            self.hwp.HAction.GetDefault("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet)
            self.hwp.HParameterSet.HPageNumPos.DrawPos = self.hwp.PageNumPosition("BottomRight")
            self.hwp.HAction.Execute("PageNumPos", self.hwp.HParameterSet.HPageNumPos.HSet)
        except Exception as e:
            print(f" [경고] 꼬리말: {e}")

    def _new_section_with_header(self, header_text):
        """Start a new section and give it a header containing *header_text*."""
        print(f" → 새 구역 머리말: {header_text}")
        try:
            self.hwp.HAction.Run("BreakSection")
            self._open_header_footer(0)
            # Clear whatever the header currently contains before writing.
            self.hwp.HAction.Run("SelectAll")
            self.hwp.HAction.Run("Delete")
            self.hwp.HAction.Run("ParagraphShapeAlignRight")
            self._set_font(9, False, '#333333')
            self.hwp.insert_text(header_text)
            self.hwp.HAction.Run("CloseEx")
        except Exception as e:
            print(f" [경고] 구역 머리말: {e}")

    def _update_header(self, new_title):
        """Replace the text of the existing header with *new_title*.

        Not referenced by the other methods of this class.
        """
        try:
            # Enter edit mode on the existing header.
            self._open_header_footer(0, style=2)
            # Remove the existing content.
            self.hwp.HAction.Run("SelectAll")
            self.hwp.HAction.Run("Delete")
            # Insert the new content.
            self.hwp.HAction.Run("ParagraphShapeAlignRight")
            self._set_font(9, False, '#333333')
            self.hwp.insert_text(new_title)
            self.hwp.HAction.Run("CloseEx")
        except Exception as e:
            print(f" [경고] 머리말 업데이트: {e}")

    # ------------------------------------------------------------------
    # Character / paragraph / cell formatting
    # ------------------------------------------------------------------
    def _set_font(self, size=11, bold=False, color='#000000'):
        """Set the current font to 맑은 고딕 with the given size, weight and colour."""
        self.hwp.set_font(FaceName='맑은 고딕', Height=size, Bold=bold, TextColor=self._rgb(color))

    def _set_para(self, align='justify', lh=170, left=0, indent=0, before=0, after=0):
        """Set alignment and paragraph shape at the caret.

        Args:
            align: One of 'left', 'center', 'right', 'justify'; any other
                value leaves the alignment untouched.
            lh: Line spacing value (written with LineSpaceType 0).
            left: Left margin, converted with ``_mm``.
            indent: Indent margin, converted with ``_mm``.
            before: Space before the paragraph, converted with ``_pt``.
            after: Space after the paragraph, converted with ``_pt``.
        """
        action = self._ALIGN_ACTIONS.get(align)
        if action:
            self.hwp.HAction.Run(action)
        try:
            self.hwp.HAction.GetDefault("ParagraphShape", self.hwp.HParameterSet.HParaShape.HSet)
            shape = self.hwp.HParameterSet.HParaShape
            shape.LineSpaceType, shape.LineSpacing = 0, lh
            shape.LeftMargin = self._mm(left)
            shape.IndentMargin = self._mm(indent)
            shape.SpaceBeforePara = self._pt(before)
            shape.SpaceAfterPara = self._pt(after)
            shape.BreakNonLatinWord = 0
            self.hwp.HAction.Execute("ParagraphShape", shape.HSet)
        except Exception as e:
            # Best effort: keep converting with the current paragraph shape.
            print(f" [경고] 문단 모양: {e}")

    def _set_cell_bg(self, color):
        """Fill the current table cell with *color* (best effort)."""
        try:
            self.hwp.HAction.GetDefault("CellBorderFill", self.hwp.HParameterSet.HCellBorderFill.HSet)
            fill = self.hwp.HParameterSet.HCellBorderFill
            fill.FillAttr.type = self.hwp.BrushType("NullBrush|WinBrush")
            fill.FillAttr.WinBrushFaceStyle = self.hwp.HatchStyle("None")
            fill.FillAttr.WinBrushHatchColor = self._rgb('#000000')
            fill.FillAttr.WinBrushFaceColor = self._rgb(color)
            fill.FillAttr.WindowsBrush = 1
            self.hwp.HAction.Execute("CellBorderFill", fill.HSet)
        except Exception as e:
            print(f" [경고] 셀 배경: {e}")

    def _underline_box(self, text, size=14, color='#008000'):
        """Insert *text* inside a 1x1 table that shows only a bottom border.

        The table is created 168mm wide and 10mm high. Its top, right and
        left borders are set to "None" and the bottom border is drawn 0.4mm
        wide in *color*. If any step fails, the text is inserted as a plain
        bold paragraph instead.
        """
        try:
            self.hwp.HAction.GetDefault("TableCreate", self.hwp.HParameterSet.HTableCreation.HSet)
            table = self.hwp.HParameterSet.HTableCreation
            table.Rows, table.Cols, table.WidthType, table.HeightType = 1, 1, 0, 0
            table.WidthValue, table.HeightValue = self._mm(168), self._mm(10)
            self.hwp.HAction.Execute("TableCreate", table.HSet)

            self.hwp.HAction.GetDefault("InsertText", self.hwp.HParameterSet.HInsertText.HSet)
            self.hwp.HParameterSet.HInsertText.Text = text
            self.hwp.HAction.Execute("InsertText", self.hwp.HParameterSet.HInsertText.HSet)

            # Select the cell so the character shape applies to its text.
            self.hwp.HAction.Run("TableCellBlock")
            self.hwp.HAction.GetDefault("CharShape", self.hwp.HParameterSet.HCharShape.HSet)
            self.hwp.HParameterSet.HCharShape.Height = self._pt(size)
            self.hwp.HParameterSet.HCharShape.TextColor = self._rgb(color)
            self.hwp.HAction.Execute("CharShape", self.hwp.HParameterSet.HCharShape.HSet)

            # First pass: remove the top / right / left borders.
            self.hwp.HAction.GetDefault("CellBorder", self.hwp.HParameterSet.HCellBorderFill.HSet)
            border = self.hwp.HParameterSet.HCellBorderFill
            border.BorderTypeTop = self.hwp.HwpLineType("None")
            border.BorderTypeRight = self.hwp.HwpLineType("None")
            border.BorderTypeLeft = self.hwp.HwpLineType("None")
            self.hwp.HAction.Execute("CellBorder", border.HSet)

            # Second pass: colour and width of the bottom border.
            self.hwp.HAction.GetDefault("CellBorder", self.hwp.HParameterSet.HCellBorderFill.HSet)
            border = self.hwp.HParameterSet.HCellBorderFill
            border.BorderColorBottom = self._rgb(color)
            border.BorderWidthBottom = self.hwp.HwpLineWidth("0.4mm")
            self.hwp.HAction.Execute("CellBorder", border.HSet)

            self.hwp.HAction.Run("Cancel")
            self.hwp.HAction.Run("CloseEx")
            self.hwp.HAction.Run("MoveDocEnd")
        except Exception as e:
            print(f" [경고] 밑줄 상자: {e}")
            self._set_font(size, True, color)
            self.hwp.insert_text(text)
            self.hwp.BreakPara()

    # ------------------------------------------------------------------
    # Element handlers
    # ------------------------------------------------------------------
    def _insert_heading(self, elem):
        """Insert an ``<h1>``/``<h2>``/``<h3>`` element.

        For level 1, the first heading of a part creates the header and
        later ones start a new section with its own header; the heading
        text then goes into an underline box. Levels 2 and 3 are inserted
        as bold paragraphs.
        """
        level = int(elem.name[1]) if elem.name in ['h1', 'h2', 'h3'] else 1
        txt = elem.get_text(strip=True)
        st = self.sp.get_element_style(elem)
        sz = self.sp.parse_size(st.get('font-size', '14pt'))
        cl = self.sp.parse_color(st.get('color', '#008000'))
        if level == 1:
            if self.is_first_h1:
                self._create_header(txt)
                self.is_first_h1 = False
            else:
                self._new_section_with_header(txt)
            self._set_para('left', 130, before=0, after=0)
            self._underline_box(txt, sz, cl)
            self.hwp.BreakPara()
            self._set_para('left', 130, before=0, after=15)
            self.hwp.BreakPara()
        elif level == 2:
            self._set_para('left', 150, before=20, after=8)
            self._set_font(sz, True, cl)
            self.hwp.insert_text(txt)
            self.hwp.BreakPara()
        elif level == 3:
            self._set_para('left', 140, left=3, before=12, after=5)
            self._set_font(sz, True, cl)
            self.hwp.insert_text(txt)
            self.hwp.BreakPara()

    def _insert_paragraph(self, elem):
        """Insert a ``<p>`` element, keeping ``<b>``/``<strong>`` runs bold.

        When the paragraph contains bold markup, its direct children are
        emitted one by one: text nodes and other inline tags as regular
        text, ``<b>``/``<strong>`` as bold. Otherwise the whole text is
        inserted in one run.
        """
        txt = elem.get_text(strip=True)
        if not txt:
            return
        st = self.sp.get_element_style(elem)
        sz = self.sp.parse_size(st.get('font-size', '11pt'))
        cl = self.sp.parse_color(st.get('color', '#333333'))
        self._set_para('justify', 170, left=0, indent=3, before=0, after=3)
        if elem.find(['b', 'strong']):
            for child in elem.children:
                if isinstance(child, NavigableString):
                    piece, bold = str(child), False
                    if not piece.strip():
                        continue
                elif child.name in ['b', 'strong']:
                    piece, bold = child.get_text(), True
                else:
                    # Other inline tags (<em>, <span>, <a>, ...) used to be
                    # skipped, which silently lost their text.
                    piece, bold = child.get_text(), False
                if piece:
                    self._set_font(sz, bold, cl)
                    self.hwp.insert_text(piece)
        else:
            self._set_font(sz, self.sp.is_bold(st), cl)
            self.hwp.insert_text(txt)
        self.hwp.BreakPara()

    def _insert_list(self, elem):
        """Insert the direct ``<li>`` children of *elem* as paragraphs.

        Items of an ``<ol>`` get a "N. " prefix. Items carrying a ``toc-*``
        class are laid out as table-of-contents lines, with left margin and
        spacing chosen by ``toc-lvl-1/2/3``.
        """
        list_tag = elem.name
        for i, li in enumerate(elem.find_all('li', recursive=False)):
            st = self.sp.get_element_style(li)
            cls = li.get('class', [])
            txt = li.get_text(strip=True)
            is_toc = any('toc-' in c for c in cls)
            if 'toc-lvl-1' in cls:
                left, bef = 0, 8
            elif 'toc-lvl-2' in cls:
                left, bef = 7, 3
            elif 'toc-lvl-3' in cls:
                left, bef = 14, 1
            else:
                left, bef = 4, 2
            pf = f"{i+1}. " if list_tag == 'ol' else ""
            sz = self.sp.parse_size(st.get('font-size', '11pt'))
            cl = self.sp.parse_color(st.get('color', '#333333'))
            bd = self.sp.is_bold(st)
            if is_toc:
                self._set_para('left', 170, left=left, indent=0, before=bef, after=1)
                self._set_font(sz, bd, cl)
                self.hwp.insert_text(pf + txt)
                self.hwp.BreakPara()
            else:
                self._set_para('justify', 170, left=left, indent=0, before=bef, after=1)
                self._set_font(sz, bd, cl)
                self.hwp.insert_text(pf)
                # Runs after the prefix so the indent is taken at the caret.
                self.hwp.HAction.Run("ParagraphShapeIndentAtCaret")
                self.hwp.insert_text(txt)
                self.hwp.BreakPara()

    @staticmethod
    def _parse_span(value):
        """Return a ``colspan``/``rowspan`` attribute as an int >= 1.

        Missing, non-numeric or non-positive values count as 1.
        """
        try:
            return max(1, int(str(value).strip()))
        except (TypeError, ValueError):
            return 1

    def _parse_table(self, table_elem):
        """Flatten *table_elem* into a rectangular grid of cell texts.

        Returns:
            ``(rows_data, cell_styles, occupied, max_cols)`` where
            ``rows_data`` is a list of equally long rows of strings,
            ``cell_styles`` maps ``(row, col)`` of each real cell to
            ``{'is_header': bool}``, and ``occupied`` marks the grid
            positions covered by another cell's colspan/rowspan.
        """
        rows_data, cell_styles, occupied, max_cols = [], {}, {}, 0
        # Only rows belonging to this table; find_all() alone would also
        # return the rows of tables nested inside cells.
        own_rows = [tr for tr in table_elem.find_all('tr')
                    if tr.find_parent('table') is table_elem]
        for ri, tr in enumerate(own_rows):
            row, ci = [], 0
            for cell in tr.find_all(['td', 'th'], recursive=False):
                # Skip grid positions claimed by a rowspan from above.
                while (ri, ci) in occupied:
                    row.append("")
                    ci += 1
                txt = cell.get_text(strip=True)
                cs = self._parse_span(cell.get('colspan', 1))
                rs = self._parse_span(cell.get('rowspan', 1))
                cell_styles[(ri, ci)] = {'is_header': cell.name == 'th' or ri == 0}
                row.append(txt)
                for dr in range(rs):
                    for dc in range(cs):
                        if dr > 0 or dc > 0:
                            occupied[(ri + dr, ci + dc)] = True
                row.extend([""] * (cs - 1))
                ci += cs
            rows_data.append(row)
            max_cols = max(max_cols, len(row))
        for row in rows_data:
            row.extend([""] * (max_cols - len(row)))
        return rows_data, cell_styles, occupied, max_cols

    def _insert_table(self, table_elem):
        """Insert a ``<table>`` as an HWP table and fill it cell by cell.

        Cells covered by a colspan/rowspan are left empty (the HWP table is
        not merged). Header cells (``<th>`` or any cell in the first row)
        get a background colour and bold 9pt text.
        """
        rows_data, cell_styles, occupied, max_cols = self._parse_table(table_elem)
        rc = len(rows_data)
        if rc == 0 or max_cols == 0:
            return
        print(f" 표: {rc}× {max_cols}")
        self._set_para('left', 130, before=5, after=0)
        self.hwp.create_table(rc, max_cols, treat_as_char=True)
        last_cell = (rc - 1, max_cols - 1)
        for ri, row in enumerate(rows_data):
            for ci in range(max_cols):
                if (ri, ci) not in occupied:
                    hdr = cell_styles.get((ri, ci), {}).get('is_header', False)
                    if hdr:
                        self._set_cell_bg('#E8F5E9')
                    self.hwp.HAction.Run("ParagraphShapeAlignCenter")
                    self._set_font(9 if hdr else 9.5, hdr, '#006400' if hdr else '#333333')
                    self.hwp.insert_text(str(row[ci]))
                # Advance to the next cell, but never step past the last
                # cell (also when that last cell is a span-covered one).
                if (ri, ci) != last_cell:
                    self.hwp.HAction.Run("MoveRight")
        self.hwp.HAction.Run("Cancel")
        self.hwp.HAction.Run("CloseEx")
        self.hwp.HAction.Run("MoveDocEnd")
        self._set_para('left', 130, before=5, after=5)
        self.hwp.BreakPara()

    def _try_insert_picture(self, full_path):
        """Try to insert the picture at *full_path*; return True on success.

        Attempts, in order: original size (``sizeoption=0``); an explicit
        size computed with PIL and capped at ``cfg.MAX_IMAGE_WIDTH``
        (``sizeoption=1``); then ``insert_picture`` with default arguments.
        """
        # Attempt 1: original size.
        try:
            self.hwp.insert_picture(full_path, sizeoption=0)
            print(f" ✅ 삽입 성공 (원본 크기)")
            return True
        except Exception as e:
            print(f" [경고] 원본 크기 삽입 실패: {e}")

        # Attempt 2: explicit width/height (needs PIL for the pixel size).
        if HAS_PIL:
            try:
                with Image.open(full_path) as img:
                    w_px, h_px = img.size
                # px -> mm, assuming 96 DPI.
                w_mm = w_px * 25.4 / 96
                h_mm = h_px * 25.4 / 96
                # Cap the width, keeping the aspect ratio.
                if w_mm > self.cfg.MAX_IMAGE_WIDTH:
                    ratio = self.cfg.MAX_IMAGE_WIDTH / w_mm
                    w_mm = self.cfg.MAX_IMAGE_WIDTH
                    h_mm = h_mm * ratio
                # NOTE(review): width/height are passed as HWP units here;
                # confirm which unit insert_picture expects.
                self.hwp.insert_picture(full_path, sizeoption=1,
                                        width=self._mm(w_mm), height=self._mm(h_mm))
                print(f" ✅ 삽입 성공 ({w_mm:.0f}×{h_mm:.0f}mm)")
                return True
            except Exception as e:
                print(f" [경고] 지정 크기 삽입 실패: {e}")

        # Attempt 3: library defaults.
        try:
            self.hwp.insert_picture(full_path)
            print(f" ✅ 삽입 성공 (기본)")
            return True
        except Exception as e3:
            print(f" ❌ 삽입 실패: {e3}")
            return False

    def _insert_image(self, src, caption=""):
        """Insert the image at *src*, followed by *caption* if given.

        A relative *src* is resolved against ``self.base_path``. A missing
        file or a failed insertion leaves a bracketed placeholder text in
        the document. An empty *src* is ignored and not counted.
        """
        if not src:
            return
        self.image_count += 1
        print(f" 📷 이미지 #{self.image_count}: {os.path.basename(src)}")
        # Relative path -> absolute path.
        if not os.path.isabs(src):
            full_path = os.path.normpath(os.path.join(self.base_path, src))
        else:
            full_path = src
        if not os.path.exists(full_path):
            print(f" ❌ 파일 없음: {full_path}")
            self._set_font(9, False, '#999999')
            self._set_para('center', 130)
            self.hwp.insert_text(f"[이미지 없음: {os.path.basename(src)}]")
            self.hwp.BreakPara()
            return
        try:
            self._set_para('center', 130, before=5, after=3)
            inserted = self._try_insert_picture(full_path)
            if not inserted:
                self._set_font(9, False, '#FF0000')
                self.hwp.insert_text(f"[이미지 오류: {os.path.basename(src)}]")
            self.hwp.BreakPara()
            if caption and inserted:
                self._set_font(9.5, True, '#666666')
                self._set_para('center', 130, before=0, after=5)
                self.hwp.insert_text(caption)
                self.hwp.BreakPara()
        except Exception as e:
            print(f" ❌ 오류: {e}")

    def _insert_highlight_box(self, elem):
        """Insert the text of *elem* in a 1x1 table with a tinted background."""
        txt = elem.get_text(strip=True)
        if not txt:
            return
        self._set_para('left', 130, before=5, after=0)
        self.hwp.create_table(1, 1, treat_as_char=True)
        self._set_cell_bg('#E2ECE2')
        self._set_font(11, False, '#333333')
        self.hwp.insert_text(txt)
        self.hwp.HAction.Run("Cancel")
        self.hwp.HAction.Run("CloseEx")
        self.hwp.HAction.Run("MoveDocEnd")
        self._set_para('left', 130, before=0, after=5)
        self.hwp.BreakPara()

    def _process(self, elem):
        """Dispatch *elem* to its handler, recursing into container tags.

        Text nodes, non-content tags (script, style, ...) and tags without
        a handler are ignored.
        """
        if isinstance(elem, NavigableString):
            return
        tag = elem.name
        if not tag or tag in ['script', 'style', 'template', 'noscript', 'head']:
            return
        if tag == 'figure':
            img = elem.find('img')
            if img:
                figcaption = elem.find('figcaption')
                caption = figcaption.get_text(strip=True) if figcaption else ""
                self._insert_image(img.get('src', ''), caption)
            return
        if tag == 'img':
            self._insert_image(elem.get('src', ''))
            return
        if tag in ['h1', 'h2', 'h3']:
            self._insert_heading(elem)
        elif tag == 'p':
            self._insert_paragraph(elem)
        elif tag == 'table':
            self._insert_table(elem)
        elif tag in ['ul', 'ol']:
            self._insert_list(elem)
        elif 'highlight-box' in elem.get('class', []):
            self._insert_highlight_box(elem)
        elif tag in ['div', 'section', 'article', 'main', 'body', 'html', 'span']:
            for ch in elem.children:
                self._process(ch)

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------
    def convert(self, html_path, output_path):
        """Convert the HTML file at *html_path* and save it to *output_path*.

        If the document has an element with id ``raw-container``, its
        ``box-cover``, ``box-toc``, ``box-summary`` and ``box-content``
        children are converted in that order with page breaks between
        them; otherwise the whole ``<body>`` is processed. The footer text
        is the part of ``<title>`` before the first ':'.
        """
        print("="*60)
        print("HTML → HWP 변환기 v11")
        print(" ✓ 이미지: sizeoption 수정")
        print(" ✓ 페이지번호: 다중 방법 시도")
        print("="*60)
        self.base_path = os.path.dirname(os.path.abspath(html_path))
        self.is_first_h1 = True
        self.image_count = 0
        print(f"\n입력: {html_path}")
        print(f"출력: {output_path}\n")
        with open(html_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        title_tag = soup.find('title')
        if title_tag:
            full_title = title_tag.get_text(strip=True)
            footer_title = full_title.split(':')[0].strip()  # text before ":"
        else:
            footer_title = ""
        self.hwp.FileNew()
        self._setup_page()
        self._create_footer(footer_title)
        raw = soup.find(id='raw-container')
        if raw:
            cover = raw.find(id='box-cover')
            if cover:
                print(" → 표지")
                for ch in cover.children:
                    self._process(ch)
                self.hwp.HAction.Run("BreakPage")
            toc = raw.find(id='box-toc')
            if toc:
                print(" → 목차")
                self.is_first_h1 = True
                self._underline_box("목 차", 20, '#008000')
                self.hwp.BreakPara()
                self.hwp.BreakPara()
                self._insert_list(toc.find('ul') or toc)
                self.hwp.HAction.Run("BreakPage")
            summary = raw.find(id='box-summary')
            if summary:
                print(" → 요약")
                self.is_first_h1 = True
                self._process(summary)
                self.hwp.HAction.Run("BreakPage")
            content = raw.find(id='box-content')
            if content:
                print(" → 본문")
                self.is_first_h1 = True
                self._process(content)
        else:
            self._process(soup.find('body') or soup)
        self.hwp.SaveAs(output_path)
        print(f"\n✅ 저장: {output_path}")
        print(f" 이미지: {self.image_count}개 처리")

    def close(self):
        """Quit HWP; a failure is reported but never raised."""
        try:
            self.hwp.Quit()
        except Exception as e:
            print(f" [경고] HWP 종료: {e}")
def main(html_path=None, output_path=None):
    """Convert one HTML report to HWP and always shut HWP down afterwards.

    Args:
        html_path: HTML file to convert; defaults to the built-in report path.
        output_path: Destination ``.hwp`` file; defaults to the built-in
            output path.

    Any error is printed with its traceback instead of being raised. The
    converter is closed in ``finally`` so a failed conversion does not leave
    the HWP process running.
    """
    import traceback

    if html_path is None:
        html_path = r"D:\for python\survey_test\output\generated\report.html"
    if output_path is None:
        output_path = r"D:\for python\survey_test\output\generated\report_v12.hwp"

    conv = None
    try:
        conv = HtmlToHwpConverter(visible=True)
        conv.convert(html_path, output_path)
        # Optional pause so the result can be inspected before HWP closes.
        input("\nEnter를 누르면 HWP가 닫힙니다...")
    except Exception as e:
        print(f"\n[에러] {e}")
        traceback.print_exc()
    finally:
        # conv stays None when the constructor itself failed.
        if conv is not None:
            conv.close()


if __name__ == "__main__":
    main()