Files
doc2md/converters/hwp.py
minsung 1d40d90242 fix: LaTeX 백슬래시 복원, HWP 인코딩 오류 수정, 다이어그램 감지 튜닝
- pdf.py: marker-pdf가 손상시킨 \times·\frac 등 LaTeX 백슬래시 복원 후처리 추가
- pdf.py: 다이어그램 감지에 절대 drawing 수 기준(>= 40) 추가 (대형 엔지니어링 페이지 대응)
- hwp.py: COM 타임아웃 메시지의 em dash → ASCII (cp949 인코딩 오류 수정)
- convert.py: Windows stdout/stderr UTF-8 강제 설정

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 15:56:11 +09:00

213 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
from __future__ import annotations
import re
import shutil
import tempfile
from pathlib import Path
def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
import threading
result = [False]
def _run():
try:
import pythoncom, win32com.client
except ImportError:
return
hwp = None
try:
pythoncom.CoInitialize()
hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
try:
hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
except Exception:
pass
ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
if not ok:
return
hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
result[0] = hml_path.exists()
except Exception as e:
print(f' COM 오류: {e}')
finally:
if hwp:
try: hwp.Quit()
except Exception: pass
try: pythoncom.CoUninitialize()
except Exception: pass
t = threading.Thread(target=_run, daemon=True)
t.start()
t.join(timeout)
if t.is_alive():
print(f' COM 타임아웃 ({timeout}초) -> pyhwp로 전환')
return result[0]
def _table_to_md(table_elem) -> str:
from bs4 import Tag
rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
if not rows:
return ''
has_merge = False
parsed = []
for tr in rows:
cells = []
for td in tr.find_all(['td', 'th']):
cs = int(td.get('colspan', 1))
rs = int(td.get('rowspan', 1))
if cs > 1 or rs > 1:
has_merge = True
cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
if cells:
parsed.append(cells)
if not parsed:
return ''
if has_merge:
lines = ['<table>']
for ri, cells in enumerate(parsed):
lines.append('<tr>')
tag = 'th' if ri == 0 else 'td'
for cs, rs, text in cells:
attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
lines.append(f'<{tag}{attrs}>{text}</{tag}>')
lines.append('</tr>')
lines.append('</table>')
return '\n'.join(lines)
else:
rows_text = [[text for _, _, text in cells] for cells in parsed]
mc = max(len(r) for r in rows_text)
for r in rows_text:
r += [''] * (mc - len(r))
def esc(s): return s.replace('|', '\\|')
lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
'| ' + ' | '.join(['---'] * mc) + ' |']
for row in rows_text[1:]:
lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
return '\n'.join(lines)
def _detect_structure(text: str):
if not text: return 'paragraph', 0, text
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
return 'paragraph', 0, text
def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
try:
from hwp5.hwp5html import HTMLTransform
from hwp5.xmlmodel import Hwp5File
from bs4 import BeautifulSoup
except ImportError as e:
print(f' pyhwp/bs4 미설치: {e}')
return False
tmp_dir = Path(tempfile.mkdtemp())
try:
f = Hwp5File(str(hwp_path))
HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
xhtml_path = tmp_dir / 'index.xhtml'
if not xhtml_path.exists():
return False
images_dir = output_path.parent / f'{base_name}_images'
images_dir.mkdir(exist_ok=True)
img_map = {}
bindata_dir = tmp_dir / 'bindata'
if bindata_dir.exists():
for img in bindata_dir.iterdir():
shutil.copy(img, images_dir / img.name)
img_map[img.name] = img.name
soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
area.decompose()
md_lines = []
img_counter = [0]
for elem in soup.find_all(['p', 'table']):
if elem.find_parent('table'):
continue
if elem.name == 'table':
if not elem.find_parent('p'):
md = _table_to_md(elem)
if md:
md_lines += [md, '']
elif elem.name == 'p':
for img in elem.find_all('img'):
fn = Path(img.get('src', '')).name
if fn in img_map:
img_counter[0] += 1
md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
inner = elem.find('table')
if inner:
md = _table_to_md(inner)
if md:
md_lines += [md, '']
continue
text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
if not text:
continue
kind, level, fmt = _detect_structure(text)
if kind == 'heading':
if md_lines and md_lines[-1] != '':
md_lines.append('')
md_lines += [f'{"#" * level} {fmt}', '']
elif kind == 'bullet':
md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}')
else:
md_lines += [fmt, '']
output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
return True
except Exception as e:
print(f' pyhwp 오류: {e}')
return False
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)
def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
    """Convert an HWP document to Markdown and return an AGENT_GUIDE-spec dict.

    COM automation is tried first: the document is exported to a temporary
    ``.hml`` file and passed to the HML converter. If that path is
    unavailable or fails, the pyhwp-based converter is used instead.

    Args:
        hwp_path: Source ``.hwp`` document.
        output_dir: Directory for the Markdown output (created if missing).

    Returns:
        A dict with ``status`` (``"ok"`` or ``"error"``), ``input``,
        ``output``, ``format`` and, on failure, ``error``.
    """
    hwp_path = Path(hwp_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwp_path.stem}.md'
    result = {
        "status": "ok", "input": str(hwp_path),
        "output": str(md_path), "format": "hwp",
    }
    try:
        hml_path = md_path.with_suffix('.hml')
        if _com_hwp_to_hml(hwp_path, hml_path):
            try:
                from converters.hml import convert_hml
                r = convert_hml(hml_path, output_dir)
                if r['status'] == 'ok':
                    return result
            except Exception as e:
                # Report the failure instead of hiding it, then fall through
                # to the pyhwp converter below.
                print(f' HML 변환 오류: {e}')
            finally:
                # The intermediate .hml is removed whatever the outcome.
                hml_path.unlink(missing_ok=True)
        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result
        result['status'] = 'error'
        result['error'] = 'COM + pyhwp 모두 실패'
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result