refactor: HWP 변환을 exe(HWP→HWPX) 단일 경로로 교체, 이미지 경로 URL 인코딩

- hwp.py: COM/pyhwp 제거, HwpToPdfConverter.exe → hwpx 컨버터 재사용으로 단순화
- hwpx.py, hml.py: 이미지 경로의 공백/대괄호 URL 인코딩(%20, %5B, %5D) 추가
  (Obsidian 등 Markdown 뷰어에서 [기본이론] 포함 파일명 이미지 표시 오류 수정)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 08:41:20 +09:00
parent 1d40d90242
commit b944a8f526
3 changed files with 46 additions and 185 deletions
--- a/converters/hml.py
+++ b/converters/hml.py
@@ -6,6 +6,8 @@ import base64
 import re
 import xml.etree.ElementTree as ET
 from pathlib import Path
 def _esc_path(s: str) -> str:
    return s.replace(' ', '%20').replace('[', '%5B').replace(']', '%5D')
 def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
@@ -125,7 +127,7 @@ def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, bas
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
-                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
+                ref = f'{_esc_path(base_name)}_images/{_esc_path(filename)}' if filename else f'그림_{idx+1}.png'
                lines.append(f'![그림 {idx+1}]({ref})')
    if not has_content:
        text = _extract_text(p_elem)
--- a/converters/hwp.py
+++ b/converters/hwp.py
@@ -1,181 +1,36 @@
 #!/usr/bin/env python3
-"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
+"""HWP → Markdown (HwpToPdfConverter.exe → HWPX → MD)"""
 from __future__ import annotations
-import re
+import subprocess
 import shutil
 import tempfile
 from pathlib import Path
-
+_EXE_PATH = Path(__file__).parent.parent / 'HwpToHwpxConverter_260420' / 'HwpToPdfConverter.exe'
 def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
    import threading
    result = [False]
    def _run():
        try:
            import pythoncom, win32com.client
        except ImportError:
            return
        hwp = None
        try:
            pythoncom.CoInitialize()
            hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
            try:
                hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
            except Exception:
                pass
            ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
            if not ok:
                return
            hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
            result[0] = hml_path.exists()
        except Exception as e:
            print(f'  COM 오류: {e}')
        finally:
            if hwp:
                try: hwp.Quit()
                except Exception: pass
            try: pythoncom.CoUninitialize()
            except Exception: pass
    t = threading.Thread(target=_run, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        print(f'  COM 타임아웃 ({timeout}초) -> pyhwp로 전환')
    return result[0]
-def _table_to_md(table_elem) -> str:
+def _exe_hwp_to_hwpx(hwp_path: Path, timeout: int = 30) -> Path | None:
-    from bs4 import Tag
+    """exe로 HWP → HWPX 변환. 성공 시 생성된 .hwpx 경로 반환."""
-    rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
+    if not _EXE_PATH.exists():
-    if not rows:
+        print(f'  [경고] exe 없음: {_EXE_PATH}')
-        return ''
+        return None
-    has_merge = False
+    hwpx_path = hwp_path.with_suffix('.hwpx')
-    parsed = []
+    existed_before = hwpx_path.exists()
    for tr in rows:
        cells = []
        for td in tr.find_all(['td', 'th']):
            cs = int(td.get('colspan', 1))
            rs = int(td.get('rowspan', 1))
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
        if cells:
            parsed.append(cells)
    if not parsed:
        return ''
    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(parsed):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)
    else:
        rows_text = [[text for _, _, text in cells] for cells in parsed]
        mc = max(len(r) for r in rows_text)
        for r in rows_text:
            r += [''] * (mc - len(r))
        def esc(s): return s.replace('|', '\\|')
        lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
                 '| ' + ' | '.join(['---'] * mc) + ' |']
        for row in rows_text[1:]:
            lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
        return '\n'.join(lines)
 def _detect_structure(text: str):
    if not text: return 'paragraph', 0, text
    if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
    if re.match(r'^\d+\.\d+\s', text):       return 'heading', 3, text
    if re.match(r'^\d+\.\s.+', text):        return 'heading', 2, text
    if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
    if re.match(r'^[□■]\s*.+', text):        return 'heading', 2, text
    if re.match(r'^[○●◎]\s*.+', text):       return 'heading', 3, text
    if re.match(r'^[▶▷]\s*.+', text):        return 'heading', 4, text
    if re.match(r'^[▪▫\-]\s*.+', text):      return 'bullet',  0, text
    if re.match(r'^[※]', text):              return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
 def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
    try:
-        from hwp5.hwp5html import HTMLTransform
+        subprocess.run(
-        from hwp5.xmlmodel import Hwp5File
+            [str(_EXE_PATH), str(hwp_path)],
-        from bs4 import BeautifulSoup
+            timeout=timeout,
-    except ImportError as e:
+            capture_output=True,
-        print(f'  pyhwp/bs4 미설치: {e}')
+        )
-        return False
+        if hwpx_path.exists() and (not existed_before or hwpx_path.stat().st_mtime > hwp_path.stat().st_mtime):
-
+            return hwpx_path
-    tmp_dir = Path(tempfile.mkdtemp())
+        print(f'  [경고] exe 실행 후 .hwpx 파일 없음')
-    try:
+        return None
-        f = Hwp5File(str(hwp_path))
+    except subprocess.TimeoutExpired:
-        HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
+        print(f'  [경고] exe 타임아웃 ({timeout}초)')
-        xhtml_path = tmp_dir / 'index.xhtml'
+        return None
        if not xhtml_path.exists():
            return False
        images_dir = output_path.parent / f'{base_name}_images'
        images_dir.mkdir(exist_ok=True)
        img_map = {}
        bindata_dir = tmp_dir / 'bindata'
        if bindata_dir.exists():
            for img in bindata_dir.iterdir():
                shutil.copy(img, images_dir / img.name)
                img_map[img.name] = img.name
        soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
        for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
            area.decompose()
        md_lines = []
        img_counter = [0]
        for elem in soup.find_all(['p', 'table']):
            if elem.find_parent('table'):
                continue
            if elem.name == 'table':
                if not elem.find_parent('p'):
                    md = _table_to_md(elem)
                    if md:
                        md_lines += [md, '']
            elif elem.name == 'p':
                for img in elem.find_all('img'):
                    fn = Path(img.get('src', '')).name
                    if fn in img_map:
                        img_counter[0] += 1
                        md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
                inner = elem.find('table')
                if inner:
                    md = _table_to_md(inner)
                    if md:
                        md_lines += [md, '']
                    continue
                text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
                if not text:
                    continue
                kind, level, fmt = _detect_structure(text)
                if kind == 'heading':
                    if md_lines and md_lines[-1] != '':
                        md_lines.append('')
                    md_lines += [f'{"#" * level} {fmt}', '']
                elif kind == 'bullet':
                    md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}')
                else:
                    md_lines += [fmt, '']
        output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
        return True
    except Exception as e:
-        print(f'  pyhwp 오류: {e}')
+        print(f'  [경고] exe 오류: {e}')
-        return False
+        return None
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
 def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
@@ -190,22 +45,19 @@ def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
        "output": str(md_path), "format": "hwp",
    }
    try:
-        hml_path = md_path.with_suffix('.hml')
+        hwpx_path = _exe_hwp_to_hwpx(hwp_path)
-        if _com_hwp_to_hml(hwp_path, hml_path):
+        if hwpx_path:
-            try:
+            from converters.hwpx import convert_hwpx
-                from converters.hml import convert_hml
+            r = convert_hwpx(hwpx_path, output_dir)
-                r = convert_hml(hml_path, output_dir)
+            if r['status'] == 'ok':
-                hml_path.unlink(missing_ok=True)
+                result['images'] = r.get('images', [])
-                if r['status'] == 'ok':
+                return result
-                    return result
+            result['error'] = r.get('error', 'hwpx 변환 실패')
-            except Exception:
+        else:
-                hml_path.unlink(missing_ok=True)
+            result['error'] = 'HWP → HWPX 변환 실패 — HwpToPdfConverter.exe 확인 필요'
        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result
        result['status'] = 'error'
-        result['error'] = 'COM + pyhwp 모두 실패'
+        print(f'  [경고] HWP 변환 실패: {hwp_path.name}')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
--- a/converters/hwpx.py
+++ b/converters/hwpx.py
@@ -6,6 +6,13 @@ import re
 import zipfile
 import xml.etree.ElementTree as ET
 from pathlib import Path
 def _esc_path(s: str) -> str:
    return s.replace(' ', '%20').replace('[', '%5B').replace(']', '%5D')
 def _img_link(base_name: str, filename: str, idx: int) -> str:
    """Build the Markdown image tag for one extracted picture.

    Args:
        base_name: Document base name; the images folder is
            ``<base_name>_images``.
        filename: Image file name inside that folder.
        idx: Figure number shown in the alt text.

    Returns:
        A ``![그림 N](folder/file)`` string whose folder and file parts have
        each been passed through ``_esc_path``.
    """
    images_dir = f'{_esc_path(base_name)}_images'
    link_target = f'{images_dir}/{_esc_path(filename)}'
    return f'![그림 {idx}]({link_target})'
 NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
@@ -128,7 +135,7 @@ def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -
            ref_id = img_elem.get('binaryItemIDRef', '')
            filename = id_to_file.get(ref_id, '')
            if filename:
-                return [f'![그림 {idx+1}]({base_name}_images/{filename})']
+                return [_img_link(base_name, filename, idx + 1)]
        return [f'![그림 {idx+1}](그림_{idx+1}.png)']
    text = _extract_text(p_elem)