diff --git a/converters/hml.py b/converters/hml.py
index d280678..723e1ec 100644
--- a/converters/hml.py
+++ b/converters/hml.py
@@ -6,6 +6,8 @@ import base64
import re
import xml.etree.ElementTree as ET
from pathlib import Path
+# Characters that would break a Markdown link destination, mapped to their
+# percent-encoded form. '%' is included so the encoding stays unambiguous.
+_PATH_ESCAPES = str.maketrans({
+    '%': '%25',
+    ' ': '%20',
+    '[': '%5B',
+    ']': '%5D',
+    '(': '%28',
+    ')': '%29',
+})
+
+
+def _esc_path(s: str) -> str:
+    """Percent-encode characters that are unsafe in a Markdown link target.
+
+    Spaces, square brackets and parentheses would terminate or confuse the
+    ``![alt](target)`` syntax, and a literal ``%`` could be misread as the
+    start of an escape sequence. All other characters (including non-ASCII
+    text) are returned unchanged.
+
+    Args:
+        s: A single path component such as a document stem or file name.
+
+    Returns:
+        The component with the unsafe characters percent-encoded.
+    """
+    # translate() works in a single pass, so the '%' introduced by one
+    # replacement is never re-escaped by another.
+    return s.translate(_PATH_ESCAPES)
def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
@@ -125,7 +127,7 @@ def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, bas
pic_counter[0] += 1
bid = bin_order[idx] if idx < len(bin_order) else None
filename = id_to_file.get(bid, '') if bid else ''
- ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
+ ref = f'{_esc_path(base_name)}_images/{_esc_path(filename)}' if filename else f'그림_{idx+1}.png'
lines.append(f'')
if not has_content:
text = _extract_text(p_elem)
diff --git a/converters/hwp.py b/converters/hwp.py
index a8036e9..fc2a307 100644
--- a/converters/hwp.py
+++ b/converters/hwp.py
@@ -1,181 +1,36 @@
#!/usr/bin/env python3
-"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
+"""HWP → Markdown (HwpToPdfConverter.exe → HWPX → MD)"""
from __future__ import annotations
-import re
-import shutil
-import tempfile
+import subprocess
from pathlib import Path
-
-def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
- import threading
- result = [False]
-
- def _run():
- try:
- import pythoncom, win32com.client
- except ImportError:
- return
- hwp = None
- try:
- pythoncom.CoInitialize()
- hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
- try:
- hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
- except Exception:
- pass
- ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
- if not ok:
- return
- hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
- result[0] = hml_path.exists()
- except Exception as e:
- print(f' COM 오류: {e}')
- finally:
- if hwp:
- try: hwp.Quit()
- except Exception: pass
- try: pythoncom.CoUninitialize()
- except Exception: pass
-
- t = threading.Thread(target=_run, daemon=True)
- t.start()
- t.join(timeout)
- if t.is_alive():
- print(f' COM 타임아웃 ({timeout}초) -> pyhwp로 전환')
- return result[0]
+# Location of the external converter executable: the 'HwpToHwpxConverter_260420'
+# directory that sits next to this module's own directory.
+# NOTE(review): the executable is named HwpToPdfConverter.exe although it is used
+# here to produce .hwpx files - confirm the file name is intentional.
+_EXE_PATH = Path(__file__).parent.parent / 'HwpToHwpxConverter_260420' / 'HwpToPdfConverter.exe'
-def _table_to_md(table_elem) -> str:
- from bs4 import Tag
- rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
- if not rows:
- return ''
- has_merge = False
- parsed = []
- for tr in rows:
- cells = []
- for td in tr.find_all(['td', 'th']):
- cs = int(td.get('colspan', 1))
- rs = int(td.get('rowspan', 1))
- if cs > 1 or rs > 1:
- has_merge = True
- cells.append((cs, rs, td.get_text(separator='
', strip=True)))
- if cells:
- parsed.append(cells)
- if not parsed:
- return ''
- if has_merge:
- lines = ['
']
- for ri, cells in enumerate(parsed):
- lines.append('')
- tag = 'th' if ri == 0 else 'td'
- for cs, rs, text in cells:
- attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
- lines.append(f'<{tag}{attrs}>{text}{tag}>')
- lines.append('
')
- lines.append('
')
- return '\n'.join(lines)
- else:
- rows_text = [[text for _, _, text in cells] for cells in parsed]
- mc = max(len(r) for r in rows_text)
- for r in rows_text:
- r += [''] * (mc - len(r))
- def esc(s): return s.replace('|', '\\|')
- lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
- '| ' + ' | '.join(['---'] * mc) + ' |']
- for row in rows_text[1:]:
- lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
- return '\n'.join(lines)
-
-
-def _detect_structure(text: str):
- if not text: return 'paragraph', 0, text
- if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
- if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
- if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
- if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
- if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
- if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
- if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
- if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
- if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
- return 'paragraph', 0, text
-
-
-def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
+def _exe_hwp_to_hwpx(hwp_path: Path, timeout: int = 30) -> Path | None:
+ """Convert an HWP file to HWPX by running the external converter executable.
+
+ The executable at ``_EXE_PATH`` is run with the source path as its only
+ argument. The result is looked for next to the source file, under the same
+ name with a ``.hwpx`` suffix.
+
+ Args:
+     hwp_path: Path of the ``.hwp`` file to convert.
+     timeout: Maximum number of seconds to wait for the executable.
+
+ Returns:
+     The path of the ``.hwpx`` file when it exists after the run and either
+     did not exist beforehand or is newer than the source file; otherwise
+     ``None``. A missing executable, a timeout, or any other exception is
+     reported with ``print`` and also yields ``None``.
+ """
+ # Without the converter there is nothing to run; warn and give up early.
+ if not _EXE_PATH.exists():
+ print(f' [경고] exe 없음: {_EXE_PATH}')
+ return None
+ hwpx_path = hwp_path.with_suffix('.hwpx')
+ # Remember whether an output file was already present so a leftover file
+ # is not automatically treated as a fresh conversion result.
+ existed_before = hwpx_path.exists()
try:
- from hwp5.hwp5html import HTMLTransform
- from hwp5.xmlmodel import Hwp5File
- from bs4 import BeautifulSoup
- except ImportError as e:
- print(f' pyhwp/bs4 미설치: {e}')
- return False
-
- tmp_dir = Path(tempfile.mkdtemp())
- try:
- f = Hwp5File(str(hwp_path))
- HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
- xhtml_path = tmp_dir / 'index.xhtml'
- if not xhtml_path.exists():
- return False
-
- images_dir = output_path.parent / f'{base_name}_images'
- images_dir.mkdir(exist_ok=True)
- img_map = {}
- bindata_dir = tmp_dir / 'bindata'
- if bindata_dir.exists():
- for img in bindata_dir.iterdir():
- shutil.copy(img, images_dir / img.name)
- img_map[img.name] = img.name
-
- soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
- for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
- area.decompose()
-
- md_lines = []
- img_counter = [0]
- for elem in soup.find_all(['p', 'table']):
- if elem.find_parent('table'):
- continue
- if elem.name == 'table':
- if not elem.find_parent('p'):
- md = _table_to_md(elem)
- if md:
- md_lines += [md, '']
- elif elem.name == 'p':
- for img in elem.find_all('img'):
- fn = Path(img.get('src', '')).name
- if fn in img_map:
- img_counter[0] += 1
- md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
- inner = elem.find('table')
- if inner:
- md = _table_to_md(inner)
- if md:
- md_lines += [md, '']
- continue
- text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
- if not text:
- continue
- kind, level, fmt = _detect_structure(text)
- if kind == 'heading':
- if md_lines and md_lines[-1] != '':
- md_lines.append('')
- md_lines += [f'{"#" * level} {fmt}', '']
- elif kind == 'bullet':
- md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}')
- else:
- md_lines += [fmt, '']
-
- output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
- return True
+ # Run the converter with the source path as its only argument. stdout and
+ # stderr are captured, and the returned CompletedProcess is not kept.
+ subprocess.run(
+ [str(_EXE_PATH), str(hwp_path)],
+ timeout=timeout,
+ capture_output=True,
+ )
+ # NOTE(review): the exit code is not checked; success is judged only by
+ # the presence and modification time of the output file.
+ # Accept the output if it exists now and either was not there before the
+ # run or is newer than the source file.
+ if hwpx_path.exists() and (not existed_before or hwpx_path.stat().st_mtime > hwp_path.stat().st_mtime):
+ return hwpx_path
+ print(f' [경고] exe 실행 후 .hwpx 파일 없음')
+ return None
+ except subprocess.TimeoutExpired:
+ # The converter did not finish within `timeout` seconds.
+ print(f' [경고] exe 타임아웃 ({timeout}초)')
+ return None
except Exception as e:
- print(f' pyhwp 오류: {e}')
- return False
- finally:
- shutil.rmtree(tmp_dir, ignore_errors=True)
+ print(f' [경고] exe 오류: {e}')
+ return None
def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
@@ -190,22 +45,19 @@ def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
"output": str(md_path), "format": "hwp",
}
try:
- hml_path = md_path.with_suffix('.hml')
- if _com_hwp_to_hml(hwp_path, hml_path):
- try:
- from converters.hml import convert_hml
- r = convert_hml(hml_path, output_dir)
- hml_path.unlink(missing_ok=True)
- if r['status'] == 'ok':
- return result
- except Exception:
- hml_path.unlink(missing_ok=True)
-
- if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
- return result
+ hwpx_path = _exe_hwp_to_hwpx(hwp_path)
+ if hwpx_path:
+ from converters.hwpx import convert_hwpx
+ r = convert_hwpx(hwpx_path, output_dir)
+ if r['status'] == 'ok':
+ result['images'] = r.get('images', [])
+ return result
+ result['error'] = r.get('error', 'hwpx 변환 실패')
+ else:
+ result['error'] = 'HWP → HWPX 변환 실패 — HwpToPdfConverter.exe 확인 필요'
result['status'] = 'error'
- result['error'] = 'COM + pyhwp 모두 실패'
+ print(f' [경고] HWP 변환 실패: {hwp_path.name}')
except Exception as e:
result['status'] = 'error'
result['error'] = str(e)
diff --git a/converters/hwpx.py b/converters/hwpx.py
index 52d4a2e..b3235aa 100644
--- a/converters/hwpx.py
+++ b/converters/hwpx.py
@@ -6,6 +6,13 @@ import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
+# Characters that would break a Markdown link destination, mapped to their
+# percent-encoded form. '%' is included so the encoding stays unambiguous.
+_PATH_ESCAPES = str.maketrans({
+    '%': '%25',
+    ' ': '%20',
+    '[': '%5B',
+    ']': '%5D',
+    '(': '%28',
+    ')': '%29',
+})
+
+
+def _esc_path(s: str) -> str:
+    """Percent-encode characters that are unsafe in a Markdown link target.
+
+    Spaces, square brackets and parentheses would terminate or confuse the
+    ``![alt](target)`` syntax, and a literal ``%`` could be misread as the
+    start of an escape sequence. All other characters (including non-ASCII
+    text) are returned unchanged.
+
+    Args:
+        s: A single path component such as a document stem or file name.
+
+    Returns:
+        The component with the unsafe characters percent-encoded.
+    """
+    # translate() works in a single pass, so the '%' introduced by one
+    # replacement is never re-escaped by another.
+    return s.translate(_PATH_ESCAPES)
+
+
+def _img_link(base_name: str, filename: str, idx: int) -> str:
+    """Build the Markdown image reference for an extracted picture.
+
+    Args:
+        base_name: Document stem; the image is addressed inside the
+            ``<base_name>_images`` directory.
+        filename: File name of the image inside that directory.
+        idx: Picture number shown in the alt text.
+
+    Returns:
+        A Markdown image tag whose target is the escaped relative path.
+    """
+    # Escape each component separately so the '/' separator and the
+    # '_images' suffix are left untouched.
+    path = f'{_esc_path(base_name)}_images/{_esc_path(filename)}'
+    return f'![그림 {idx}]({path})'
NS = {
'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
@@ -128,7 +135,7 @@ def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -
ref_id = img_elem.get('binaryItemIDRef', '')
filename = id_to_file.get(ref_id, '')
if filename:
- return [f'']
+ return [_img_link(base_name, filename, idx + 1)]
return [f'']
text = _extract_text(p_elem)