# Characters that are unsafe inside a Markdown link destination "](...)".
# '%' is listed so that a literal percent sign in a file name is not later
# decoded as the start of an escape sequence by the renderer.
_MD_PATH_ESCAPES = str.maketrans({
    '%': '%25',
    ' ': '%20',
    '[': '%5B',
    ']': '%5D',
    '(': '%28',
    ')': '%29',
    '#': '%23',
    '?': '%3F',
})


def _esc_path(s: str) -> str:
    """Percent-encode the characters that break a Markdown link destination.

    Spaces, square brackets and parentheses would end or confuse the
    ``![alt](path)`` syntax; ``#`` and ``?`` would be read as a URL fragment
    or query; a raw ``%`` would be mistaken for an existing escape. All other
    characters, including non-ASCII ones, are returned unchanged so the
    generated Markdown stays readable.

    Args:
        s: A path component such as a base name or an image file name.

    Returns:
        The component with the unsafe characters percent-encoded.
    """
    # A single translate pass cannot double-escape the '%' it just emitted,
    # which a chain of str.replace calls could.
    return s.translate(_MD_PATH_ESCAPES)
pythoncom.CoUninitialize() - except Exception: pass - - t = threading.Thread(target=_run, daemon=True) - t.start() - t.join(timeout) - if t.is_alive(): - print(f' COM 타임아웃 ({timeout}초) -> pyhwp로 전환') - return result[0] +_EXE_PATH = Path(__file__).parent.parent / 'HwpToHwpxConverter_260420' / 'HwpToPdfConverter.exe' -def _table_to_md(table_elem) -> str: - from bs4 import Tag - rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr') - if not rows: - return '' - has_merge = False - parsed = [] - for tr in rows: - cells = [] - for td in tr.find_all(['td', 'th']): - cs = int(td.get('colspan', 1)) - rs = int(td.get('rowspan', 1)) - if cs > 1 or rs > 1: - has_merge = True - cells.append((cs, rs, td.get_text(separator='
', strip=True))) - if cells: - parsed.append(cells) - if not parsed: - return '' - if has_merge: - lines = [''] - for ri, cells in enumerate(parsed): - lines.append('') - tag = 'th' if ri == 0 else 'td' - for cs, rs, text in cells: - attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '') - lines.append(f'<{tag}{attrs}>{text}') - lines.append('') - lines.append('
def _file_stamp(path: Path) -> tuple[int, int] | None:
    """Return ``(mtime_ns, size)`` for *path*, or ``None`` if it does not exist."""
    try:
        st = path.stat()
    except FileNotFoundError:
        return None
    return st.st_mtime_ns, st.st_size


def _exe_hwp_to_hwpx(
    hwp_path: Path,
    timeout: int = 30,
    exe_path: Path | None = None,
) -> Path | None:
    """Convert an HWP file to HWPX by running the external converter.

    The converter is invoked as ``<exe> <hwp_path>``. Its output is looked
    for next to the input, with the suffix replaced by ``.hwpx``.

    Args:
        hwp_path: HWP file to convert.
        timeout: Seconds to wait for the converter before giving up.
        exe_path: Converter executable to run. Defaults to the module-level
            ``_EXE_PATH``.

    Returns:
        The path of the ``.hwpx`` file when this run created or rewrote it;
        ``None`` when the executable is missing, times out, cannot be
        started, or leaves the output absent or untouched.
    """
    exe = _EXE_PATH if exe_path is None else exe_path
    if not exe.exists():
        print(f' [경고] exe 없음: {exe}')
        return None

    hwpx_path = hwp_path.with_suffix('.hwpx')
    try:
        # Snapshot the output before the run. Comparing against the input's
        # mtime instead would accept a leftover .hwpx from an earlier run
        # (possibly of a different document) even if the converter failed.
        before = _file_stamp(hwpx_path)
        subprocess.run(
            [str(exe), str(hwp_path)],
            timeout=timeout,
            capture_output=True,
        )
        after = _file_stamp(hwpx_path)
    except subprocess.TimeoutExpired:
        print(f' [경고] exe 타임아웃 ({timeout}초)')
        return None
    except Exception as e:
        # Boundary around a third-party executable: callers rely on a None
        # result rather than an exception for any launch failure.
        print(f' [경고] exe 오류: {e}')
        return None

    if after is not None and after != before:
        return hwpx_path
    print(' [경고] exe 실행 후 .hwpx 파일 없음')
    return None
# Characters that are unsafe inside a Markdown link destination "](...)".
# '%' is listed so that a literal percent sign in a file name is not later
# decoded as the start of an escape sequence by the renderer.
_MD_PATH_ESCAPES = str.maketrans({
    '%': '%25',
    ' ': '%20',
    '[': '%5B',
    ']': '%5D',
    '(': '%28',
    ')': '%29',
    '#': '%23',
    '?': '%3F',
})


def _esc_path(s: str) -> str:
    """Percent-encode the characters that break a Markdown link destination.

    Spaces, square brackets and parentheses would end or confuse the
    ``![alt](path)`` syntax; ``#`` and ``?`` would be read as a URL fragment
    or query; a raw ``%`` would be mistaken for an existing escape. All other
    characters, including non-ASCII ones, are returned unchanged.

    Args:
        s: A path component such as a base name or an image file name.

    Returns:
        The component with the unsafe characters percent-encoded.
    """
    # A single translate pass cannot double-escape the '%' it just emitted.
    return s.translate(_MD_PATH_ESCAPES)


def _img_link(base_name: str, filename: str, idx: int) -> str:
    """Build the Markdown image reference for an extracted picture.

    Args:
        base_name: Document base name; images live in ``<base_name>_images``.
        filename: Image file name inside that directory.
        idx: Picture number shown in the alt text.

    Returns:
        A ``![그림 N](<base>_images/<file>)`` string whose path components are
        escaped with :func:`_esc_path`.
    """
    path = f'{_esc_path(base_name)}_images/{_esc_path(filename)}'
    return f'![그림 {idx}]({path})'