#!/usr/bin/env python3 """HWP → Markdown (COM 자동화 우선, pyhwp fallback)""" from __future__ import annotations import re import shutil import tempfile from pathlib import Path def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool: import threading result = [False] def _run(): try: import pythoncom, win32com.client except ImportError: return hwp = None try: pythoncom.CoInitialize() hwp = win32com.client.Dispatch('HWPFrame.HwpObject') try: hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule') except Exception: pass ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true') if not ok: return hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '') result[0] = hml_path.exists() except Exception as e: print(f' COM 오류: {e}') finally: if hwp: try: hwp.Quit() except Exception: pass try: pythoncom.CoUninitialize() except Exception: pass t = threading.Thread(target=_run, daemon=True) t.start() t.join(timeout) if t.is_alive(): print(f' COM 타임아웃 ({timeout}초) -> pyhwp로 전환') return result[0] def _table_to_md(table_elem) -> str: from bs4 import Tag rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr') if not rows: return '' has_merge = False parsed = [] for tr in rows: cells = [] for td in tr.find_all(['td', 'th']): cs = int(td.get('colspan', 1)) rs = int(td.get('rowspan', 1)) if cs > 1 or rs > 1: has_merge = True cells.append((cs, rs, td.get_text(separator='
', strip=True))) if cells: parsed.append(cells) if not parsed: return '' if has_merge: lines = [''] for ri, cells in enumerate(parsed): lines.append('') tag = 'th' if ri == 0 else 'td' for cs, rs, text in cells: attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '') lines.append(f'<{tag}{attrs}>{text}') lines.append('') lines.append('
') return '\n'.join(lines) else: rows_text = [[text for _, _, text in cells] for cells in parsed] mc = max(len(r) for r in rows_text) for r in rows_text: r += [''] * (mc - len(r)) def esc(s): return s.replace('|', '\\|') lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |', '| ' + ' | '.join(['---'] * mc) + ' |'] for row in rows_text[1:]: lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |') return '\n'.join(lines) def _detect_structure(text: str): if not text: return 'paragraph', 0, text if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}' return 'paragraph', 0, text def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool: try: from hwp5.hwp5html import HTMLTransform from hwp5.xmlmodel import Hwp5File from bs4 import BeautifulSoup except ImportError as e: print(f' pyhwp/bs4 미설치: {e}') return False tmp_dir = Path(tempfile.mkdtemp()) try: f = Hwp5File(str(hwp_path)) HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir)) xhtml_path = tmp_dir / 'index.xhtml' if not xhtml_path.exists(): return False images_dir = output_path.parent / f'{base_name}_images' images_dir.mkdir(exist_ok=True) img_map = {} bindata_dir = tmp_dir / 'bindata' if bindata_dir.exists(): for img in bindata_dir.iterdir(): shutil.copy(img, images_dir / img.name) img_map[img.name] = img.name soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml') for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')): area.decompose() md_lines = [] img_counter = [0] for elem in soup.find_all(['p', 'table']): if elem.find_parent('table'): continue if elem.name == 'table': if not elem.find_parent('p'): md = _table_to_md(elem) if md: md_lines += [md, ''] elif elem.name == 'p': for img in elem.find_all('img'): fn = Path(img.get('src', '')).name if fn in img_map: img_counter[0] += 1 md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', ''] inner = elem.find('table') if inner: md = _table_to_md(inner) if md: md_lines += [md, ''] continue text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip() if not text: continue kind, level, fmt = _detect_structure(text) if kind == 'heading': if md_lines and md_lines[-1] != '': md_lines.append('') md_lines += [f'{"#" * level} {fmt}', ''] elif kind == 'bullet': md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}') else: md_lines += [fmt, ''] output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8') return True except Exception as e: print(f' pyhwp 오류: {e}') return False finally: shutil.rmtree(tmp_dir, ignore_errors=True) def convert_hwp(hwp_path: Path, output_dir: Path) -> dict: """HWP → MD. AGENT_GUIDE 스펙 dict 반환.""" hwp_path = Path(hwp_path) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) md_path = output_dir / f'{hwp_path.stem}.md' result = { "status": "ok", "input": str(hwp_path), "output": str(md_path), "format": "hwp", } try: hml_path = md_path.with_suffix('.hml') if _com_hwp_to_hml(hwp_path, hml_path): try: from converters.hml import convert_hml r = convert_hml(hml_path, output_dir) hml_path.unlink(missing_ok=True) if r['status'] == 'ok': return result except Exception: hml_path.unlink(missing_ok=True) if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem): return result result['status'] = 'error' result['error'] = 'COM + pyhwp 모두 실패' except Exception as e: result['status'] = 'error' result['error'] = str(e) return result