#!/usr/bin/env python3
"""HWPX → Markdown (direct ZIP + XML parsing, including image extraction)."""
from __future__ import annotations

import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path


def _esc_path(s: str) -> str:
    """Percent-encode spaces and square brackets for use in a Markdown link target."""
    return s.replace(' ', '%20').replace('[', '%5B').replace(']', '%5D')


def _img_link(base_name: str, filename: str, idx: int) -> str:
    """Return a Markdown image link pointing into ``<base_name>_images/``.

    ``idx`` is the 1-based figure number shown in the alt text.
    """
    path = f'{_esc_path(base_name)}_images/{_esc_path(filename)}'
    return f'![그림 {idx}]({path})'


# XML namespaces used when querying section documents.
NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}

# Fully qualified tag of an ``hp:tab`` element, for direct tag comparison.
_HP_TAB = '{' + NS['hp'] + '}tab'

# Ordered (pattern, kind, level) rules for _detect_structure; first match wins,
# so the more specific numbered patterns must stay ahead of the general ones.
_STRUCTURE_RULES = (
    (re.compile(r'^\d+\.\d+\.\d+\s'), 'heading', 4),
    (re.compile(r'^\d+\.\d+\s'), 'heading', 3),
    (re.compile(r'^\d+\.\s.+'), 'heading', 2),
    (re.compile(r'^[\d가-힣]+\)\s+.+'), 'heading', 3),
    (re.compile(r'^[□■]\s*.+'), 'heading', 2),
    (re.compile(r'^[○●◎]\s*.+'), 'heading', 3),
    (re.compile(r'^[▶▷]\s*.+'), 'heading', 4),
    (re.compile(r'^[▪▫\-]\s*.+'), 'bullet', 0),
)

# Leading bullet marker (and following whitespace) removed from bullet lines.
_BULLET_PREFIX = re.compile(r'^[▪▫\-]\s*')


def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict[str, str]:
    """Write every file under ``BinData/`` into ``images_dir``.

    Only the final path component of each archive member is used as the output
    file name, so members cannot be written outside ``images_dir``.

    Returns:
        Mapping from file stem (name without extension) to the written file
        name. ``_process_para`` looks image references up in this mapping.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file: dict[str, str] = {}
    for name in zf.namelist():
        if not name.startswith('BinData/'):
            continue
        # Directory entries end with '/'. Path('BinData/').name is 'BinData',
        # not '', so they must be rejected before taking the basename;
        # otherwise an empty file called 'BinData' would be created.
        if name.endswith('/'):
            continue
        filename = Path(name).name
        if not filename or filename in ('.', '..'):
            continue
        (images_dir / filename).write_bytes(zf.read(name))
        id_to_file[Path(filename).stem] = filename
    return id_to_file


def _extract_text(p_elem: ET.Element) -> str:
    """Return the stripped, concatenated text of the ``hp:run`` children of a paragraph.

    For each ``hp:t`` the element's own text and the text that follows each of
    its direct children (their ``tail``) are collected, so content after an
    inline child element is not lost. An ``hp:tab`` is rendered as one space,
    both inline in ``hp:t`` and as a direct child of the run.
    """
    parts: list[str] = []
    for run in p_elem.findall('hp:run', NS):
        for t in run.findall('hp:t', NS):
            pieces = [t.text or '']
            # NOTE(review): assumes inline children of hp:t carry no text of
            # their own; only their tails are collected.
            for child in t:
                if child.tag == _HP_TAB:
                    pieces.append(' ')
                pieces.append(child.tail or '')
            text = ''.join(pieces)
            if text:
                parts.append(text)
        if run.find('hp:tab', NS) is not None:
            parts.append(' ')
    return ''.join(parts).strip()


def _detect_structure(text: str):
    """Classify a line of text by its leading marker.

    Returns:
        ``(kind, level, text)`` where ``kind`` is ``'heading'``, ``'bullet'``
        or ``'paragraph'``. ``level`` is the heading depth (0 for non-headings).
        Lines starting with ``※`` are returned as paragraphs prefixed with
        ``'> '``; all other text is returned unchanged.
    """
    if not text:
        return 'paragraph', 0, text
    for pattern, kind, level in _STRUCTURE_RULES:
        if pattern.match(text):
            return kind, level, text
    if text.startswith('※'):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text


def _cell_text(tc_elem: ET.Element) -> str:
    """Return the text of a table cell, one entry per non-empty paragraph.

    Paragraphs that contain a nested table are skipped.
    """
    parts = []
    for sub in tc_elem.findall('hp:subList', NS):
        for p in sub.findall('hp:p', NS):
            if p.find('.//hp:tbl', NS) is not None:
                continue
            text = _extract_text(p)
            if text:
                parts.append(text)
    # NOTE(review): the separator literal reached review as a raw line break
    # inside the quotes; it may originally have been an HTML '<br>' — confirm.
    return '\n'.join(parts)


def _get_span(tc_elem: ET.Element):
    """Return ``(col_span, row_span)`` for a cell.

    Values come from the cell's own ``colSpan``/``rowSpan`` attributes
    (default 1) and are overridden by an ``hp:cellSpan`` child when present.
    """
    cs = int(tc_elem.get('colSpan', 1))
    rs = int(tc_elem.get('rowSpan', 1))
    span = tc_elem.find('hp:cellSpan', NS)
    if span is None:
        return cs, rs
    return int(span.get('colSpan', cs)), int(span.get('rowSpan', rs))


def _html_table(raw_rows) -> str:
    """Render rows of ``(col_span, row_span, text)`` cells as an HTML table.

    The first row uses ``<th>`` cells, the rest ``<td>``.
    """
    # NOTE(review): the '<table>', '<tr>' and closing-tag literals were empty
    # strings in the reviewed text (markup looked stripped); they are restored
    # here to match the th/td cells the code emits — confirm against the
    # pristine file. Cell text is inserted without HTML escaping.
    lines = ['<table>']
    for ri, cells in enumerate(raw_rows):
        tag = 'th' if ri == 0 else 'td'
        lines.append('<tr>')
        for cs, rs, text in cells:
            attrs = ''
            if cs > 1:
                attrs += f' colspan="{cs}"'
            if rs > 1:
                attrs += f' rowspan="{rs}"'
            lines.append(f'<{tag}{attrs}>{text}</{tag}>')
        lines.append('</tr>')
    lines.append('</table>')
    return '\n'.join(lines)


def _markdown_table(raw_rows) -> str:
    """Render rows of ``(col_span, row_span, text)`` cells as a pipe table.

    Short rows are padded with empty cells; the first row is the header.
    """
    rows = [[text for _, _, text in cells] for cells in raw_rows]
    width = max(len(row) for row in rows)

    def fmt_row(row):
        padded = row + [''] * (width - len(row))
        # Pipes would split the cell and newlines would end the table row.
        cells = (c.replace('|', '\\|').replace('\n', ' ') for c in padded)
        return '| ' + ' | '.join(cells) + ' |'

    lines = [fmt_row(rows[0]), '| ' + ' | '.join(['---'] * width) + ' |']
    lines.extend(fmt_row(row) for row in rows[1:])
    return '\n'.join(lines)


def _extract_table(tbl_elem: ET.Element) -> str:
    """Convert an ``hp:tbl`` element to Markdown, or to HTML if any cell is merged.

    Returns an empty string when the table has no cells.
    """
    has_merge = False
    raw_rows = []
    for tr in tbl_elem.findall('hp:tr', NS):
        cells = []
        for tc in tr.findall('hp:tc', NS):
            cs, rs = _get_span(tc)
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, _cell_text(tc)))
        if cells:
            raw_rows.append(cells)
    if not raw_rows:
        return ''
    # Pipe tables cannot express colspan/rowspan, so merged tables use HTML.
    return _html_table(raw_rows) if has_merge else _markdown_table(raw_rows)


def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one paragraph element into zero or more Markdown lines.

    Checked in order: a contained table, a contained picture, then plain text
    classified by ``_detect_structure``.

    Args:
        p_elem: The paragraph element.
        pic_counter: One-element list holding the number of pictures seen so
            far; incremented in place for every picture encountered.
        id_to_file: Mapping from binary item id to extracted image file name.
        base_name: Document stem used to build the image directory name.
    """
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        pic_counter[0] += 1
        number = pic_counter[0]
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            filename = id_to_file.get(img_elem.get('binaryItemIDRef', ''), '')
            if filename:
                return [_img_link(base_name, filename, number)]
        # No resolvable image file: emit a numbered placeholder link.
        return [f'![그림 {number}](그림_{number}.png)']

    text = _extract_text(p_elem)
    if not text:
        return []
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    if kind == 'bullet':
        # The marker is stripped outside the f-string: the previous in-line
        # pattern r"^[▪▫-]\\s*" required a literal backslash and never matched.
        return [f'- {_BULLET_PREFIX.sub("", fmt)}']
    return [fmt]


# NOTE(review): `def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:`
# begins at the end of original line 4 and runs past the reviewed text; it was
# not reviewed and is not reproduced here. Keep its header when merging.
AGENT_GUIDE 스펙 dict 반환.""" hwpx_path = Path(hwpx_path) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) md_path = output_dir / f'{hwpx_path.stem}.md' images_dir = output_dir / f'{hwpx_path.stem}_images' result = { "status": "ok", "input": str(hwpx_path), "output": str(md_path), "format": "hwpx", "images": [], } try: with zipfile.ZipFile(hwpx_path, 'r') as zf: id_to_file = _extract_images(zf, images_dir) result["images"] = [str(images_dir / f) for f in id_to_file.values()] section_files = sorted( n for n in zf.namelist() if re.match(r'Contents/section\d+\.xml', n) ) md_lines: list[str] = [] pic_counter = [0] for sec_file in section_files: root = ET.fromstring(zf.read(sec_file)) for p_elem in root.findall('hp:p', NS): if p_elem.find('.//hp:secPr', NS) is not None: continue for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem): if line.startswith('#') or line.startswith('