# doc2md/converters/hwpx.py

#!/usr/bin/env python3
"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
from __future__ import annotations

import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
def _esc_path(s: str) -> str:
    return s.replace(' ', '%20').replace('[', '%5B').replace(']', '%5D')


def _img_link(base_name: str, filename: str, idx: int) -> str:
    path = f'{_esc_path(base_name)}_images/{_esc_path(filename)}'
    return f'![그림 {idx}]({path})'

NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}


def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file = {}
    for name in zf.namelist():
        if not name.startswith('BinData/'):
            continue
        filename = Path(name).name
        if not filename:
            continue
        out_path = images_dir / filename
        out_path.write_bytes(zf.read(name))
        id_to_file[Path(filename).stem] = filename
    return id_to_file


def _extract_text(p_elem) -> str:
    parts = []
    for run in p_elem.findall('hp:run', NS):
        for t in run.findall('hp:t', NS):
            if t.text:
                parts.append(t.text)
        if run.findall('hp:tab', NS):
            parts.append(' ')
    return ''.join(parts).strip()


def _detect_structure(text: str):
    if not text: return 'paragraph', 0, text
    if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
    if re.match(r'^\d+\.\d+\s', text):       return 'heading', 3, text
    if re.match(r'^\d+\.\s.+', text):        return 'heading', 2, text
    if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
    if re.match(r'^[□■]\s*.+', text):        return 'heading', 2, text
    if re.match(r'^[○●◎]\s*.+', text):       return 'heading', 3, text
    if re.match(r'^[▶▷]\s*.+', text):        return 'heading', 4, text
    if re.match(r'^[▪▫\-]\s*.+', text):      return 'bullet',  0, text
    if re.match(r'^[※]', text):              return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text


def _cell_text(tc_elem) -> str:
    parts = []
    for sub in tc_elem.findall('hp:subList', NS):
        for p in sub.findall('hp:p', NS):
            if p.find('.//hp:tbl', NS) is not None:
                continue
            t = _extract_text(p)
            if t:
                parts.append(t)
    return '<br>'.join(parts)


def _get_span(tc_elem):
    cs = int(tc_elem.get('colSpan', 1))
    rs = int(tc_elem.get('rowSpan', 1))
    span = tc_elem.find('hp:cellSpan', NS)
    if span is not None:
        cs = int(span.get('colSpan', cs))
        rs = int(span.get('rowSpan', rs))
    return cs, rs


def _extract_table(tbl_elem) -> str:
    has_merge = False
    raw_rows = []
    for tr in tbl_elem.findall('hp:tr', NS):
        cells = []
        for tc in tr.findall('hp:tc', NS):
            cs, rs = _get_span(tc)
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, _cell_text(tc)))
        if cells:
            raw_rows.append(cells)
    if not raw_rows:
        return ''
    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(raw_rows):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)
    else:
        rows = [[t for _, _, t in cells] for cells in raw_rows]
        mc = max(len(r) for r in rows)
        for r in rows:
            r += [''] * (mc - len(r))
        def esc(s): return s.replace('|', '\\|').replace('\n', ' ')
        lines = ['| ' + ' | '.join(esc(c) for c in rows[0]) + ' |',
                 '| ' + ' | '.join(['---'] * mc) + ' |']
        for row in rows[1:]:
            lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
        return '\n'.join(lines)


def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            ref_id = img_elem.get('binaryItemIDRef', '')
            filename = id_to_file.get(ref_id, '')
            if filename:
                return [_img_link(base_name, filename, idx + 1)]
        return [f'![그림 {idx+1}](그림_{idx+1}.png)']

    text = _extract_text(p_elem)
    if not text:
        return []
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    elif kind == 'bullet':
        return [f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}']
    return [fmt]


def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """HWPX → MD. AGENT_GUIDE 스펙 dict 반환."""
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'

    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / f) for f in id_to_file.values()]

            section_files = sorted(
                n for n in zf.namelist()
                if re.match(r'Contents/section\d+\.xml', n)
            )
            md_lines: list[str] = []
            pic_counter = [0]
            for sec_file in section_files:
                root = ET.fromstring(zf.read(sec_file))
                for p_elem in root.findall('hp:p', NS):
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue
                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
                        if line.startswith('#') or line.startswith('<table') or line.startswith('|') or line.startswith('!['):
                            if md_lines and md_lines[-1] != '':
                                md_lines.append('')
                            md_lines.append(line)
                            md_lines.append('')
                        elif line:
                            md_lines.append(line)
                            md_lines.append('')

        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result