#!/usr/bin/env python3
"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
from __future__ import annotations
import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
# Characters that are percent-encoded when a name is embedded in a link path.
_PATH_ESCAPES = str.maketrans({' ': '%20', '[': '%5B', ']': '%5D'})


def _esc_path(s: str) -> str:
    """Return *s* with spaces and square brackets percent-encoded.

    Only ``' '``, ``'['`` and ``']'`` are rewritten (to ``%20``, ``%5B`` and
    ``%5D``); every other character is passed through unchanged.
    """
    return s.translate(_PATH_ESCAPES)
def _img_link(base_name: str, filename: str, idx: int) -> str:
    """Return a Markdown image link for an extracted image.

    The link target is ``<base_name>_images/<filename>``, with both parts
    passed through ``_esc_path`` so spaces and square brackets do not break
    the link syntax. *idx* is used in the alt text.
    """
    path = f'{_esc_path(base_name)}_images/{_esc_path(filename)}'
    # NOTE(review): the previous body returned an empty f-string, leaving
    # ``path`` and ``idx`` unused, so no image link was ever emitted. The alt
    # text format below is a reconstruction - confirm against the expected
    # output of the converter.
    return f'![image_{idx}]({path})'
# XML namespace prefixes passed to ElementTree find()/findall() calls.
NS = dict(
    hp='http://www.hancom.co.kr/hwpml/2011/paragraph',
    hc='http://www.hancom.co.kr/hwpml/2011/core',
)
def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict[str, str]:
    """Copy every file stored under ``BinData/`` in *zf* into *images_dir*.

    *images_dir* is created (including parents) if it does not exist. Only
    the final path component of each archive member is used as the output
    filename, so members are written flat into *images_dir*.

    Returns a mapping of filename stem (name without extension) to the
    written filename. If two members share a stem, the later one wins.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file: dict[str, str] = {}
    for info in zf.infolist():
        # Directory entries (e.g. "BinData/") must be skipped explicitly:
        # Path("BinData/").name is "BinData", not "", so checking for an empty
        # basename alone would write a bogus empty file named "BinData".
        if info.is_dir() or not info.filename.startswith('BinData/'):
            continue
        filename = Path(info.filename).name
        if not filename:
            continue
        (images_dir / filename).write_bytes(zf.read(info))
        id_to_file[Path(filename).stem] = filename
    return id_to_file
def _extract_text(p_elem) -> str:
    """Return the text of a paragraph element as one stripped string.

    For each ``hp:run`` directly under *p_elem*, the non-empty ``.text`` of
    its direct ``hp:t`` children is collected in document order. A run that
    has at least one direct ``hp:tab`` child contributes a single space
    after its text. Leading and trailing whitespace of the joined result is
    removed.
    """
    chunks = []
    for run in p_elem.findall('hp:run', NS):
        chunks.extend(node.text for node in run.findall('hp:t', NS) if node.text)
        if run.find('hp:tab', NS) is not None:
            chunks.append(' ')
    return ''.join(chunks).strip()
# Ordered (pattern, kind, level) rules; the first pattern that matches wins.
_STRUCTURE_RULES = (
    (re.compile(r'^\d+\.\d+\.\d+\s'), 'heading', 4),
    (re.compile(r'^\d+\.\d+\s'), 'heading', 3),
    (re.compile(r'^\d+\.\s.+'), 'heading', 2),
    (re.compile(r'^[\d가-힣]+\)\s+.+'), 'heading', 3),
    (re.compile(r'^[□■]\s*.+'), 'heading', 2),
    (re.compile(r'^[○●◎]\s*.+'), 'heading', 3),
    (re.compile(r'^[▶▷]\s*.+'), 'heading', 4),
    (re.compile(r'^[▪▫\-]\s*.+'), 'bullet', 0),
)


def _detect_structure(text: str):
    """Classify a line of text by its leading numbering or marker.

    Returns a ``(kind, level, text)`` tuple where *kind* is ``'heading'``,
    ``'bullet'`` or ``'paragraph'``. *level* is the heading level (2-4) and
    is 0 for bullets and paragraphs. The returned text is the input
    unchanged, except that a line starting with ``※`` is returned as a
    paragraph prefixed with ``'> '``. Empty input is a level-0 paragraph.
    """
    if not text:
        return 'paragraph', 0, text
    for pattern, kind, level in _STRUCTURE_RULES:
        if pattern.match(text):
            return kind, level, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
def _cell_text(tc_elem) -> str:
    """Return the text of a table cell, one line per non-empty paragraph.

    Walks the ``hp:p`` children of each ``hp:subList`` directly under
    *tc_elem*. Paragraphs that contain an ``hp:tbl`` anywhere beneath them
    are skipped, as are paragraphs whose extracted text is empty. The
    remaining paragraph texts are joined with newline characters.
    """
    lines = []
    for sub in tc_elem.findall('hp:subList', NS):
        for para in sub.findall('hp:p', NS):
            # Skip paragraphs that contain a nested table.
            if para.find('.//hp:tbl', NS) is not None:
                continue
            text = _extract_text(para)
            if text:
                lines.append(text)
    # The separator is written as an escape sequence: a raw line break inside
    # a single-quoted literal is a syntax error.
    return '\n'.join(lines)
def _get_span(tc_elem):
    """Return ``(col_span, row_span)`` for a table cell element.

    The spans are read from the ``colSpan`` / ``rowSpan`` attributes of
    *tc_elem*, each defaulting to 1. If the cell has an ``hp:cellSpan``
    child, that child's ``colSpan`` / ``rowSpan`` attributes take
    precedence, falling back to the cell-level values when absent.
    """
    col_span = int(tc_elem.get('colSpan', 1))
    row_span = int(tc_elem.get('rowSpan', 1))
    span_elem = tc_elem.find('hp:cellSpan', NS)
    if span_elem is None:
        return col_span, row_span
    col_span = int(span_elem.get('colSpan', col_span))
    row_span = int(span_elem.get('rowSpan', row_span))
    return col_span, row_span
def _extract_table(tbl_elem) -> str:
has_merge = False
raw_rows = []
for tr in tbl_elem.findall('hp:tr', NS):
cells = []
for tc in tr.findall('hp:tc', NS):
cs, rs = _get_span(tc)
if cs > 1 or rs > 1:
has_merge = True
cells.append((cs, rs, _cell_text(tc)))
if cells:
raw_rows.append(cells)
if not raw_rows:
return ''
if has_merge:
lines = ['