#!/usr/bin/env python3
"""HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
from __future__ import annotations
import base64
import re
import xml.etree.ElementTree as ET
from pathlib import Path
def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
    """Decode the Base64 images embedded in the document and save them.

    Extraction is best-effort: an entry that cannot be decoded or written
    is skipped and simply does not appear in the returned mapping.

    Args:
        tree: Parsed document; only its ``find`` / ``findall`` methods are
            used, to locate ``BINITEM``, ``BINDATA`` and ``BODY`` elements.
        images_dir: Directory the image files are written to. It is created
            (including parents) when it does not exist yet.

    Returns:
        A ``(id_to_file, bin_order)`` pair.

        ``id_to_file`` maps the ``Id`` attribute of every ``BINDATA`` that
        was written successfully to its file name, which has the form
        ``BIN<Id zero-padded to 4 digits>.<format>``.

        ``bin_order`` holds one entry per ``PICTURE`` found under ``BODY``,
        in document order: the ``BinItem`` attribute of the picture's first
        ``IMAGE`` descendant, or ``None`` when it has no ``IMAGE``.
    """
    images_dir.mkdir(parents=True, exist_ok=True)

    # BINITEM carries the image format. A missing *or empty* Format falls
    # back to PNG so the file name never ends in a bare dot.
    bin_format = {
        item.get('BinData'): (item.get('Format') or 'PNG').lower()
        for item in tree.findall('.//BINITEM')
        if item.get('BinData')
    }

    id_to_file = {}
    for bindata in tree.findall('.//BINDATA'):
        bid = bindata.get('Id')
        raw = (bindata.text or '').strip()
        if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
            continue

        try:
            index = int(bid)
        except (TypeError, ValueError):
            # Missing (None) or non-numeric Id: no file name can be built.
            # Skip this entry instead of aborting the whole extraction.
            continue

        fmt = bin_format.get(bid, 'png')
        filename = f'BIN{index:04d}.{fmt}'
        try:
            (images_dir / filename).write_bytes(base64.b64decode(raw))
        except (ValueError, OSError):
            # ValueError covers binascii.Error (malformed Base64); OSError
            # covers a failed write. Either way the image is left unmapped.
            continue
        id_to_file[bid] = filename

    bin_order = []
    body = tree.find('.//BODY')
    if body is not None:
        for pic in body.findall('.//PICTURE'):
            first_image = pic.find('.//IMAGE')
            bin_order.append(
                first_image.get('BinItem') if first_image is not None else None
            )
    return id_to_file, bin_order
def _extract_text(p_elem) -> str:
    """Return the visible text of one paragraph element.

    Walks the direct ``TEXT`` children of ``p_elem`` and concatenates the
    text of their ``CHAR`` children; every ``TAB`` element contributes a
    single space. The joined result is stripped of surrounding whitespace.

    Args:
        p_elem: Paragraph element whose direct ``TEXT`` children are read.

    Returns:
        The concatenated text, or ``''`` when the paragraph has none.
    """
    parts = []
    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'CHAR':
                if child.text:
                    parts.append(child.text)
                # Element.text stops at the first sub-element, so text that
                # follows an inline child of CHAR lives in that child's
                # ``tail`` and would otherwise be lost.
                for inline in child:
                    if inline.tag == 'TAB':
                        parts.append(' ')
                    if inline.tail:
                        parts.append(inline.tail)
            elif child.tag == 'TAB':
                parts.append(' ')
    return ''.join(parts).strip()
def _detect_structure(text: str):
    """Classify a line of text by the marker it starts with.

    Args:
        text: Paragraph text to classify.

    Returns:
        A ``(kind, level, text)`` tuple. ``kind`` is ``'heading'``,
        ``'bullet'`` or ``'paragraph'``; ``level`` is the heading depth
        (2-4) and ``0`` for anything that is not a heading. ``text`` is
        returned unchanged except for lines starting with ``※``, which
        come back prefixed with ``'> '``.
    """
    fallback = ('paragraph', 0, text)
    if not text:
        return fallback

    # First match wins, so the dotted-number patterns go from the deepest
    # numbering ("1.2.3 ") to the shallowest ("1. ") before the symbol rules.
    rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in rules:
        if re.match(pattern, text):
            return kind, level, text

    # A leading reference mark keeps the paragraph kind but gets '> ' in front.
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return fallback
def _extract_table(table_elem) -> str:
col_count = int(table_elem.get('ColCount', 0))
has_merge = False
raw_rows = []
for ri, row_elem in enumerate(table_elem.findall('.//ROW')):
cells = []
for cell_elem in row_elem.findall('CELL'):
cs = int(cell_elem.get('ColSpan', 1))
rs = int(cell_elem.get('RowSpan', 1))
ca = int(cell_elem.get('ColAddr', 0))
if cs > 1 or rs > 1:
has_merge = True
parts = [_extract_text(p) for p in cell_elem.findall('.//P')]
cells.append((ca, cs, rs, '
'.join(p for p in parts if p)))
if cells:
raw_rows.append((ri, cells))
if not raw_rows:
return ''
if has_merge:
lines = ['