- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드 - converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링 - converters/hwp.py: COM 자동화 + pyhwp fallback - converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출 - converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표 - converters/html.py: html2text (body_width=0) - requirements.txt: 최소 의존성 - .env.example: 환경변수 템플릿 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
189 lines
6.7 KiB
Python
189 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
from pathlib import Path
|
|
|
|
# Namespace prefix map passed to every ElementTree find()/findall() call in
# this module ('hp' = paragraph namespace, 'hc' = core namespace).
NS = dict(
    hp='http://www.hancom.co.kr/hwpml/2011/paragraph',
    hc='http://www.hancom.co.kr/hwpml/2011/core',
)
|
|
|
|
|
|
def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
    """Copy every file stored under ``BinData/`` in the archive to *images_dir*.

    Args:
        zf: Open HWPX (ZIP) archive.
        images_dir: Destination directory; created (with parents) if missing,
            even when the archive holds no ``BinData/`` files.

    Returns:
        Mapping from each extracted file's stem (name without extension) to
        its file name inside *images_dir*. Only the base name of each archive
        member is used, so nested members are flattened into *images_dir*.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file = {}
    for name in zf.namelist():
        if not name.startswith('BinData/'):
            continue
        # Directory entries end with '/'. Path('BinData/').name is 'BinData'
        # (not ''), so the empty-name check below would not catch them and an
        # empty file named 'BinData' would be written and mapped.
        if name.endswith('/'):
            continue
        filename = Path(name).name
        if not filename:
            continue
        (images_dir / filename).write_bytes(zf.read(name))
        id_to_file[Path(filename).stem] = filename
    return id_to_file
|
|
|
|
|
|
def _extract_text(p_elem) -> str:
    """Return the text of a paragraph element, stripped of outer whitespace.

    Walks the direct ``hp:run`` children of *p_elem* and concatenates the
    content of each run's ``hp:t`` children in document order. A run that has
    a direct ``hp:tab`` child contributes one separator after its text.

    Args:
        p_elem: An ElementTree element for a paragraph (``hp:p``).

    Returns:
        The concatenated text, or ``''`` when the paragraph has no text.
    """
    tab_tag = f"{{{NS['hp']}}}tab"
    parts = []
    for run in p_elem.findall('hp:run', NS):
        for t in run.findall('hp:t', NS):
            if t.text:
                parts.append(t.text)
            # Element.text only covers the text before the first child
            # element; text that follows an inline child is stored on that
            # child's .tail and would otherwise be lost.
            # NOTE(review): assumes hp:t may carry inline children such as
            # hp:tab -- confirm against the HWPX schema. When it has none,
            # this loop is a no-op.
            for inline in t:
                if inline.tag == tab_tag:
                    parts.append(' ')
                if inline.tail:
                    parts.append(inline.tail)
        if run.findall('hp:tab', NS):
            parts.append(' ')
    return ''.join(parts).strip()
|
|
|
|
|
|
def _detect_structure(text: str):
    """Classify a paragraph by its leading numbering or bullet glyph.

    Args:
        text: Paragraph text (may be empty).

    Returns:
        A ``(kind, level, formatted)`` tuple. *kind* is ``'heading'``,
        ``'bullet'`` or ``'paragraph'``; *level* is the heading depth and is 0
        for anything that is not a heading; *formatted* is *text* unchanged,
        except that text starting with ``※`` is prefixed with ``'> '``.
    """
    if not text:
        return 'paragraph', 0, text

    # Checked top to bottom; the first matching rule decides the result, so
    # the deeper numbering patterns must precede the shallower ones.
    rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in rules:
        if re.match(pattern, text):
            return kind, level, text

    # Note marker: rendered as a Markdown block quote.
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
|
|
|
|
|
|
def _cell_text(tc_elem) -> str:
    """Return the text of a table cell, one ``<br>``-separated part per paragraph.

    Looks at the ``hp:p`` children of each direct ``hp:subList`` child of
    *tc_elem*. Paragraphs that contain a table anywhere beneath them are
    skipped entirely, and paragraphs with no text contribute nothing.

    Args:
        tc_elem: An ElementTree element for a table cell (``hp:tc``).

    Returns:
        The non-empty paragraph texts joined with ``<br>``; ``''`` if none.
    """
    flat_paragraphs = (
        para
        for sub_list in tc_elem.findall('hp:subList', NS)
        for para in sub_list.findall('hp:p', NS)
        if para.find('.//hp:tbl', NS) is None
    )
    texts = (_extract_text(para) for para in flat_paragraphs)
    return '<br>'.join(piece for piece in texts if piece)
|
|
|
|
|
|
def _get_span(tc_elem):
    """Return ``(col_span, row_span)`` for a table cell element.

    The ``colSpan`` / ``rowSpan`` attributes on the cell itself are read
    first (default 1). If the cell has an ``hp:cellSpan`` child, that child's
    attributes take precedence, falling back to the cell-level values.

    Raises:
        ValueError: If a span attribute is not an integer string.
    """
    col_span, row_span = (int(tc_elem.get(attr, 1)) for attr in ('colSpan', 'rowSpan'))

    cell_span = tc_elem.find('hp:cellSpan', NS)
    if cell_span is None:
        return col_span, row_span

    col_span = int(cell_span.get('colSpan', col_span))
    row_span = int(cell_span.get('rowSpan', row_span))
    return col_span, row_span
|
|
|
|
|
|
def _extract_table(tbl_elem) -> str:
    """Render a table element as Markdown, or as HTML when cells are merged.

    Rows are the direct ``hp:tr`` children of *tbl_elem* and cells are the
    direct ``hp:tc`` children of each row; rows without cells are ignored.

    * If any cell has a column or row span greater than 1, the result is an
      HTML ``<table>`` whose first row uses ``<th>`` and the rest ``<td>``.
      Cell text is inserted as-is (no HTML escaping).
    * Otherwise the result is a pipe table whose first row is the header.
      Short rows are padded with empty cells, ``|`` is backslash-escaped and
      newlines become spaces.

    Returns:
        The rendered table, or ``''`` when the table has no cells.
    """
    grid = []
    for row_elem in tbl_elem.findall('hp:tr', NS):
        row = [
            (*_get_span(cell), _cell_text(cell))
            for cell in row_elem.findall('hp:tc', NS)
        ]
        if row:
            grid.append(row)

    if not grid:
        return ''

    merged = any(cs > 1 or rs > 1 for row in grid for cs, rs, _ in row)

    if merged:
        out = ['<table>']
        for row_index, row in enumerate(grid):
            cell_tag = 'td' if row_index else 'th'
            out.append('<tr>')
            for cs, rs, text in row:
                attrs = ''
                if cs > 1:
                    attrs += f' colspan="{cs}"'
                if rs > 1:
                    attrs += f' rowspan="{rs}"'
                out.append(f'<{cell_tag}{attrs}>{text}</{cell_tag}>')
            out.append('</tr>')
        out.append('</table>')
        return '\n'.join(out)

    width = max(len(row) for row in grid)
    # Pad every row to the widest one so the pipe table stays rectangular.
    table = [
        [text for _, _, text in row] + [''] * (width - len(row))
        for row in grid
    ]

    def escape(value):
        return value.replace('|', '\\|').replace('\n', ' ')

    def render(values):
        return '| ' + ' | '.join(values) + ' |'

    header, *body = table
    out = [render(escape(c) for c in header), render(['---'] * width)]
    out.extend(render(escape(c) for c in row) for row in body)
    return '\n'.join(out)
|
|
|
|
|
|
def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one paragraph element into zero or more Markdown lines.

    Precedence: a table anywhere inside the paragraph wins; otherwise a
    picture anywhere inside it; otherwise the paragraph text. Only the first
    table / picture found is rendered, and text sharing a paragraph with one
    of them is not emitted.

    Args:
        p_elem: An ElementTree element for a paragraph (``hp:p``).
        pic_counter: One-element list used as a mutable running picture
            index; incremented for every picture paragraph seen.
        id_to_file: Mapping from binary item id to extracted image file name.
        base_name: Stem of the document; images are referenced relative to
            the Markdown file as ``{base_name}_images/<file>``.

    Returns:
        A list with the rendered line, or an empty list when the paragraph
        produces no output.
    """
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        img_elem = pic.find('.//hc:img', NS)
        filename = ''
        if img_elem is not None:
            filename = id_to_file.get(img_elem.get('binaryItemIDRef', ''), '')
        if filename:
            # Previously an empty string was returned here, which the caller
            # discards, so extracted images never appeared in the Markdown.
            return [f'![image_{idx}]({base_name}_images/{filename})']
        # Picture whose binary could not be resolved: keep a visible marker
        # (image syntax with an empty target) instead of dropping it.
        return [f'![image_{idx}]()']

    text = _extract_text(p_elem)
    if not text:
        return []

    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f"{'#' * level} {fmt}"]
    if kind == 'bullet':
        # The regex stays outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12, and
        # the former doubled backslash in a raw string required a literal
        # '\' after the glyph, so the bullet marker was never stripped.
        item = re.sub(r'^[▪▫\-]\s*', '', fmt)
        return [f'- {item}']
    return [fmt]
|
|
|
|
|
|
def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """Convert an HWPX file to Markdown and report the outcome as a dict.

    Writes ``<stem>.md`` into *output_dir* and extracts the archive's
    ``BinData/`` files into ``<output_dir>/<stem>_images``.

    Args:
        hwpx_path: Path to the ``.hwpx`` file.
        output_dir: Directory for the Markdown file and images; created if
            missing.

    Returns:
        A dict with keys ``status`` (``'ok'`` or ``'error'``), ``input``,
        ``output``, ``format`` (always ``'hwpx'``) and ``images`` (paths of
        the extracted files). On failure ``error`` holds the exception text.

    Any exception raised while reading, parsing or writing is captured in the
    returned dict rather than propagated. Creating *output_dir* happens
    before that guard and may still raise.
    """
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'

    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }
    try:
        md_lines: list[str] = []
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / f) for f in id_to_file.values()]

            # Order sections by their numeric index. A plain string sort
            # would put section10.xml before section2.xml. fullmatch also
            # rejects look-alike names such as 'section0.xml.bak'.
            section_re = re.compile(r'Contents/section(\d+)\.xml')
            numbered = []
            for member in zf.namelist():
                found = section_re.fullmatch(member)
                if found:
                    numbered.append((int(found.group(1)), member))
            section_files = [member for _, member in sorted(numbered)]

            # Lines starting with one of these are block-level constructs
            # (heading, HTML table, pipe table, image).
            block_prefixes = ('#', '<table', '|', '![')
            pic_counter = [0]
            for sec_file in section_files:
                root = ET.fromstring(zf.read(sec_file))
                for p_elem in root.findall('hp:p', NS):
                    # NOTE(review): this skips the whole paragraph, including
                    # any text runs it also carries -- confirm that is intended.
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue
                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
                        if not line:
                            continue
                        # Block-level lines get a blank line before them
                        # unless one is already there.
                        if line.startswith(block_prefixes) and md_lines and md_lines[-1] != '':
                            md_lines.append('')
                        md_lines.append(line)
                        md_lines.append('')

        # Collapse runs of blank lines to a single blank line.
        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        # Top-level boundary: report the failure to the caller via the dict.
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|