feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)
- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드 - converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링 - converters/hwp.py: COM 자동화 + pyhwp fallback - converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출 - converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표 - converters/html.py: html2text (body_width=0) - requirements.txt: 최소 의존성 - .env.example: 환경변수 템플릿 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
188
converters/hml.py
Normal file
188
converters/hml.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
    """Decode embedded Base64 images to disk and record picture order.

    Args:
        tree: Parsed HML root element (``xml.etree.ElementTree.Element``).
        images_dir: Directory that receives the decoded files. It is created
            (with parents) if it does not exist.

    Returns:
        A ``(id_to_file, bin_order)`` pair.

        ``id_to_file`` maps the ``Id`` attribute of each successfully decoded
        ``BINDATA`` element to the file name written under ``images_dir``.

        ``bin_order`` holds one entry per ``PICTURE`` found under ``BODY``,
        in document order: the ``BinItem`` attribute of the picture's first
        ``IMAGE`` descendant, or ``None`` when it has no ``IMAGE``.
    """
    images_dir.mkdir(parents=True, exist_ok=True)

    # Each BINITEM names the BINDATA entry it describes and that entry's format.
    bin_format = {
        item.get('BinData'): item.get('Format', 'PNG').lower()
        for item in tree.findall('.//BINITEM')
        if item.get('BinData')
    }

    id_to_file = {}
    for bindata in tree.findall('.//BINDATA'):
        bid = bindata.get('Id')
        raw = (bindata.text or '').strip()
        if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
            continue

        try:
            number = int(bid)
        except (TypeError, ValueError):
            # A missing or non-numeric Id cannot be formatted into a file
            # name; skip this entry rather than abort the whole extraction.
            continue

        # The format string comes from the document itself. Keep only
        # characters that are safe in a file extension so that it cannot
        # smuggle path separators into the output file name.
        fmt = re.sub(r'[^a-z0-9]', '', bin_format.get(bid, 'png')) or 'png'
        filename = f'BIN{number:04d}.{fmt}'

        try:
            (images_dir / filename).write_bytes(base64.b64decode(raw))
        except (ValueError, OSError):
            # Best effort: malformed Base64 (binascii.Error is a ValueError)
            # or a failed write drops this one image only.
            continue
        id_to_file[bid] = filename

    bin_order = []
    body = tree.find('.//BODY')
    if body is not None:
        for pic in body.findall('.//PICTURE'):
            image = pic.find('.//IMAGE')
            bin_order.append(image.get('BinItem') if image is not None else None)
    return id_to_file, bin_order
|
||||
|
||||
|
||||
def _extract_text(p_elem) -> str:
    """Return the plain text of a paragraph element.

    Only the direct ``TEXT`` children of ``p_elem`` are inspected. Within
    each, a ``CHAR`` child contributes its text (when non-empty) and a
    ``TAB`` child contributes a single space; every other child is ignored.
    The concatenated result is stripped of surrounding whitespace.
    """
    def fragments():
        for text_node in p_elem.findall('TEXT'):
            for node in text_node:
                if node.tag == 'TAB':
                    yield ' '
                elif node.tag == 'CHAR' and node.text:
                    yield node.text

    return ''.join(fragments()).strip()
|
||||
|
||||
|
||||
def _detect_structure(text: str):
    """Classify a paragraph by its leading marker.

    Returns a ``(kind, level, text)`` tuple where ``kind`` is ``'heading'``,
    ``'bullet'`` or ``'paragraph'``. ``level`` is the heading depth (2-4) and
    is 0 for bullets and paragraphs. ``text`` is returned unchanged except
    for a paragraph starting with ``※``, which is prefixed with ``'> '``.

    Rules are tried in order and the first match wins, so the more specific
    numbering patterns must stay ahead of the shorter ones.
    """
    if not text:
        return 'paragraph', 0, text

    ordered_rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in ordered_rules:
        if re.match(pattern, text):
            return kind, level, text

    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
|
||||
|
||||
|
||||
def _extract_table(table_elem) -> str:
    """Render an HML ``TABLE`` element as a Markdown or HTML table.

    A table in which any cell has ``ColSpan`` or ``RowSpan`` greater than 1
    is emitted as an HTML ``<table>`` (first row as ``<th>``), because
    Markdown pipe tables cannot express merged cells. Otherwise a Markdown
    pipe table is emitted, with the first row as the header.

    Paragraphs inside a cell are joined with ``<br>``; empty paragraphs are
    dropped. Rows without any ``CELL`` are skipped.

    Args:
        table_elem: The ``TABLE`` element. Its optional ``ColCount``
            attribute sets the minimum column count of the Markdown form.

    Returns:
        The rendered table, or ``''`` when the table has no cells.

    NOTE(review): ``.//ROW`` and ``.//P`` are descendant searches, so rows
    and paragraphs of a table nested inside a cell are picked up as well.
    Confirm whether nested tables need separate handling.
    """
    col_count = int(table_elem.get('ColCount', 0))
    has_merge = False
    raw_rows = []
    for row_elem in table_elem.findall('.//ROW'):
        cells = []
        for position, cell_elem in enumerate(row_elem.findall('CELL')):
            cs = int(cell_elem.get('ColSpan', 1))
            rs = int(cell_elem.get('RowSpan', 1))
            # Without ColAddr, fall back to the cell's position in the row;
            # a fixed default of 0 would make every cell overwrite the
            # previous one in the column grid below.
            ca = int(cell_elem.get('ColAddr', position))
            if cs > 1 or rs > 1:
                has_merge = True
            parts = [_extract_text(p) for p in cell_elem.findall('.//P')]
            cells.append((ca, cs, rs, [part for part in parts if part]))
        if cells:
            raw_rows.append(cells)

    if not raw_rows:
        return ''

    if has_merge:
        def html_text(s):
            # Cell text lands inside raw HTML, so markup characters must be
            # escaped. '&' goes first to avoid double-escaping.
            return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

        lines = ['<table>']
        for ri, cells in enumerate(raw_rows):
            tag = 'th' if ri == 0 else 'td'
            lines.append('<tr>')
            for _, cs, rs, parts in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                text = '<br>'.join(html_text(part) for part in parts)
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows = []
    for cells in raw_rows:
        grid = {ca: '<br>'.join(parts) for ca, _, _, parts in cells}
        # Never narrower than the highest addressed column, so a cell is
        # not dropped when ColCount understates the real width.
        n = max(col_count, max(grid) + 1)
        rows.append([grid.get(i, '') for i in range(n)])

    # Pad ragged rows so that every row has the same number of columns.
    mc = max(len(r) for r in rows)
    for r in rows:
        r += [''] * (mc - len(r))

    def esc(s):
        # A literal pipe would end the cell; a newline would end the row.
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = [
        '| ' + ' | '.join(esc(c) for c in rows[0]) + ' |',
        '| ' + ' | '.join(['---'] * mc) + ' |',
    ]
    for row in rows[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)
|
||||
|
||||
|
||||
def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one paragraph element into Markdown lines.

    Tables and pictures found as direct children of the paragraph's ``TEXT``
    elements are rendered first. When the paragraph contains at least one of
    them, its plain text is not emitted. Otherwise the text is classified
    with ``_detect_structure`` and rendered as a heading, bullet or plain
    paragraph.

    Args:
        p_elem: The ``P`` element to convert.
        pic_counter: One-element list holding the running picture index. It
            is incremented in place for every ``PICTURE`` encountered so the
            count carries across calls.
        bin_order: Per-picture image ids in document order, as produced by
            ``_extract_images``.
        id_to_file: Maps an image id to its extracted file name.
        base_name: Document stem; images are referenced under
            ``{base_name}_images/``.

    Returns:
        The Markdown lines for this paragraph (possibly empty).
    """
    lines = []
    has_content = False
    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'TABLE':
                has_content = True
                md = _extract_table(child)
                if md:
                    lines.append(md)
            elif child.tag == 'PICTURE':
                has_content = True
                idx = pic_counter[0]
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
                # Fall back to a placeholder name when no file was extracted
                # for this picture.
                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
                # Emit a Markdown image so the reference is actually used;
                # an empty string here would silently drop the picture.
                lines.append(f'![그림 {idx + 1}]({ref})')

    if has_content:
        return lines

    text = _extract_text(p_elem)
    if not text:
        return lines

    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        lines.append(f'{"#" * level} {fmt}')
    elif kind == 'bullet':
        # Strip the source bullet marker before adding the Markdown one.
        # Done outside the f-string: backslashes inside f-string expressions
        # are a syntax error before Python 3.12.
        stripped = re.sub(r'^[▪▫\-]\s*', '', fmt)
        lines.append(f'- {stripped}')
    else:
        lines.append(fmt)
    return lines
|
||||
|
||||
|
||||
def convert_hml(hml_path: Path, output_dir: Path) -> dict:
    """Convert an HML file to Markdown and return the AGENT_GUIDE-spec dict.

    The Markdown is written to ``<output_dir>/<stem>.md`` and embedded images
    to ``<output_dir>/<stem>_images/``. ``output_dir`` is created if needed.

    Args:
        hml_path: Path of the HML (XML) document, read as UTF-8 with an
            optional BOM.
        output_dir: Directory that receives the Markdown file and images.

    Returns:
        A dict with the keys ``status`` (``'ok'`` or ``'error'``), ``input``,
        ``output``, ``format`` (always ``'hml'``) and ``images`` (paths of the
        extracted image files). On failure ``status`` is ``'error'`` and an
        ``error`` key carries the message; exceptions raised during parsing
        and conversion are reported this way instead of propagating.
    """
    source = Path(hml_path)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    stem = source.stem
    md_path = target_dir / f'{stem}.md'
    images_dir = target_dir / f'{stem}_images'

    result = {
        "status": "ok",
        "input": str(source),
        "output": str(md_path),
        "format": "hml",
        "images": [],
    }

    try:
        root = ET.fromstring(source.read_text(encoding='utf-8-sig'))
        id_to_file, bin_order = _extract_images(root, images_dir)
        result["images"] = [str(images_dir / name) for name in id_to_file.values()]

        # Use the document's TITLE when present, otherwise the file stem.
        title_node = root.find('.//TITLE')
        if title_node is not None and title_node.text:
            heading = title_node.text.strip()
        else:
            heading = stem
        out_lines = [f'# {heading}', '']

        body = root.find('.//BODY')
        if body is None:
            result.update(status='error', error='BODY 요소 없음')
            return result

        # Shared mutable counter so picture numbering runs across paragraphs.
        pic_counter = [0]
        paragraphs = (
            paragraph
            for section in body.findall('.//SECTION')
            for paragraph in section.findall('P')
        )
        for paragraph in paragraphs:
            for line in _process_p(paragraph, pic_counter, bin_order, id_to_file, stem):
                if not line:
                    continue
                # A heading additionally gets a blank line in front of it.
                if line.startswith('#') and out_lines[-1] != '':
                    out_lines.append('')
                out_lines.extend((line, ''))

        # Collapse runs of blank lines left by the per-line padding.
        markdown = re.sub(r'\n{3,}', '\n\n', '\n'.join(out_lines)).strip()
        md_path.write_text(markdown, encoding='utf-8')
    except Exception as exc:
        result['status'] = 'error'
        result['error'] = str(exc)
    return result
|
||||
Reference in New Issue
Block a user