feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)
- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드 - converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링 - converters/hwp.py: COM 자동화 + pyhwp fallback - converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출 - converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표 - converters/html.py: html2text (body_width=0) - requirements.txt: 최소 의존성 - .env.example: 환경변수 템플릿 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
31
converters/html.py
Normal file
31
converters/html.py
Normal file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HTML / HTM → Markdown (html2text, body_width=0)"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _read_html_text(html_path: Path) -> str:
    """Decode the HTML file at *html_path* into text.

    Decoding order:
      1. Strict UTF-8 (a leading BOM is stripped).
      2. The charset declared near the top of the document, if any.
      3. UTF-8 with undecodable bytes dropped (last-resort best effort).
    """
    import codecs
    import re

    raw = html_path.read_bytes()
    try:
        # utf-8-sig strips a BOM so it does not leak into the Markdown output.
        return raw.decode('utf-8-sig')
    except UnicodeDecodeError:
        pass

    # Not valid UTF-8: honour a declared charset, e.g. <meta charset="euc-kr">
    # or <meta http-equiv="Content-Type" content="text/html; charset=euc-kr">.
    declared = re.search(
        rb'charset\s*=\s*["\']?\s*([A-Za-z0-9_.:-]+)', raw[:4096], re.IGNORECASE
    )
    if declared:
        try:
            codec = codecs.lookup(declared.group(1).decode('ascii')).name
            if codec == 'euc_kr':
                # Browsers treat the "euc-kr" label as its cp949 superset;
                # the strict euc_kr codec rejects the extended syllables.
                codec = 'cp949'
            return raw.decode(codec)
        except (LookupError, UnicodeDecodeError):
            # Unknown, non-text, or wrong declaration: fall through.
            pass

    # Nothing else worked; drop the undecodable bytes rather than fail.
    return raw.decode('utf-8', errors='ignore')


def convert_html(html_path: Path, output_dir: Path) -> dict:
    """Convert an HTML file to Markdown and return an AGENT_GUIDE-spec dict.

    The Markdown is written to ``output_dir/<input stem>.md`` (UTF-8);
    ``output_dir`` is created if it does not exist.

    Args:
        html_path: Path of the HTML/HTM file to convert.
        output_dir: Directory that receives the generated ``.md`` file.

    Returns:
        A dict with keys ``status`` ("ok" or "error"), ``input``, ``output``
        and ``format`` ("html"). On failure ``status`` is "error" and an
        ``error`` key carries the exception message.
    """
    html_path = Path(html_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{html_path.stem}.md'

    result = {
        "status": "ok", "input": str(html_path),
        "output": str(md_path), "format": "html",
    }
    try:
        # Imported lazily so a missing dependency is reported in the result
        # dict instead of breaking the import of this module.
        import html2text

        h = html2text.HTML2Text()
        h.body_width = 0  # 0 disables hard line wrapping
        h.ignore_links = False
        h.ignore_images = False
        content = _read_html_text(html_path)
        md = h.handle(content)
        md_path.write_text(md, encoding='utf-8')
    except Exception as e:
        # Boundary of the converter: every failure becomes an error result.
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|
||||
Reference in New Issue
Block a user