feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)

- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드
- converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링
- converters/hwp.py: COM 자동화 + pyhwp fallback
- converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출
- converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표
- converters/html.py: html2text (body_width=0)
- requirements.txt: 최소 의존성
- .env.example: 환경변수 템플릿

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 09:06:34 +09:00
parent 6f365018f5
commit 2ec2759a20
12 changed files with 1072 additions and 0 deletions
--- a/.claude/hooks/token-usage/.gitignore
+++ b/.claude/hooks/token-usage/.gitignore
@@ -0,0 +1 @@
+aptabase.json
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -0,0 +1,43 @@
+{
+  "hooks": {
+    "UserPromptSubmit": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" session-context \"$t\";rm -f \"$t\"",
+            "timeout": 5
+          }
+        ]
+      }
+    ],
+    "Stop": [
+      {
+        "hooks": [
+          {
+            "type": "command",
+            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" stop-record \"$t\";rm -f \"$t\"",
+            "timeout": 5
+          }
+        ]
+      }
+    ],
+    "PostToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" aptabase-commit \"$t\";rm -f \"$t\"",
+            "timeout": 15
+          }
+        ]
+      }
+    ]
+  },
+  "permissions": {
+    "allow": [
+      "mcp__gitea__issue_write"
+    ]
+  }
+}
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,5 @@
+# doc2md 환경변수 예시
+# 이 파일을 .env로 복사 후 값 수정
+
+# ParaWiki 등 외부 프로젝트에서 이 도구를 subprocess로 호출할 때 사용
+# DOCU_CONVERTER_PATH=D:\MYCLAUDE_PROJECT\doc2md
--- a/.usage/token/.gitignore
+++ b/.usage/token/.gitignore
@@ -0,0 +1,2 @@
+job-commit-pool.json
+job-send-pool.json
--- a/convert.py
+++ b/convert.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+doc2md — 통합 문서 변환기 (AI 에이전트용)
+사용법: python convert.py <file> -o <output_dir> [--json]
+        python convert.py --scan <dir> -o <output_dir> [--json]
+
+자세한 사용법: AGENT_GUIDE.md 참조
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SUPPORTED = {'.pdf', '.hwp', '.hwpx', '.hml', '.html', '.htm'}
+SKIP_NAMES = {'README.md', 'CLAUDE.md', 'AGENT_GUIDE.md'}
+
+
+def convert_file(src: Path, output_dir: Path) -> dict:
+    """Convert a single file and return the converter's result dict.
+
+    The converter is selected from the lower-cased file suffix and imported
+    lazily. An unsupported suffix yields a "skipped" result. Any exception
+    raised while importing or running a converter is reported as an "error"
+    result instead of propagating to the caller.
+    """
+    suffix = src.suffix.lower()
+    try:
+        if suffix == '.pdf':
+            from converters.pdf import convert_pdf as converter
+        elif suffix == '.hwp':
+            from converters.hwp import convert_hwp as converter
+        elif suffix == '.hwpx':
+            from converters.hwpx import convert_hwpx as converter
+        elif suffix == '.hml':
+            from converters.hml import convert_hml as converter
+        elif suffix in ('.html', '.htm'):
+            from converters.html import convert_html as converter
+        else:
+            return {"status": "skipped", "input": str(src), "reason": "unsupported_format"}
+        return converter(src, output_dir)
+    except Exception as exc:
+        return {"status": "error", "input": str(src), "error": str(exc)}
+
+
+def scan_and_convert(scan_dir: Path, output_dir: Path) -> dict:
+    """Recursively convert every supported document found under ``scan_dir``.
+
+    Each file is written to the sub-folder of ``output_dir`` that mirrors its
+    location relative to ``scan_dir``. Files whose name is in ``SKIP_NAMES``
+    are ignored; files that already have a sibling ``.md`` next to the source
+    are recorded as skipped.
+
+    Returns a summary dict with an overall ``status`` ("ok" when nothing
+    failed, "error" when nothing succeeded and something failed, otherwise
+    "partial"), the counters, and the per-file ``results`` list.
+    """
+    # Match on the lower-cased suffix so that e.g. "REPORT.PDF" is found on
+    # case-sensitive filesystems too (convert_file lower-cases the suffix as
+    # well), and ignore directories that merely look like documents.
+    targets = sorted(
+        path for path in scan_dir.rglob('*')
+        if path.is_file() and path.suffix.lower() in SUPPORTED
+    )
+
+    results = []
+    ok = fail = skipped = 0
+
+    for src in targets:
+        if src.name in SKIP_NAMES:
+            continue
+
+        # Skip sources that already have a Markdown sibling.
+        if src.with_suffix('.md').exists():
+            results.append({"input": str(src), "output": None,
+                            "status": "skipped", "reason": "already_md"})
+            skipped += 1
+            continue
+
+        out_dir = output_dir / src.parent.relative_to(scan_dir)
+        r = convert_file(src, out_dir)
+        results.append(r)
+        if r['status'] == 'ok':
+            ok += 1
+        elif r['status'] == 'error':
+            fail += 1
+        else:
+            skipped += 1
+
+    if fail == 0:
+        overall = "ok"
+    elif ok == 0:
+        overall = "error"
+    else:
+        overall = "partial"
+
+    return {
+        "status": overall,
+        "total": len(results),
+        "converted": ok,
+        "skipped": skipped,
+        "failed": fail,
+        "results": results,
+    }
+
+
+def main():
+    """Command-line entry point.
+
+    Runs folder-scan mode (``--scan DIR``) or single-file mode, prints the
+    result as JSON (``--json``) or as a short human-readable summary, and
+    exits with 0 on success, 1 for a partially failed scan or when no input
+    was given, and 2 on error (including a missing input file or folder).
+    """
+    parser = argparse.ArgumentParser(
+        description='doc2md — AI 에이전트용 문서 변환기',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog='자세한 사용법: AGENT_GUIDE.md'
+    )
+    parser.add_argument('file', nargs='?', help='변환할 파일')
+    parser.add_argument('-o', '--output', required=True, help='출력 폴더')
+    parser.add_argument('--scan', metavar='DIR', help='폴더 일괄 변환 모드')
+    parser.add_argument('--json', action='store_true', help='결과를 JSON으로 출력')
+    args = parser.parse_args()
+
+    output_dir = Path(args.output)
+
+    if args.scan:
+        scan_dir = Path(args.scan)
+        # A missing folder would otherwise scan nothing and report success.
+        if not scan_dir.is_dir():
+            err = {"status": "error", "input": str(scan_dir), "error": "폴더 없음"}
+            if args.json:
+                print(json.dumps(err, ensure_ascii=False))
+            else:
+                print(f"오류: 폴더 없음 — {scan_dir}", file=sys.stderr)
+            sys.exit(2)
+        result = scan_and_convert(scan_dir, output_dir)
+        exit_code = 0 if result['status'] == 'ok' else (1 if result['status'] == 'partial' else 2)
+    elif args.file:
+        src = Path(args.file)
+        if not src.exists():
+            err = {"status": "error", "input": str(src), "error": "파일 없음"}
+            if args.json:
+                print(json.dumps(err, ensure_ascii=False))
+            else:
+                print(f"오류: 파일 없음 — {src}", file=sys.stderr)
+            sys.exit(2)
+        result = convert_file(src, output_dir)
+        exit_code = 0 if result['status'] == 'ok' else 2
+    else:
+        parser.print_help()
+        sys.exit(1)
+
+    if args.json:
+        print(json.dumps(result, ensure_ascii=False, indent=2))
+    else:
+        # Human-readable output for callers that did not pass --json.
+        status = result.get('status', '')
+        if 'results' in result:
+            print(f"[doc2md] {result['converted']}개 변환 / {result['skipped']}개 스킵 / {result['failed']}개 실패")
+        else:
+            # Converters fill in "output" even when they fail, so pick the
+            # error text explicitly for failed conversions.
+            if status == 'error':
+                detail = result.get('error', '')
+            else:
+                detail = result.get('output') or result.get('reason', '')
+            print(f"[doc2md] {status.upper()} — {detail}")
+            if result.get('has_diagrams'):
+                pages = result.get('diagram_pages', [])
+                print(f"[doc2md] 다이어그램 페이지: {pages} → Vision AI 처리 필요")
+
+    sys.exit(exit_code)
+
+
+if __name__ == '__main__':
+    main()
--- a/converters/__init__.py
+++ b/converters/__init__.py
--- a/converters/hml.py
+++ b/converters/hml.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
+from __future__ import annotations
+
+import base64
+import re
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+
+def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
+    """Decode the Base64 ``BINDATA`` payloads of an HML tree into image files.
+
+    Args:
+        tree: parsed HML root element.
+        images_dir: folder to write the images into (created if missing).
+
+    Returns:
+        ``(id_to_file, bin_order)`` where ``id_to_file`` maps a BINDATA ``Id``
+        to the written file name (``BIN0001.png`` style) and ``bin_order``
+        lists, for every ``PICTURE`` under ``BODY`` in document order, the
+        ``BinItem`` of its first ``IMAGE`` (or ``None`` when it has none).
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    bin_format = {
+        item.get('BinData'): item.get('Format', 'PNG').lower()
+        for item in tree.findall('.//BINITEM') if item.get('BinData')
+    }
+    id_to_file = {}
+    for bindata in tree.findall('.//BINDATA'):
+        bid = bindata.get('Id')
+        raw = (bindata.text or '').strip()
+        if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
+            continue
+        # A missing or non-numeric Id must only drop this one image, not
+        # abort the whole document conversion.
+        try:
+            number = int(bid)
+        except (TypeError, ValueError):
+            continue
+        fmt = bin_format.get(bid, 'png')
+        filename = f'BIN{number:04d}.{fmt}'
+        try:
+            (images_dir / filename).write_bytes(base64.b64decode(raw))
+        except (ValueError, OSError):
+            # Undecodable payload (binascii.Error is a ValueError) or a
+            # failed write: best effort, skip this image.
+            continue
+        id_to_file[bid] = filename
+
+    body = tree.find('.//BODY')
+    bin_order = []
+    if body is not None:
+        for pic in body.findall('.//PICTURE'):
+            imgs = pic.findall('.//IMAGE')
+            bin_order.append(imgs[0].get('BinItem') if imgs else None)
+    return id_to_file, bin_order
+
+
+def _extract_text(p_elem) -> str:
+    """Return the text of a paragraph element.
+
+    Concatenates the text of every ``CHAR`` child of the paragraph's direct
+    ``TEXT`` children, turns each ``TAB`` into a single space, and strips the
+    surrounding whitespace of the joined result.
+    """
+    pieces = []
+    children = (node for text_elem in p_elem.findall('TEXT') for node in text_elem)
+    for node in children:
+        if node.tag == 'TAB':
+            pieces.append(' ')
+        elif node.tag == 'CHAR' and node.text:
+            pieces.append(node.text)
+    return ''.join(pieces).strip()
+
+
+def _detect_structure(text: str):
+    """Classify a paragraph by its leading numbering or marker.
+
+    Returns a ``(kind, level, text)`` tuple where ``kind`` is 'heading',
+    'bullet' or 'paragraph' and ``level`` is the heading depth (0 for
+    non-headings). A paragraph starting with '※' is returned as a Markdown
+    block quote; every other text is returned unchanged.
+    """
+    if not text:
+        return 'paragraph', 0, text
+    # First match wins, so deeper numbering is listed before shallower.
+    rules = (
+        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
+        (r'^\d+\.\d+\s', 'heading', 3),
+        (r'^\d+\.\s.+', 'heading', 2),
+        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
+        (r'^[□■]\s*.+', 'heading', 2),
+        (r'^[○●◎]\s*.+', 'heading', 3),
+        (r'^[▶▷]\s*.+', 'heading', 4),
+        (r'^[▪▫\-]\s*.+', 'bullet', 0),
+    )
+    for pattern, kind, level in rules:
+        if re.match(pattern, text):
+            return kind, level, text
+    if re.match(r'^[※]', text):
+        return 'paragraph', 0, f'> {text}'
+    return 'paragraph', 0, text
+
+
+def _extract_table(table_elem) -> str:
+    """Render an HML ``TABLE`` element as Markdown, or HTML when cells are merged.
+
+    Tables containing any cell with ``ColSpan``/``RowSpan`` above 1 are
+    emitted as an HTML ``<table>`` (first row as ``<th>``); all other tables
+    become a pipe table whose columns are placed by ``ColAddr``. Returns an
+    empty string when the table has no cells.
+    """
+    declared_cols = int(table_elem.get('ColCount', 0))
+    merged = False
+    table_rows = []
+    for row_elem in table_elem.findall('.//ROW'):
+        row_cells = []
+        for cell_elem in row_elem.findall('CELL'):
+            col_span = int(cell_elem.get('ColSpan', 1))
+            row_span = int(cell_elem.get('RowSpan', 1))
+            col_addr = int(cell_elem.get('ColAddr', 0))
+            merged = merged or col_span > 1 or row_span > 1
+            texts = [_extract_text(p) for p in cell_elem.findall('.//P')]
+            joined = '<br>'.join(t for t in texts if t)
+            row_cells.append((col_addr, col_span, row_span, joined))
+        if row_cells:
+            table_rows.append(row_cells)
+    if not table_rows:
+        return ''
+
+    if merged:
+        html = ['<table>']
+        for index, row_cells in enumerate(table_rows):
+            cell_tag = 'td' if index else 'th'
+            html.append('<tr>')
+            for _, col_span, row_span, text in row_cells:
+                attrs = ''
+                if col_span > 1:
+                    attrs += f' colspan="{col_span}"'
+                if row_span > 1:
+                    attrs += f' rowspan="{row_span}"'
+                html.append(f'<{cell_tag}{attrs}>{text}</{cell_tag}>')
+            html.append('</tr>')
+        html.append('</table>')
+        return '\n'.join(html)
+
+    # Plain grid: place each cell at its column address and pad the gaps.
+    grid_rows = []
+    for row_cells in table_rows:
+        by_col = {col_addr: text for col_addr, _, _, text in row_cells}
+        width = declared_cols if declared_cols > 0 else max(by_col) + 1
+        grid_rows.append([by_col.get(i, '') for i in range(width)])
+    widest = max(len(row) for row in grid_rows)
+    for row in grid_rows:
+        row.extend([''] * (widest - len(row)))
+
+    def render(row):
+        escaped = (cell.replace('|', '\\|').replace('\n', ' ') for cell in row)
+        return '| ' + ' | '.join(escaped) + ' |'
+
+    out = [render(grid_rows[0]), '| ' + ' | '.join(['---'] * widest) + ' |']
+    out.extend(render(row) for row in grid_rows[1:])
+    return '\n'.join(out)
+
+
+def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list[str]:
+    """Convert one HML paragraph into Markdown lines.
+
+    Tables and pictures found directly under the paragraph's ``TEXT``
+    children are rendered first; when the paragraph has neither, its text is
+    emitted as a heading, bullet or plain paragraph.
+
+    Args:
+        p_elem: the ``P`` element.
+        pic_counter: one-element list holding the running picture index;
+            incremented in place for every picture.
+        bin_order: BinItem id per picture, in document order.
+        id_to_file: BinItem id to extracted image file name.
+        base_name: document stem, used for the ``<stem>_images/`` prefix.
+
+    Returns:
+        The Markdown lines for this paragraph (possibly empty).
+    """
+    lines = []
+    has_content = False
+    for text_elem in p_elem.findall('TEXT'):
+        for child in text_elem:
+            if child.tag == 'TABLE':
+                has_content = True
+                md = _extract_table(child)
+                if md:
+                    lines.append(md)
+            elif child.tag == 'PICTURE':
+                has_content = True
+                idx = pic_counter[0]
+                pic_counter[0] += 1
+                bid = bin_order[idx] if idx < len(bin_order) else None
+                filename = id_to_file.get(bid, '') if bid else ''
+                # Fall back to a placeholder name when the image was not extracted.
+                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
+                lines.append(f'![그림 {idx+1}]({ref})')
+    if not has_content:
+        text = _extract_text(p_elem)
+        if text:
+            kind, level, fmt = _detect_structure(text)
+            if kind == 'heading':
+                lines.append(f'{"#" * level} {fmt}')
+            elif kind == 'bullet':
+                # Strip the original marker outside the f-string: a backslash
+                # inside an f-string expression is a SyntaxError before
+                # Python 3.12, and the previous doubled "\\s" demanded a
+                # literal backslash, so the marker was never removed.
+                item = re.sub(r'^[▪▫\-]\s*', '', fmt)
+                lines.append(f'- {item}')
+            else:
+                lines.append(fmt)
+    return lines
+
+
+def convert_hml(hml_path: Path, output_dir: Path) -> dict:
+    """Convert an HML file to ``<stem>.md`` inside ``output_dir``.
+
+    Embedded images are written to ``<stem>_images``. Returns the result dict
+    (status, input, output, format, images); failures are reported through
+    ``status == 'error'`` and an ``error`` message rather than raised.
+    """
+    hml_path = Path(hml_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    stem = hml_path.stem
+    md_path = output_dir / f'{stem}.md'
+    images_dir = output_dir / f'{stem}_images'
+
+    result = {
+        "status": "ok", "input": str(hml_path),
+        "output": str(md_path), "format": "hml", "images": [],
+    }
+    try:
+        tree = ET.fromstring(hml_path.read_text(encoding='utf-8-sig'))
+        id_to_file, bin_order = _extract_images(tree, images_dir)
+        result["images"] = [str(images_dir / name) for name in id_to_file.values()]
+
+        # Use the document title when present, otherwise the file stem.
+        title_elem = tree.find('.//TITLE')
+        if title_elem is not None and title_elem.text:
+            doc_title = title_elem.text.strip()
+        else:
+            doc_title = stem
+        md_lines = [f'# {doc_title}', '']
+
+        body = tree.find('.//BODY')
+        if body is None:
+            result['status'] = 'error'
+            result['error'] = 'BODY 요소 없음'
+            return result
+
+        pic_counter = [0]
+        for section in body.findall('.//SECTION'):
+            for p_elem in section.findall('P'):
+                for line in _process_p(p_elem, pic_counter, bin_order, id_to_file, stem):
+                    if not line:
+                        continue
+                    # Headings get a blank line in front of them as well.
+                    if line.startswith('#') and md_lines and md_lines[-1] != '':
+                        md_lines.append('')
+                    md_lines.extend((line, ''))
+
+        # Collapse runs of blank lines before writing.
+        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
+        md_path.write_text(final, encoding='utf-8')
+    except Exception as exc:
+        result['status'] = 'error'
+        result['error'] = str(exc)
+    return result
--- a/converters/html.py
+++ b/converters/html.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+"""HTML / HTM → Markdown (html2text, body_width=0)"""
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def _read_html_text(html_path: Path) -> str:
+    """Read an HTML file, trying UTF-8 (with or without BOM) and then CP949."""
+    for encoding in ('utf-8-sig', 'cp949'):
+        try:
+            return html_path.read_text(encoding=encoding)
+        except UnicodeDecodeError:
+            continue
+    # Last resort: keep whatever decodes as UTF-8.
+    return html_path.read_text(encoding='utf-8', errors='ignore')
+
+
+def convert_html(html_path: Path, output_dir: Path) -> dict:
+    """Convert an HTML/HTM file to ``<stem>.md`` inside ``output_dir``.
+
+    Uses html2text with line wrapping disabled and links/images kept. The
+    input is decoded as UTF-8 first and as CP949 when that fails, so legacy
+    Korean pages are not silently emptied of their non-ASCII text.
+
+    Returns the result dict (status, input, output, format); failures are
+    reported through ``status == 'error'`` and an ``error`` message.
+    """
+    html_path = Path(html_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    md_path = output_dir / f'{html_path.stem}.md'
+
+    result = {
+        "status": "ok", "input": str(html_path),
+        "output": str(md_path), "format": "html",
+    }
+    try:
+        import html2text
+        h = html2text.HTML2Text()
+        h.body_width = 0
+        h.ignore_links = False
+        h.ignore_images = False
+        content = _read_html_text(html_path)
+        md = h.handle(content)
+        md_path.write_text(md, encoding='utf-8')
+    except Exception as e:
+        result['status'] = 'error'
+        result['error'] = str(e)
+    return result
--- a/converters/hwp.py
+++ b/converters/hwp.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
+from __future__ import annotations
+
+import re
+import shutil
+import tempfile
+from pathlib import Path
+
+
+def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
+    """Save a HWP document as HML through Hancom Office COM automation.
+
+    The COM work runs in a daemon thread that is waited on for at most
+    ``timeout`` seconds. Returns True only when the worker finished in time
+    and ``hml_path`` exists afterwards; returns False when pywin32 is not
+    importable, the document could not be opened, COM raised, or the wait
+    timed out.
+
+    Diagnostics go to stderr so that the ``--json`` result printed on stdout
+    stays parseable.
+    """
+    import sys
+    import threading
+    result = [False]
+
+    def _run():
+        try:
+            import pythoncom
+            import win32com.client
+        except ImportError:
+            return
+        hwp = None
+        try:
+            pythoncom.CoInitialize()
+            hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
+            try:
+                # Best effort: registration may fail where the module is absent.
+                hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
+            except Exception:
+                pass
+            ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
+            if not ok:
+                return
+            hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
+            result[0] = hml_path.exists()
+        except Exception as e:
+            print(f'  COM 오류: {e}', file=sys.stderr)
+        finally:
+            if hwp is not None:
+                try:
+                    hwp.Quit()
+                except Exception:
+                    pass
+            try:
+                pythoncom.CoUninitialize()
+            except Exception:
+                pass
+
+    t = threading.Thread(target=_run, daemon=True)
+    t.start()
+    t.join(timeout)
+    if t.is_alive():
+        print(f'  COM 타임아웃 ({timeout}초) — pyhwp로 전환', file=sys.stderr)
+    return result[0]
+
+
+def _table_to_md(table_elem) -> str:
+    """Render a parsed ``<table>`` element as Markdown, or HTML when cells are merged.
+
+    ``table_elem`` is expected to offer the BeautifulSoup tag API used here
+    (``find_all``, ``get``, ``get_text``). Tables containing any cell with
+    ``colspan``/``rowspan`` above 1 are emitted as an HTML ``<table>`` (first
+    row as ``<th>``); all others become a pipe table. Returns an empty string
+    when the table has no cells.
+    """
+    # Prefer the table's direct rows; fall back to any descendant rows.
+    rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
+    if not rows:
+        return ''
+    has_merge = False
+    parsed = []
+    for tr in rows:
+        cells = []
+        for td in tr.find_all(['td', 'th']):
+            cs = int(td.get('colspan', 1))
+            rs = int(td.get('rowspan', 1))
+            if cs > 1 or rs > 1:
+                has_merge = True
+            cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
+        if cells:
+            parsed.append(cells)
+    if not parsed:
+        return ''
+    if has_merge:
+        lines = ['<table>']
+        for ri, cells in enumerate(parsed):
+            lines.append('<tr>')
+            tag = 'th' if ri == 0 else 'td'
+            for cs, rs, text in cells:
+                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
+                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
+            lines.append('</tr>')
+        lines.append('</table>')
+        return '\n'.join(lines)
+
+    rows_text = [[text for _, _, text in cells] for cells in parsed]
+    mc = max(len(r) for r in rows_text)
+    for r in rows_text:
+        r += [''] * (mc - len(r))
+
+    def esc(s):
+        # A raw newline would end the pipe-table row, so flatten it too.
+        return s.replace('|', '\\|').replace('\n', ' ')
+
+    lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
+             '| ' + ' | '.join(['---'] * mc) + ' |']
+    for row in rows_text[1:]:
+        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
+    return '\n'.join(lines)
+
+
+def _detect_structure(text: str):
+    """Classify a paragraph by its leading numbering or marker.
+
+    Returns a ``(kind, level, text)`` tuple where ``kind`` is 'heading',
+    'bullet' or 'paragraph' and ``level`` is the heading depth (0 for
+    non-headings). A paragraph starting with '※' is returned as a Markdown
+    block quote; every other text is returned unchanged.
+    """
+    if not text:
+        return 'paragraph', 0, text
+    # First match wins, so deeper numbering is listed before shallower.
+    rules = (
+        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
+        (r'^\d+\.\d+\s', 'heading', 3),
+        (r'^\d+\.\s.+', 'heading', 2),
+        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
+        (r'^[□■]\s*.+', 'heading', 2),
+        (r'^[○●◎]\s*.+', 'heading', 3),
+        (r'^[▶▷]\s*.+', 'heading', 4),
+        (r'^[▪▫\-]\s*.+', 'bullet', 0),
+    )
+    for pattern, kind, level in rules:
+        if re.match(pattern, text):
+            return kind, level, text
+    if re.match(r'^[※]', text):
+        return 'paragraph', 0, f'> {text}'
+    return 'paragraph', 0, text
+
+
+def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
+    """Convert a HWP file to Markdown via pyhwp's XHTML export.
+
+    pyhwp writes ``index.xhtml`` (plus a ``bindata`` folder, when present)
+    into a temporary directory; the XHTML is then walked with BeautifulSoup
+    and emitted as Markdown to ``output_path``. Files from ``bindata`` are
+    copied to ``<base_name>_images`` next to the output.
+
+    Returns True on success and False when pyhwp/bs4 are not installed, no
+    XHTML was produced, or any step raised. Diagnostics go to stderr so that
+    the ``--json`` result printed on stdout stays parseable.
+    """
+    import sys
+    try:
+        from hwp5.hwp5html import HTMLTransform
+        from hwp5.xmlmodel import Hwp5File
+        from bs4 import BeautifulSoup
+    except ImportError as e:
+        print(f'  pyhwp/bs4 미설치: {e}', file=sys.stderr)
+        return False
+
+    tmp_dir = Path(tempfile.mkdtemp())
+    try:
+        f = Hwp5File(str(hwp_path))
+        HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
+        xhtml_path = tmp_dir / 'index.xhtml'
+        if not xhtml_path.exists():
+            return False
+
+        images_dir = output_path.parent / f'{base_name}_images'
+        images_dir.mkdir(parents=True, exist_ok=True)
+        img_map = {}
+        bindata_dir = tmp_dir / 'bindata'
+        if bindata_dir.exists():
+            for img in bindata_dir.iterdir():
+                shutil.copy(img, images_dir / img.name)
+                img_map[img.name] = img.name
+
+        soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
+        # Drop page header/footer areas before walking the body.
+        for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
+            area.decompose()
+
+        md_lines = []
+        img_counter = [0]
+        for elem in soup.find_all(['p', 'table']):
+            # Anything nested in a table is rendered with that table.
+            if elem.find_parent('table'):
+                continue
+            if elem.name == 'table':
+                # Tables wrapped in a <p> are handled with their paragraph.
+                if not elem.find_parent('p'):
+                    md = _table_to_md(elem)
+                    if md:
+                        md_lines += [md, '']
+            elif elem.name == 'p':
+                for img in elem.find_all('img'):
+                    fn = Path(img.get('src', '')).name
+                    if fn in img_map:
+                        img_counter[0] += 1
+                        md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
+                inner = elem.find('table')
+                if inner:
+                    md = _table_to_md(inner)
+                    if md:
+                        md_lines += [md, '']
+                    continue
+                text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
+                if not text:
+                    continue
+                kind, level, fmt = _detect_structure(text)
+                if kind == 'heading':
+                    if md_lines and md_lines[-1] != '':
+                        md_lines.append('')
+                    md_lines += [f'{"#" * level} {fmt}', '']
+                elif kind == 'bullet':
+                    # Strip the original marker outside the f-string: a
+                    # backslash inside an f-string expression is a SyntaxError
+                    # before Python 3.12, and the previous doubled "\\s"
+                    # demanded a literal backslash, so the marker stayed.
+                    item = re.sub(r'^[▪▫\-]\s*', '', fmt)
+                    md_lines.append(f'- {item}')
+                else:
+                    md_lines += [fmt, '']
+
+        output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
+        return True
+    except Exception as e:
+        print(f'  pyhwp 오류: {e}', file=sys.stderr)
+        return False
+    finally:
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
+    """Convert a HWP file to ``<stem>.md`` inside ``output_dir``.
+
+    First tries COM automation (HWP -> temporary HML -> Markdown through the
+    HML converter); when that is unavailable or fails, falls back to pyhwp.
+
+    Returns the result dict (status, input, output, format, and ``images``
+    when the HML path succeeded); failures are reported through
+    ``status == 'error'`` and an ``error`` message rather than raised.
+    """
+    hwp_path = Path(hwp_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    md_path = output_dir / f'{hwp_path.stem}.md'
+
+    result = {
+        "status": "ok", "input": str(hwp_path),
+        "output": str(md_path), "format": "hwp",
+    }
+    try:
+        hml_path = md_path.with_suffix('.hml')
+        if _com_hwp_to_hml(hwp_path, hml_path):
+            hml_result = None
+            try:
+                from converters.hml import convert_hml
+                hml_result = convert_hml(hml_path, output_dir)
+            except Exception:
+                # Any failure here simply falls through to pyhwp.
+                hml_result = None
+            finally:
+                # The intermediate HML is never part of the output.
+                hml_path.unlink(missing_ok=True)
+            if hml_result is not None and hml_result.get('status') == 'ok':
+                # Surface the images extracted during the HML pass.
+                result['images'] = hml_result.get('images', [])
+                return result
+
+        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
+            return result
+
+        result['status'] = 'error'
+        result['error'] = 'COM + pyhwp 모두 실패'
+    except Exception as e:
+        result['status'] = 'error'
+        result['error'] = str(e)
+    return result
--- a/converters/hwpx.py
+++ b/converters/hwpx.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
+from __future__ import annotations
+
+import re
+import zipfile
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+NS = {
+    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
+    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
+}
+
+
+def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
+    """Copy every file stored under ``BinData/`` in the archive to ``images_dir``.
+
+    Files are written flat under their base name, so archive paths cannot
+    escape ``images_dir``.
+
+    Returns a dict mapping each file's stem (the id referenced from the
+    section XML) to its file name.
+    """
+    images_dir.mkdir(parents=True, exist_ok=True)
+    id_to_file = {}
+    for name in zf.namelist():
+        # Skip directory entries: Path('BinData/').name is 'BinData', so the
+        # folder itself would otherwise be written out as an empty "image".
+        if not name.startswith('BinData/') or name.endswith('/'):
+            continue
+        filename = Path(name).name
+        if not filename:
+            continue
+        (images_dir / filename).write_bytes(zf.read(name))
+        id_to_file[Path(filename).stem] = filename
+    return id_to_file
+
+
+def _extract_text(p_elem) -> str:
+    """Return the text of an ``hp:p`` paragraph element.
+
+    For every direct ``hp:run`` the text of its ``hp:t`` children is
+    collected, followed by a single space when the run contains an
+    ``hp:tab``; the joined result is stripped.
+    """
+    pieces = []
+    for run in p_elem.findall('hp:run', NS):
+        pieces.extend(t.text for t in run.findall('hp:t', NS) if t.text)
+        if run.find('hp:tab', NS) is not None:
+            pieces.append(' ')
+    return ''.join(pieces).strip()
+
+
+def _detect_structure(text: str):
+    """Classify a paragraph by its leading numbering or marker.
+
+    Returns a ``(kind, level, text)`` tuple where ``kind`` is 'heading',
+    'bullet' or 'paragraph' and ``level`` is the heading depth (0 for
+    non-headings). A paragraph starting with '※' is returned as a Markdown
+    block quote; every other text is returned unchanged.
+    """
+    if not text:
+        return 'paragraph', 0, text
+    # First match wins, so deeper numbering is listed before shallower.
+    rules = (
+        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
+        (r'^\d+\.\d+\s', 'heading', 3),
+        (r'^\d+\.\s.+', 'heading', 2),
+        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
+        (r'^[□■]\s*.+', 'heading', 2),
+        (r'^[○●◎]\s*.+', 'heading', 3),
+        (r'^[▶▷]\s*.+', 'heading', 4),
+        (r'^[▪▫\-]\s*.+', 'bullet', 0),
+    )
+    for pattern, kind, level in rules:
+        if re.match(pattern, text):
+            return kind, level, text
+    if re.match(r'^[※]', text):
+        return 'paragraph', 0, f'> {text}'
+    return 'paragraph', 0, text
+
+
+def _cell_text(tc_elem) -> str:
+    """Return a table cell's text, one ``<br>``-separated piece per paragraph.
+
+    Only paragraphs directly under the cell's ``hp:subList`` children are
+    read; paragraphs that contain a nested table and paragraphs without text
+    are left out.
+    """
+    texts = (
+        _extract_text(p)
+        for sub in tc_elem.findall('hp:subList', NS)
+        for p in sub.findall('hp:p', NS)
+        if p.find('.//hp:tbl', NS) is None
+    )
+    return '<br>'.join(t for t in texts if t)
+
+
+def _get_span(tc_elem):
+    """Return ``(col_span, row_span)`` for a table cell.
+
+    The cell's own ``colSpan``/``rowSpan`` attributes (default 1) are the
+    baseline; values on a child ``hp:cellSpan`` element take precedence.
+    """
+    col_span = int(tc_elem.get('colSpan', 1))
+    row_span = int(tc_elem.get('rowSpan', 1))
+    span_elem = tc_elem.find('hp:cellSpan', NS)
+    if span_elem is None:
+        return col_span, row_span
+    col_span = int(span_elem.get('colSpan', col_span))
+    row_span = int(span_elem.get('rowSpan', row_span))
+    return col_span, row_span
+
+
+def _extract_table(tbl_elem) -> str:
+    """Render an ``hp:tbl`` element as Markdown, or HTML when cells are merged.
+
+    Tables containing any cell that spans more than one column or row are
+    emitted as an HTML ``<table>`` (first row as ``<th>``); all others become
+    a pipe table padded to the widest row. Returns an empty string when the
+    table has no cells.
+    """
+    merged = False
+    table_rows = []
+    for tr in tbl_elem.findall('hp:tr', NS):
+        row_cells = []
+        for tc in tr.findall('hp:tc', NS):
+            col_span, row_span = _get_span(tc)
+            merged = merged or col_span > 1 or row_span > 1
+            row_cells.append((col_span, row_span, _cell_text(tc)))
+        if row_cells:
+            table_rows.append(row_cells)
+    if not table_rows:
+        return ''
+
+    if merged:
+        html = ['<table>']
+        for index, row_cells in enumerate(table_rows):
+            cell_tag = 'td' if index else 'th'
+            html.append('<tr>')
+            for col_span, row_span, text in row_cells:
+                attrs = ''
+                if col_span > 1:
+                    attrs += f' colspan="{col_span}"'
+                if row_span > 1:
+                    attrs += f' rowspan="{row_span}"'
+                html.append(f'<{cell_tag}{attrs}>{text}</{cell_tag}>')
+            html.append('</tr>')
+        html.append('</table>')
+        return '\n'.join(html)
+
+    grid_rows = [[text for _, _, text in row_cells] for row_cells in table_rows]
+    widest = max(len(row) for row in grid_rows)
+    for row in grid_rows:
+        row.extend([''] * (widest - len(row)))
+
+    def render(row):
+        escaped = (cell.replace('|', '\\|').replace('\n', ' ') for cell in row)
+        return '| ' + ' | '.join(escaped) + ' |'
+
+    out = [render(grid_rows[0]), '| ' + ' | '.join(['---'] * widest) + ' |']
+    out.extend(render(row) for row in grid_rows[1:])
+    return '\n'.join(out)
+
+
+def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
+    """Convert one ``hp:p`` paragraph into Markdown lines.
+
+    A paragraph containing a table yields just that table; otherwise one
+    containing a picture yields just an image reference; otherwise its text
+    is emitted as a heading, bullet or plain paragraph.
+
+    Args:
+        p_elem: the paragraph element.
+        pic_counter: one-element list holding the running picture index;
+            incremented in place for every picture.
+        id_to_file: image id (``binaryItemIDRef``) to extracted file name.
+        base_name: document stem, used for the ``<stem>_images/`` prefix.
+
+    Returns:
+        The Markdown lines for this paragraph (possibly empty).
+    """
+    tbl = p_elem.find('.//hp:tbl', NS)
+    if tbl is not None:
+        md = _extract_table(tbl)
+        return [md] if md else []
+
+    pic = p_elem.find('.//hp:pic', NS)
+    if pic is not None:
+        idx = pic_counter[0]
+        pic_counter[0] += 1
+        img_elem = pic.find('.//hc:img', NS)
+        if img_elem is not None:
+            ref_id = img_elem.get('binaryItemIDRef', '')
+            filename = id_to_file.get(ref_id, '')
+            if filename:
+                return [f'![그림 {idx+1}]({base_name}_images/{filename})']
+        # Fall back to a placeholder name when the image was not extracted.
+        return [f'![그림 {idx+1}](그림_{idx+1}.png)']
+
+    text = _extract_text(p_elem)
+    if not text:
+        return []
+    kind, level, fmt = _detect_structure(text)
+    if kind == 'heading':
+        return [f'{"#" * level} {fmt}']
+    if kind == 'bullet':
+        # Strip the original marker outside the f-string: a backslash inside
+        # an f-string expression is a SyntaxError before Python 3.12, and the
+        # previous doubled "\\s" demanded a literal backslash, so the marker
+        # was never removed.
+        item = re.sub(r'^[▪▫\-]\s*', '', fmt)
+        return [f'- {item}']
+    return [fmt]
+
+
+def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
+    """Convert a HWPX archive to ``<stem>.md`` inside ``output_dir``.
+
+    Images under ``BinData/`` are extracted to ``<stem>_images`` and the
+    ``Contents/sectionN.xml`` parts are rendered in numeric section order.
+
+    Returns the result dict (status, input, output, format, images);
+    failures are reported through ``status == 'error'`` and an ``error``
+    message rather than raised.
+    """
+    hwpx_path = Path(hwpx_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    md_path = output_dir / f'{hwpx_path.stem}.md'
+    images_dir = output_dir / f'{hwpx_path.stem}_images'
+
+    result = {
+        "status": "ok", "input": str(hwpx_path),
+        "output": str(md_path), "format": "hwpx", "images": [],
+    }
+    try:
+        md_lines: list[str] = []
+        with zipfile.ZipFile(hwpx_path, 'r') as zf:
+            id_to_file = _extract_images(zf, images_dir)
+            result["images"] = [str(images_dir / f) for f in id_to_file.values()]
+
+            # Order sections by their number: a plain string sort would put
+            # section10.xml before section2.xml.
+            numbered = []
+            for name in zf.namelist():
+                m = re.fullmatch(r'Contents/section(\d+)\.xml', name)
+                if m:
+                    numbered.append((int(m.group(1)), name))
+
+            pic_counter = [0]
+            for _, sec_file in sorted(numbered):
+                root = ET.fromstring(zf.read(sec_file))
+                for p_elem in root.findall('hp:p', NS):
+                    # Paragraphs carrying section properties are skipped.
+                    if p_elem.find('.//hp:secPr', NS) is not None:
+                        continue
+                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
+                        if not line:
+                            continue
+                        # Block-level items get a blank line in front as well.
+                        if line.startswith(('#', '<table', '|', '![')):
+                            if md_lines and md_lines[-1] != '':
+                                md_lines.append('')
+                        md_lines.append(line)
+                        md_lines.append('')
+
+        # Collapse runs of blank lines before writing.
+        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
+        md_path.write_text(final, encoding='utf-8')
+    except Exception as e:
+        result['status'] = 'error'
+        result['error'] = str(e)
+    return result
--- a/converters/pdf.py
+++ b/converters/pdf.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""
+PDF → Markdown 변환기 (페이지별 분류 + 라우팅)
+
+페이지 타입:
+  text            - 텍스트 위주 → marker-pdf 추출
+  text-with-photo - 텍스트 + 사진 → marker-pdf + 이미지 크롭
+  diagram         - 다이어그램/도면 → 페이지 PNG 렌더링 (에이전트가 Vision으로 처리)
+  image-heavy     - 텍스트 거의 없음 → 페이지 PNG 렌더링
+"""
+from __future__ import annotations
+
+import io
+import re
+from pathlib import Path
+
+import fitz  # PyMuPDF
+from PIL import Image
+
+
+# ── 페이지 분류 ───────────────────────────────────────────────────────────────
+
+def _pix_to_pil(pix: fitz.Pixmap) -> Image.Image:
+    """PyMuPDF Pixmap → PIL Image."""
+    mode = "RGBA" if pix.alpha else "RGB"
+    return Image.frombytes(mode, (pix.width, pix.height), pix.samples)
+
+
+def _is_diagram_image(img: Image.Image) -> bool:
+    """
+    래스터 이미지가 다이어그램인지 판별.
+    다이어그램 특성: 제한된 색상 팔레트 + 높은 흰 배경 비율.
+    """
+    # 너무 작은 이미지(로고, 아이콘)는 스킵
+    if img.width < 100 or img.height < 100:
+        return False
+
+    # 색상 수 (64색으로 양자화 후 실제 사용 색상)
+    small = img.resize((200, 200), Image.LANCZOS).convert("RGB")
+    quantized = small.quantize(colors=64)
+    color_count = len(set(quantized.getdata()))
+
+    # 흰 배경 비율
+    gray = small.convert("L")
+    pixels = list(gray.getdata())
+    white_ratio = sum(1 for p in pixels if p > 240) / len(pixels)
+
+    return color_count < 32 and white_ratio > 0.35
+
+
+def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
+    """
+    페이지를 분류한다.
+    반환값: 'text' | 'text-with-photo' | 'diagram' | 'image-heavy'
+    """
+    text = page.get_text().strip()
+    text_len = len(text)
+    page_area = page.rect.width * page.rect.height
+
+    drawings = page.get_drawings()
+    images = page.get_images(full=True)
+
+    text_density = text_len / page_area * 10_000  # 면적 대비 문자 수
+
+    # 벡터 드로잉 밀도 (flowchart, CAD export 등은 수백 개 드로잉 포함)
+    drawing_density = len(drawings) / page_area * 10_000
+
+    # 1) 텍스트가 충분하면 텍스트 계열
+    if text_density > 4:
+        if not images:
+            return "text"
+        # 이미지가 있어도 작은 이미지(로고 등)면 text
+        large_images = [
+            img for img in images
+            if doc.extract_image(img[0])["width"] > 150
+            and doc.extract_image(img[0])["height"] > 150
+        ]
+        return "text-with-photo" if large_images else "text"
+
+    # 2) 벡터 드로잉이 많으면 다이어그램
+    if drawing_density > 1.5:
+        return "diagram"
+
+    # 3) 래스터 이미지가 있으면 다이어그램 여부 분석
+    if images:
+        for img_info in images[:3]:  # 최대 3개만 검사 (속도)
+            try:
+                xref = img_info[0]
+                pix = fitz.Pixmap(doc, xref)
+                if pix.colorspace and pix.colorspace.n > 1:
+                    pil = _pix_to_pil(pix)
+                    if _is_diagram_image(pil):
+                        return "diagram"
+            except Exception:
+                pass
+        return "text-with-photo" if text_len > 50 else "image-heavy"
+
+    # 4) 텍스트도 이미지도 거의 없음
+    return "image-heavy" if not text_len else "text"
+
+
+# ── 페이지 PNG 렌더링 ─────────────────────────────────────────────────────────
+
+def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
+    """페이지를 고해상도 PNG로 렌더링."""
+    mat = fitz.Matrix(scale, scale)
+    pix = page.get_pixmap(matrix=mat)
+    pix.save(str(output_path))
+
+
+# ── 메인 변환 함수 ────────────────────────────────────────────────────────────
+
+def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
+    """
+    PDF → MD 변환. AGENT_GUIDE.md 스펙의 JSON 구조를 dict로 반환.
+
+    반환 dict:
+      status        : "ok" | "error"
+      input         : str
+      output        : str (md 파일 경로)
+      format        : "pdf"
+      pages         : list of {n, type, image?}
+      has_diagrams  : bool
+      diagram_pages : list[int]
+      images        : list[str]
+      error?        : str
+    """
+    pdf_path = Path(pdf_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    images_dir = output_dir / f"{pdf_path.stem}_images"
+    md_path = output_dir / f"{pdf_path.stem}.md"
+
+    result: dict = {
+        "status": "ok",
+        "input": str(pdf_path),
+        "output": str(md_path),
+        "format": "pdf",
+        "pages": [],
+        "has_diagrams": False,
+        "diagram_pages": [],
+        "images": [],
+    }
+
+    try:
+        doc = fitz.open(str(pdf_path))
+
+        # ── 1) 각 페이지 분류 ──────────────────────────────────────────────
+        page_types: list[str] = []
+        for page in doc:
+            ptype = classify_page(page, doc)
+            page_types.append(ptype)
+            result["pages"].append({"n": page.number + 1, "type": ptype})
+
+        diagram_page_nums = [
+            i + 1 for i, t in enumerate(page_types)
+            if t in ("diagram", "image-heavy")
+        ]
+        result["has_diagrams"] = bool(diagram_page_nums)
+        result["diagram_pages"] = diagram_page_nums
+
+        # ── 2) 텍스트 추출 (marker-pdf) ────────────────────────────────────
+        text_sections: list[str] = []
+        try:
+            from marker.converters.pdf import PdfConverter
+            from marker.models import create_model_dict
+            from marker.output import text_from_rendered
+
+            converter = PdfConverter(artifact_dict=create_model_dict())
+            rendered = converter(str(pdf_path))
+            full_text, _, marker_images = text_from_rendered(rendered)
+
+            # marker 추출 이미지 저장
+            if marker_images:
+                images_dir.mkdir(exist_ok=True)
+                for img_name, img_data in marker_images.items():
+                    try:
+                        img_dest = images_dir / img_name
+                        if isinstance(img_data, Image.Image):
+                            img_data.save(str(img_dest))
+                        elif isinstance(img_data, bytes) and img_data:
+                            img_dest.write_bytes(img_data)
+                        result["images"].append(str(img_dest))
+                    except Exception:
+                        pass
+
+            # 이미지 경로 prefix 수정
+            full_text = re.sub(
+                r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
+                rf'![\1]({pdf_path.stem}_images/\2)',
+                full_text,
+            )
+            text_sections.append(full_text)
+
+        except ImportError:
+            # marker-pdf 없으면 PyMuPDF 텍스트 추출로 fallback
+            pages_text = []
+            for page in doc:
+                t = page.get_text().strip()
+                if t:
+                    pages_text.append(t)
+            text_sections.append("\n\n---\n\n".join(pages_text))
+
+        # ── 3) 다이어그램 페이지 PNG 렌더링 ────────────────────────────────
+        if diagram_page_nums:
+            images_dir.mkdir(exist_ok=True)
+            diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]
+
+            for page_num in diagram_page_nums:
+                page = doc[page_num - 1]
+                img_name = f"page_{page_num}.png"
+                img_path = images_dir / img_name
+                _render_page_png(page, img_path)
+                result["images"].append(str(img_path))
+                diagram_section_lines.append(
+                    f"\n### Page {page_num}\n"
+                    f"![Page {page_num} — 다이어그램]"
+                    f"({pdf_path.stem}_images/{img_name})\n"
+                )
+                # pages 항목에 image 경로 추가
+                for p in result["pages"]:
+                    if p["n"] == page_num:
+                        p["image"] = str(img_path)
+
+            text_sections.append("".join(diagram_section_lines))
+
+        doc.close()
+
+        # ── 4) MD 파일 저장 ────────────────────────────────────────────────
+        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
+        md_path.write_text(final_md, encoding="utf-8")
+
+    except Exception as e:
+        result["status"] = "error"
+        result["error"] = str(e)
+        import traceback
+        traceback.print_exc()
+
+    return result
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,26 @@
+# doc2md 필수 패키지
+# pip install -r requirements.txt
+
+# PDF 변환 (텍스트/이미지 혼합)
+marker-pdf>=1.0.0
+
+# PDF 페이지 분석 + 렌더링
+PyMuPDF>=1.23.0
+
+# 이미지 처리 (다이어그램 감지)
+Pillow>=10.0.0
+
+# XML 파싱 (HML, HWPX) — xml.etree는 표준 라이브러리에 포함, lxml은 별도 설치
+lxml>=4.9.0
+
+# HTML 파싱 (HWP pyhwp fallback)
+beautifulsoup4>=4.12.0
+
+# HTML → MD
+html2text>=2020.1.16
+
+# HWP 변환 fallback (한컴오피스 미설치 환경)
+pyhwp>=0.1.0b19
+
+# Windows 전용: HWP COM 자동화 (한컴오피스 설치 시 자동 사용)
+# pywin32>=306