feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)

- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드
- converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링
- converters/hwp.py: COM 자동화 + pyhwp fallback
- converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출
- converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표
- converters/html.py: html2text (body_width=0)
- requirements.txt: 최소 의존성
- .env.example: 환경변수 템플릿

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-20 09:06:34 +09:00
parent 6f365018f5
commit 2ec2759a20
12 changed files with 1072 additions and 0 deletions
--- a/.claude/hooks/token-usage/.gitignore
+++ b/.claude/hooks/token-usage/.gitignore
@@ -0,0 +1 @@
 aptabase.json
--- a/.claude/settings.json
+++ b/.claude/settings.json
@@ -0,0 +1,43 @@
 {
  "hooks": {
    "UserPromptSubmit": [
      {
        "hooks": [
          {
            "type": "command",
            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" session-context \"$t\";rm -f \"$t\"",
            "timeout": 5
          }
        ]
      }
    ],
    "Stop": [
      {
        "hooks": [
          {
            "type": "command",
            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" stop-record \"$t\";rm -f \"$t\"",
            "timeout": 5
          }
        ]
      }
    ],
    "PostToolUse": [
      {
        "matcher": "Bash",
        "hooks": [
          {
            "type": "command",
            "command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" aptabase-commit \"$t\";rm -f \"$t\"",
            "timeout": 15
          }
        ]
      }
    ]
  },
  "permissions": {
    "allow": [
      "mcp__gitea__issue_write"
    ]
  }
 }
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,5 @@
 # doc2md 환경변수 예시
 # 이 파일을 .env로 복사 후 값 수정
 # ParaWiki 등 외부 프로젝트에서 이 도구를 subprocess로 호출할 때 사용
 # DOCU_CONVERTER_PATH=D:\MYCLAUDE_PROJECT\doc2md
--- a/.usage/token/.gitignore
+++ b/.usage/token/.gitignore
@@ -0,0 +1,2 @@
 job-commit-pool.json
 job-send-pool.json
--- a/convert.py
+++ b/convert.py
@@ -0,0 +1,136 @@
 #!/usr/bin/env python3
 """
 doc2md — 통합 문서 변환기 (AI 에이전트용)
 사용법: python convert.py <file> -o <output_dir> [--json]
        python convert.py --scan <dir> -o <output_dir> [--json]
 자세한 사용법: AGENT_GUIDE.md 참조
 """
 from __future__ import annotations
 import argparse
 import json
 import sys
 from pathlib import Path
 # File suffixes (lower-case, leading dot included) that folder-scan mode collects.
 SUPPORTED = {'.pdf', '.hwp', '.hwpx', '.hml', '.html', '.htm'}
 # File names that folder-scan mode skips outright; compared against Path.name.
 SKIP_NAMES = {'README.md', 'CLAUDE.md', 'AGENT_GUIDE.md'}
 def convert_file(src: Path, output_dir: Path) -> dict:
    """Convert one document and return its result dict.

    The converter is chosen from the lower-cased file suffix and imported
    lazily, so only the dependencies of the requested format are loaded.
    An unrecognised suffix yields a ``skipped`` result.  Any exception
    raised while importing or running a converter is turned into an
    ``error`` result instead of propagating to the caller.
    """
    suffix = src.suffix.lower()
    try:
        if suffix == '.pdf':
            from converters.pdf import convert_pdf as handler
        elif suffix == '.hwp':
            from converters.hwp import convert_hwp as handler
        elif suffix == '.hwpx':
            from converters.hwpx import convert_hwpx as handler
        elif suffix == '.hml':
            from converters.hml import convert_hml as handler
        elif suffix in ('.html', '.htm'):
            from converters.html import convert_html as handler
        else:
            return {"status": "skipped", "input": str(src), "reason": "unsupported_format"}
        # Single call site: every converter shares the (src, output_dir) signature.
        return handler(src, output_dir)
    except Exception as exc:
        return {"status": "error", "input": str(src), "error": str(exc)}
 def scan_and_convert(scan_dir: Path, output_dir: Path) -> dict:
    """Convert every supported document found under *scan_dir*.

    The directory is walked recursively.  Each file is converted into the
    sub-folder of *output_dir* that mirrors its location relative to
    *scan_dir*.  A file that already has a ``.md`` sibling next to the
    source is reported as skipped instead of being converted again.

    Returns a summary dict holding ``status`` (``ok`` when nothing failed,
    ``error`` when there were failures and no success, ``partial``
    otherwise), the counters ``total`` / ``converted`` / ``skipped`` /
    ``failed`` and the per-file ``results`` list.
    """
    # Compare the lower-cased suffix, exactly as convert_file() does, so that
    # "REPORT.PDF" is also picked up on case-sensitive file systems.  The
    # is_file() check keeps directories whose name merely ends in a supported
    # extension from being handed to a converter.
    targets = sorted(
        p for p in scan_dir.rglob('*')
        if p.is_file() and p.suffix.lower() in SUPPORTED
    )
    results = []
    ok = fail = skipped = 0
    for src in targets:
        if src.name in SKIP_NAMES:
            continue
        # Skip sources that already have a Markdown file beside them.
        if src.with_suffix('.md').exists():
            results.append({"input": str(src), "output": None,
                            "status": "skipped", "reason": "already_md"})
            skipped += 1
            continue
        # Mirror the source's relative folder below the output root.
        out_dir = output_dir / src.parent.relative_to(scan_dir)
        r = convert_file(src, out_dir)
        results.append(r)
        if r['status'] == 'ok':
            ok += 1
        elif r['status'] == 'error':
            fail += 1
        else:
            skipped += 1
    return {
        "status": "ok" if fail == 0 else ("error" if ok == 0 else "partial"),
        "total": len(results),
        "converted": ok,
        "skipped": skipped,
        "failed": fail,
        "results": results,
    }
 def main():
    """Command-line entry point.

    Parses the arguments, runs either folder-scan mode (``--scan``) or a
    single-file conversion, prints the result as JSON or as a short
    human-readable summary, and terminates the process via ``sys.exit``.

    Exit codes: scan mode uses 0 for ``ok``, 1 for ``partial`` and 2
    otherwise; single-file mode uses 0 for ``ok`` and 2 otherwise; calling
    the tool with neither a file nor ``--scan`` prints the help and exits
    with 1.
    """
    parser = argparse.ArgumentParser(
        description='doc2md — AI 에이전트용 문서 변환기',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='자세한 사용법: AGENT_GUIDE.md'
    )
    parser.add_argument('file', nargs='?', help='변환할 파일')
    parser.add_argument('-o', '--output', required=True, help='출력 폴더')
    parser.add_argument('--scan', metavar='DIR', help='폴더 일괄 변환 모드')
    parser.add_argument('--json', action='store_true', help='결과를 JSON으로 출력')
    args = parser.parse_args()
    output_dir = Path(args.output)

    # Nothing to do: show usage and leave.
    if not args.scan and not args.file:
        parser.print_help()
        sys.exit(1)

    if args.scan:
        # --scan takes precedence over a positional file argument.
        result = scan_and_convert(Path(args.scan), output_dir)
        exit_code = {'ok': 0, 'partial': 1}.get(result['status'], 2)
    else:
        src = Path(args.file)
        if not src.exists():
            if args.json:
                missing = {"status": "error", "input": str(src), "error": "파일 없음"}
                print(json.dumps(missing, ensure_ascii=False))
            else:
                print(f"오류: 파일 없음 — {src}", file=sys.stderr)
            sys.exit(2)
        result = convert_file(src, output_dir)
        exit_code = 2 if result['status'] != 'ok' else 0

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif 'results' in result:
        # Human-readable summary of a folder scan.
        print(f"[doc2md] {result['converted']}개 변환 / {result['skipped']}개 스킵 / {result['failed']}개 실패")
    else:
        # Human-readable summary of a single-file conversion.
        status = result.get('status', '')
        output = result.get('output', '')
        print(f"[doc2md] {status.upper()} — {output or result.get('error', '')}")
        if result.get('has_diagrams'):
            pages = result.get('diagram_pages', [])
            print(f"[doc2md] 다이어그램 페이지: {pages} → Vision AI 처리 필요")
    sys.exit(exit_code)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
--- a/converters/init.py
+++ b/converters/init.py
--- a/converters/hml.py
+++ b/converters/hml.py
@@ -0,0 +1,188 @@
 #!/usr/bin/env python3
 """HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
 from __future__ import annotations
 import base64
 import re
 import xml.etree.ElementTree as ET
 from pathlib import Path
 def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
    """Write the Base64-embedded binaries of an HML tree to *images_dir*.

    Every ``BINDATA`` element whose encoding is Base64 is decoded and saved
    as ``BIN<id, zero-padded to 4 digits>.<format>``.  The format comes from
    the ``BINITEM`` element whose ``BinData`` attribute equals the id and
    defaults to ``png``.

    Extraction is best-effort: an entry with a missing or non-numeric
    ``Id``, undecodable data or a failing write is skipped, so one broken
    image cannot abort the conversion of the whole document.

    Returns ``(id_to_file, bin_order)``.  ``id_to_file`` maps the BINDATA id
    to the written file name.  ``bin_order`` holds, for each ``PICTURE``
    below ``BODY`` in document order, the ``BinItem`` attribute of its first
    ``IMAGE`` descendant, or ``None`` when it has none.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    bin_format = {
        item.get('BinData'): item.get('Format', 'PNG').lower()
        for item in tree.findall('.//BINITEM') if item.get('BinData')
    }
    id_to_file = {}
    for bindata in tree.findall('.//BINDATA'):
        bid = bindata.get('Id')
        raw = (bindata.text or '').strip()
        if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
            continue
        fmt = bin_format.get(bid, 'png')
        try:
            # int() raises TypeError for a missing Id and ValueError for a
            # non-numeric one; b64decode raises binascii.Error (a ValueError).
            filename = f'BIN{int(bid):04d}.{fmt}'
            (images_dir / filename).write_bytes(base64.b64decode(raw))
        except (TypeError, ValueError, OSError):
            continue
        id_to_file[bid] = filename
    body = tree.find('.//BODY')
    bin_order = []
    if body is not None:
        for pic in body.findall('.//PICTURE'):
            imgs = pic.findall('.//IMAGE')
            bin_order.append(imgs[0].get('BinItem') if imgs else None)
    return id_to_file, bin_order
 def _extract_text(p_elem) -> str:
    """Return the plain text of an HML paragraph element.

    Walks the direct ``TEXT`` children of *p_elem*: each ``CHAR`` child
    contributes its text, each ``TAB`` child contributes one space.  The
    concatenated result is stripped of surrounding whitespace.
    """
    pieces = []
    for text_node in p_elem.findall('TEXT'):
        for node in text_node:
            if node.tag == 'TAB':
                pieces.append(' ')
            elif node.tag == 'CHAR' and node.text:
                pieces.append(node.text)
    return ''.join(pieces).strip()
 def _detect_structure(text: str):
    """Classify a paragraph by its leading numbering or bullet glyph.

    Returns a ``(kind, level, text)`` triple where *kind* is ``'heading'``,
    ``'bullet'`` or ``'paragraph'`` and *level* is the heading depth (0 for
    anything that is not a heading).  The text is returned unchanged,
    except that a paragraph starting with ``※`` is prefixed with ``'> '``.
    """
    if not text:
        return 'paragraph', 0, text
    # Checked top to bottom; the first matching rule wins, so the more
    # specific numbering patterns have to come first.
    rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in rules:
        if re.match(pattern, text):
            return kind, level, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
 def _extract_table(table_elem) -> str:
    """Render an HML ``TABLE`` element as Markdown or HTML.

    Each ``CELL`` contributes its ``ColAddr``, ``ColSpan``, ``RowSpan`` and
    the text of its paragraphs joined with ``<br>``.  A table with at least
    one merged cell (span greater than 1) is emitted as an HTML ``<table>``
    so the spans survive, with the first row as header cells.  Otherwise a
    pipe table is produced, placing every cell at its ``ColAddr``.
    Returns an empty string when the table has no cells.
    """
    declared_cols = int(table_elem.get('ColCount', 0))
    table_rows = []
    for row_elem in table_elem.findall('.//ROW'):
        row_cells = []
        for cell_elem in row_elem.findall('CELL'):
            col_span = int(cell_elem.get('ColSpan', 1))
            row_span = int(cell_elem.get('RowSpan', 1))
            col_addr = int(cell_elem.get('ColAddr', 0))
            paragraphs = (_extract_text(p) for p in cell_elem.findall('.//P'))
            cell_text = '<br>'.join(filter(None, paragraphs))
            row_cells.append((col_addr, col_span, row_span, cell_text))
        if row_cells:
            table_rows.append(row_cells)
    if not table_rows:
        return ''

    merged = any(cs > 1 or rs > 1 for row in table_rows for _, cs, rs, _ in row)
    if merged:
        html = ['<table>']
        for row_no, row in enumerate(table_rows):
            cell_tag = 'td' if row_no else 'th'
            html.append('<tr>')
            for _, cs, rs, text in row:
                attrs = ''
                if cs > 1:
                    attrs += f' colspan="{cs}"'
                if rs > 1:
                    attrs += f' rowspan="{rs}"'
                html.append(f'<{cell_tag}{attrs}>{text}</{cell_tag}>')
            html.append('</tr>')
        html.append('</table>')
        return '\n'.join(html)

    # Plain grid: position each cell by its column address.
    grid_rows = []
    for row in table_rows:
        by_col = {addr: text for addr, _, _, text in row}
        width = declared_cols if declared_cols > 0 else max(by_col) + 1
        grid_rows.append([by_col.get(i, '') for i in range(width)])
    widest = max(map(len, grid_rows))
    for cells in grid_rows:
        cells.extend([''] * (widest - len(cells)))

    def fmt_row(cells):
        # Pipes would end the cell and newlines would end the row.
        escaped = (c.replace('|', '\\|').replace('\n', ' ') for c in cells)
        return '| ' + ' | '.join(escaped) + ' |'

    out = [fmt_row(grid_rows[0]), '| ' + ' | '.join(['---'] * widest) + ' |']
    out.extend(fmt_row(cells) for cells in grid_rows[1:])
    return '\n'.join(out)
 def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one HML paragraph into Markdown lines.

    Tables and pictures found directly under the paragraph's ``TEXT``
    children are emitted first.  When the paragraph holds neither, its text
    is classified and emitted as a heading, a bullet or a plain paragraph.

    *pic_counter* is a one-element list used as a running picture index
    shared across calls.  *bin_order* and *id_to_file* resolve that index
    to an extracted image file; a picture that cannot be resolved gets a
    placeholder reference.  *base_name* is the stem used for the
    ``<base_name>_images`` folder in image links.
    """
    lines = []
    has_content = False
    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'TABLE':
                has_content = True
                md = _extract_table(child)
                if md:
                    lines.append(md)
            elif child.tag == 'PICTURE':
                has_content = True
                idx = pic_counter[0]
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
                lines.append(f'![그림 {idx+1}]({ref})')
    if has_content:
        return lines
    text = _extract_text(p_elem)
    if not text:
        return lines
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        lines.append(f'{"#" * level} {fmt}')
    elif kind == 'bullet':
        # Drop the source bullet glyph and following whitespace before adding
        # the Markdown marker.  The substitution is done outside the f-string:
        # a backslash inside an f-string expression is a SyntaxError before
        # Python 3.12, and the previous raw pattern "\\s" matched a literal
        # backslash, so the glyph was never removed.
        item = re.sub(r'^[▪▫\-]\s*', '', fmt)
        lines.append(f'- {item}')
    else:
        lines.append(fmt)
    return lines
 def convert_hml(hml_path: Path, output_dir: Path) -> dict:
    """Convert an HML file to Markdown and return the result dict.

    Writes ``<stem>.md`` into *output_dir* and extracts embedded images into
    ``<stem>_images``.  The document title comes from the ``TITLE`` element
    and falls back to the file stem.  The returned dict carries ``status``,
    ``input``, ``output``, ``format`` and ``images``; on failure ``status``
    is ``error`` and ``error`` holds the message.
    """
    hml_path = Path(hml_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = hml_path.stem
    md_path = output_dir / f'{stem}.md'
    images_dir = output_dir / f'{stem}_images'
    result = {
        "status": "ok", "input": str(hml_path),
        "output": str(md_path), "format": "hml", "images": [],
    }
    try:
        tree = ET.fromstring(hml_path.read_text(encoding='utf-8-sig'))
        id_to_file, bin_order = _extract_images(tree, images_dir)
        result["images"] = [str(images_dir / name) for name in id_to_file.values()]

        title_elem = tree.find('.//TITLE')
        if title_elem is not None and title_elem.text:
            doc_title = title_elem.text.strip()
        else:
            doc_title = stem
        md_lines = [f'# {doc_title}', '']

        body = tree.find('.//BODY')
        if body is None:
            result.update(status='error', error='BODY 요소 없음')
            return result

        pic_counter = [0]
        paragraphs = (
            p for section in body.findall('.//SECTION')
            for p in section.findall('P')
        )
        for p_elem in paragraphs:
            for line in _process_p(p_elem, pic_counter, bin_order, id_to_file, stem):
                if not line:
                    continue
                # Headings get a blank line in front; every emitted line is
                # followed by one.
                if line.startswith('#') and md_lines and md_lines[-1] != '':
                    md_lines.append('')
                md_lines.extend((line, ''))

        # Collapse runs of blank lines down to a single one.
        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as exc:
        result['status'] = 'error'
        result['error'] = str(exc)
    return result
--- a/converters/html.py
+++ b/converters/html.py
@@ -0,0 +1,31 @@
 #!/usr/bin/env python3
 """HTML / HTM → Markdown (html2text, body_width=0)"""
 from __future__ import annotations
 from pathlib import Path
 def convert_html(html_path: Path, output_dir: Path) -> dict:
    """Convert an HTML file to Markdown and return the result dict.

    The output directory is created when missing and ``<stem>.md`` is
    written into it.  ``html2text`` is imported lazily and configured with
    line wrapping disabled, keeping links and images.  The source is read
    as UTF-8, ignoring undecodable bytes.  Any failure, including a missing
    ``html2text`` package, is reported through ``status``/``error`` rather
    than raised.
    """
    source = Path(html_path)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    md_path = target_dir / f'{source.stem}.md'
    result = {
        "status": "ok", "input": str(source),
        "output": str(md_path), "format": "html",
    }
    try:
        import html2text
        converter = html2text.HTML2Text()
        converter.body_width = 0  # 0 turns hard line wrapping off
        converter.ignore_links = False
        converter.ignore_images = False
        markup = source.read_text(encoding='utf-8', errors='ignore')
        md_path.write_text(converter.handle(markup), encoding='utf-8')
    except Exception as exc:
        result['status'] = 'error'
        result['error'] = str(exc)
    return result
--- a/converters/hwp.py
+++ b/converters/hwp.py
@@ -0,0 +1,212 @@
 #!/usr/bin/env python3
 """HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
 from __future__ import annotations
 import re
 import shutil
 import tempfile
 from pathlib import Path
 def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
    """Convert an HWP file to HML through the ``HWPFrame.HwpObject`` COM server.

    The COM work runs in a daemon thread so that a hanging automation call
    cannot block the caller for more than *timeout* seconds.  Returns True
    only when the thread finished in time and *hml_path* exists afterwards.
    Returns False when ``pythoncom``/``win32com`` cannot be imported, when
    opening or saving fails, or on timeout.  On timeout the worker thread
    is left running in the background.

    Diagnostics go to stderr so they cannot corrupt the JSON that the CLI
    prints on stdout.
    """
    import sys
    import threading
    result = [False]

    def _run():
        try:
            import pythoncom
            import win32com.client
        except ImportError:
            return
        hwp = None
        try:
            pythoncom.CoInitialize()
            hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
            try:
                hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
            except Exception:
                # Registration is optional; carry on without it.
                pass
            ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
            if not ok:
                return
            hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
            result[0] = hml_path.exists()
        except Exception as e:
            print(f'  COM 오류: {e}', file=sys.stderr)
        finally:
            # Best-effort teardown: never let cleanup mask the real outcome.
            if hwp:
                try:
                    hwp.Quit()
                except Exception:
                    pass
            try:
                pythoncom.CoUninitialize()
            except Exception:
                pass

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        print(f'  COM 타임아웃 ({timeout}초) — pyhwp로 전환', file=sys.stderr)
    return result[0]
 def _table_to_md(table_elem) -> str:
    """Render a parsed HTML ``<table>`` element as Markdown or HTML.

    *table_elem* is queried with ``find_all`` / ``get`` / ``get_text``, the
    interface of a BeautifulSoup tag.  Direct ``<tr>`` children are
    preferred; when there are none, all descendant rows are used.  A table
    containing a ``colspan``/``rowspan`` greater than 1 is emitted as an
    HTML ``<table>`` so the merge survives, with the first row as header
    cells.  Otherwise a pipe table is produced.  Returns an empty string
    when no cells are found.
    """
    rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
    if not rows:
        return ''
    has_merge = False
    parsed = []
    for tr in rows:
        cells = []
        for td in tr.find_all(['td', 'th']):
            cs = int(td.get('colspan', 1))
            rs = int(td.get('rowspan', 1))
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
        if cells:
            parsed.append(cells)
    if not parsed:
        return ''
    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(parsed):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows_text = [[text for _, _, text in cells] for cells in parsed]
    mc = max(len(r) for r in rows_text)
    for r in rows_text:
        r += [''] * (mc - len(r))

    def esc(s):
        # A pipe would end the cell and a newline would end the table row,
        # so both are neutralised (same rule as the HML/HWPX converters).
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows_text[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)
 def _detect_structure(text: str):
    """Classify a paragraph by its leading numbering or bullet glyph.

    Returns ``(kind, level, text)``: *kind* is ``'heading'``, ``'bullet'``
    or ``'paragraph'`` and *level* is the heading depth (0 when the line is
    not a heading).  The text comes back unchanged, except that a paragraph
    starting with ``※`` is prefixed with ``'> '``.
    """
    if not text:
        return 'paragraph', 0, text
    # Evaluated in order; the first match decides, so the deeper numbering
    # patterns must precede the shallower ones.
    rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in rules:
        if re.match(pattern, text):
            return kind, level, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
 def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
    """Convert an HWP file to Markdown through pyhwp's XHTML export.

    The document is transformed into a temporary directory, the exported
    ``bindata`` files are copied to ``<base_name>_images`` next to
    *output_path*, and the XHTML is walked paragraph by paragraph to emit
    headings, bullets, tables, images and plain text.  Header and footer
    areas are removed first.

    Returns True when *output_path* was written.  Returns False when
    pyhwp/bs4 cannot be imported, when the export produced no
    ``index.xhtml``, or on any other error.  Diagnostics go to stderr so
    they cannot corrupt the JSON that the CLI prints on stdout.  The
    temporary directory is always removed.
    """
    import sys
    try:
        from hwp5.hwp5html import HTMLTransform
        from hwp5.xmlmodel import Hwp5File
        from bs4 import BeautifulSoup
    except ImportError as e:
        print(f'  pyhwp/bs4 미설치: {e}', file=sys.stderr)
        return False
    tmp_dir = Path(tempfile.mkdtemp())
    try:
        f = Hwp5File(str(hwp_path))
        HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
        xhtml_path = tmp_dir / 'index.xhtml'
        if not xhtml_path.exists():
            return False
        images_dir = output_path.parent / f'{base_name}_images'
        images_dir.mkdir(exist_ok=True)
        img_map = {}
        bindata_dir = tmp_dir / 'bindata'
        if bindata_dir.exists():
            for img in bindata_dir.iterdir():
                shutil.copy(img, images_dir / img.name)
                img_map[img.name] = img.name
        soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
        for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
            area.decompose()
        md_lines = []
        img_counter = [0]
        for elem in soup.find_all(['p', 'table']):
            # Anything nested inside a table is rendered with that table.
            if elem.find_parent('table'):
                continue
            if elem.name == 'table':
                # Tables wrapped in a <p> are handled by the paragraph branch.
                if not elem.find_parent('p'):
                    md = _table_to_md(elem)
                    if md:
                        md_lines += [md, '']
            elif elem.name == 'p':
                for img in elem.find_all('img'):
                    fn = Path(img.get('src', '')).name
                    if fn in img_map:
                        img_counter[0] += 1
                        md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
                inner = elem.find('table')
                if inner:
                    md = _table_to_md(inner)
                    if md:
                        md_lines += [md, '']
                    continue
                text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
                if not text:
                    continue
                kind, level, fmt = _detect_structure(text)
                if kind == 'heading':
                    if md_lines and md_lines[-1] != '':
                        md_lines.append('')
                    md_lines += [f'{"#" * level} {fmt}', '']
                elif kind == 'bullet':
                    # Strip the source bullet glyph outside the f-string: a
                    # backslash inside an f-string expression is a SyntaxError
                    # before Python 3.12, and the previous raw pattern "\\s"
                    # matched a literal backslash, so the glyph was kept.
                    item = re.sub(r'^[▪▫\-]\s*', '', fmt)
                    md_lines.append(f'- {item}')
                else:
                    md_lines += [fmt, '']
        output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
        return True
    except Exception as e:
        print(f'  pyhwp 오류: {e}', file=sys.stderr)
        return False
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
 def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
    """Convert an HWP file to Markdown and return the result dict.

    Two strategies are tried in order: COM automation (HWP -> HML, then the
    HML converter), and the pyhwp export as fallback.  ``status`` is ``ok``
    as soon as one of them succeeds; otherwise it is ``error`` and
    ``error`` carries the message.

    The intermediate HML file lives in a private temporary directory, not
    in *output_dir*.  Writing it as ``<stem>.hml`` into the output folder
    and deleting it afterwards would overwrite and then remove a real HML
    document of the same name that happens to be stored there.
    """
    hwp_path = Path(hwp_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwp_path.stem}.md'
    result = {
        "status": "ok", "input": str(hwp_path),
        "output": str(md_path), "format": "hwp",
    }
    try:
        tmp_dir = Path(tempfile.mkdtemp())
        try:
            # Keep the stem: the HML converter derives the .md name and the
            # image folder name from it.
            hml_path = tmp_dir / f'{hwp_path.stem}.hml'
            if _com_hwp_to_hml(hwp_path, hml_path):
                try:
                    from converters.hml import convert_hml
                    r = convert_hml(hml_path, output_dir)
                    if r['status'] == 'ok':
                        return result
                except Exception:
                    # The COM route is best-effort; fall through to pyhwp.
                    pass
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result
        result['status'] = 'error'
        result['error'] = 'COM + pyhwp 모두 실패'
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
--- a/converters/hwpx.py
+++ b/converters/hwpx.py
@@ -0,0 +1,188 @@
 #!/usr/bin/env python3
 """HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
 from __future__ import annotations
 import re
 import zipfile
 import xml.etree.ElementTree as ET
 from pathlib import Path
 # Prefix -> namespace URI map passed to this module's ElementTree
 # find()/findall() calls ('hp:' = paragraph schema, 'hc:' = core schema).
 NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
 }
 def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
    """Copy every file stored under ``BinData/`` in the archive to *images_dir*.

    Only the base name of each archive member is used for the output path,
    so files are written flat into *images_dir*.  Directory entries are
    skipped.

    Returns a dict mapping each file's stem to its file name.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file = {}
    for info in zf.infolist():
        name = info.filename
        # A directory entry such as 'BinData/' must be rejected explicitly:
        # Path('BinData/').name is 'BinData', not '', so it would otherwise
        # be written out as an empty file and registered as an image.
        if info.is_dir() or not name.startswith('BinData/'):
            continue
        filename = Path(name).name
        if not filename:
            continue
        (images_dir / filename).write_bytes(zf.read(info))
        id_to_file[Path(filename).stem] = filename
    return id_to_file
 def _extract_text(p_elem) -> str:
    """Return the plain text of an HWPX paragraph element.

    For each direct ``hp:run`` child, the text of its ``hp:t`` children is
    collected; a run containing at least one ``hp:tab`` child adds a single
    space after that run's text.  The result is stripped of surrounding
    whitespace.
    """
    chunks = []
    for run in p_elem.findall('hp:run', NS):
        chunks.extend(t.text for t in run.findall('hp:t', NS) if t.text)
        if run.find('hp:tab', NS) is not None:
            chunks.append(' ')
    return ''.join(chunks).strip()
 def _detect_structure(text: str):
    """Classify a paragraph by its leading numbering or bullet glyph.

    The result is a ``(kind, level, text)`` triple.  *kind* is one of
    ``'heading'``, ``'bullet'`` and ``'paragraph'``; *level* is the heading
    depth and 0 for everything else.  The text is passed through as is,
    apart from a paragraph starting with ``※``, which gets a ``'> '`` prefix.
    """
    if not text:
        return 'paragraph', 0, text
    # Order matters: the first rule that matches is used, so multi-level
    # numbering is tested before single-level numbering.
    rules = (
        (r'^\d+\.\d+\.\d+\s', 'heading', 4),
        (r'^\d+\.\d+\s', 'heading', 3),
        (r'^\d+\.\s.+', 'heading', 2),
        (r'^[\d가-힣]+\)\s+.+', 'heading', 3),
        (r'^[□■]\s*.+', 'heading', 2),
        (r'^[○●◎]\s*.+', 'heading', 3),
        (r'^[▶▷]\s*.+', 'heading', 4),
        (r'^[▪▫\-]\s*.+', 'bullet', 0),
    )
    for pattern, kind, level in rules:
        if re.match(pattern, text):
            return kind, level, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text
 def _cell_text(tc_elem) -> str:
    """Return the text of a table cell, one ``<br>`` between paragraphs.

    Only paragraphs that are direct children of the cell's ``hp:subList``
    elements are read.  Paragraphs containing a nested table and paragraphs
    without text are left out.
    """
    texts = []
    for sub_list in tc_elem.findall('hp:subList', NS):
        for para in sub_list.findall('hp:p', NS):
            if para.find('.//hp:tbl', NS) is None:
                content = _extract_text(para)
                if content:
                    texts.append(content)
    return '<br>'.join(texts)
 def _get_span(tc_elem):
    """Return ``(col_span, row_span)`` for a table cell.

    The cell's own ``colSpan``/``rowSpan`` attributes are read first, each
    defaulting to 1.  When the cell has an ``hp:cellSpan`` child, that
    child's attributes take precedence, falling back to the values read
    from the cell itself.
    """
    col_span = int(tc_elem.get('colSpan', 1))
    row_span = int(tc_elem.get('rowSpan', 1))
    span_elem = tc_elem.find('hp:cellSpan', NS)
    if span_elem is None:
        return col_span, row_span
    col_span = int(span_elem.get('colSpan', col_span))
    row_span = int(span_elem.get('rowSpan', row_span))
    return col_span, row_span
 def _extract_table(tbl_elem) -> str:
    """Render an HWPX ``hp:tbl`` element as Markdown or HTML.

    Rows are the direct ``hp:tr`` children and cells their direct ``hp:tc``
    children.  A table with at least one merged cell (span greater than 1)
    is emitted as an HTML ``<table>`` so the spans survive, with the first
    row as header cells.  Otherwise a pipe table is produced, padding short
    rows to the widest one.  Returns an empty string when the table has no
    cells.
    """
    table_rows = []
    for tr in tbl_elem.findall('hp:tr', NS):
        row = []
        for tc in tr.findall('hp:tc', NS):
            col_span, row_span = _get_span(tc)
            row.append((col_span, row_span, _cell_text(tc)))
        if row:
            table_rows.append(row)
    if not table_rows:
        return ''

    if any(cs > 1 or rs > 1 for row in table_rows for cs, rs, _ in row):
        html = ['<table>']
        for row_no, row in enumerate(table_rows):
            cell_tag = 'td' if row_no else 'th'
            html.append('<tr>')
            for cs, rs, text in row:
                attrs = ''
                if cs > 1:
                    attrs += f' colspan="{cs}"'
                if rs > 1:
                    attrs += f' rowspan="{rs}"'
                html.append(f'<{cell_tag}{attrs}>{text}</{cell_tag}>')
            html.append('</tr>')
        html.append('</table>')
        return '\n'.join(html)

    grid = [[text for _, _, text in row] for row in table_rows]
    widest = max(map(len, grid))
    for cells in grid:
        cells.extend([''] * (widest - len(cells)))

    def fmt_row(cells):
        # Pipes would end the cell and newlines would end the row.
        escaped = (c.replace('|', '\\|').replace('\n', ' ') for c in cells)
        return '| ' + ' | '.join(escaped) + ' |'

    out = [fmt_row(grid[0]), '| ' + ' | '.join(['---'] * widest) + ' |']
    out.extend(fmt_row(cells) for cells in grid[1:])
    return '\n'.join(out)
 def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one HWPX paragraph into Markdown lines.

    The first match decides the output: a paragraph containing a table
    yields that table, one containing a picture yields an image link, and
    anything else is classified as a heading, bullet or plain paragraph.
    An empty paragraph yields an empty list.

    *pic_counter* is a one-element list used as a running picture index
    shared across calls.  *id_to_file* maps ``binaryItemIDRef`` values to
    extracted file names; a picture that cannot be resolved gets a
    placeholder reference.  *base_name* is the stem used for the
    ``<base_name>_images`` folder in image links.
    """
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []
    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            ref_id = img_elem.get('binaryItemIDRef', '')
            filename = id_to_file.get(ref_id, '')
            if filename:
                return [f'![그림 {idx+1}]({base_name}_images/{filename})']
        return [f'![그림 {idx+1}](그림_{idx+1}.png)']
    text = _extract_text(p_elem)
    if not text:
        return []
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    if kind == 'bullet':
        # Strip the source bullet glyph outside the f-string: a backslash
        # inside an f-string expression is a SyntaxError before Python 3.12,
        # and the previous raw pattern "\\s" matched a literal backslash, so
        # the glyph was never removed.
        item = re.sub(r'^[▪▫\-]\s*', '', fmt)
        return [f'- {item}']
    return [fmt]
 def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """Convert an HWPX file to Markdown and return the result dict.

    Writes ``<stem>.md`` into *output_dir* and extracts the archive's
    ``BinData`` files into ``<stem>_images``.  The ``Contents/sectionN.xml``
    members are processed in ascending numeric order.  The returned dict
    carries ``status``, ``input``, ``output``, ``format`` and ``images``;
    on failure ``status`` is ``error`` and ``error`` holds the message.
    """
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'
    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / f) for f in id_to_file.values()]
            # Order sections by their numeric index: a plain string sort
            # would put section10.xml before section2.xml.  fullmatch() also
            # rejects members that merely start with a section file name.
            numbered = []
            for name in zf.namelist():
                m = re.fullmatch(r'Contents/section(\d+)\.xml', name)
                if m:
                    numbered.append((int(m.group(1)), name))
            section_files = [name for _, name in sorted(numbered)]
            md_lines: list[str] = []
            pic_counter = [0]
            for sec_file in section_files:
                root = ET.fromstring(zf.read(sec_file))
                for p_elem in root.findall('hp:p', NS):
                    # NOTE(review): a paragraph holding section properties is
                    # skipped entirely - confirm it never carries body text.
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue
                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
                        if line.startswith('#') or line.startswith('<table') or line.startswith('|') or line.startswith('!['):
                            # Block-level items get a blank line on both sides.
                            if md_lines and md_lines[-1] != '':
                                md_lines.append('')
                            md_lines.append(line)
                            md_lines.append('')
                        elif line:
                            md_lines.append(line)
                            md_lines.append('')
        # Collapse runs of blank lines down to a single one.
        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
--- a/converters/pdf.py
+++ b/converters/pdf.py
@@ -0,0 +1,240 @@
 #!/usr/bin/env python3
 """
 PDF → Markdown 변환기 (페이지별 분류 + 라우팅)
 페이지 타입:
  text            - 텍스트 위주 → marker-pdf 추출
  text-with-photo - 텍스트 + 사진 → marker-pdf + 이미지 크롭
  diagram         - 다이어그램/도면 → 페이지 PNG 렌더링 (에이전트가 Vision으로 처리)
  image-heavy     - 텍스트 거의 없음 → 페이지 PNG 렌더링
 """
 from __future__ import annotations
 import io
 import re
 from pathlib import Path
 import fitz  # PyMuPDF
 from PIL import Image
 # ── 페이지 분류 ───────────────────────────────────────────────────────────────
 def _pix_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Build a PIL image from a PyMuPDF pixmap's raw sample buffer.

    The PIL mode is "RGBA" when the pixmap has an alpha channel and "RGB"
    otherwise.
    NOTE(review): pixmaps in other colorspaces (e.g. gray, CMYK) are not
    converted here — confirm callers only pass RGB(A) data.
    """
    size = (pix.width, pix.height)
    if pix.alpha:
        return Image.frombytes("RGBA", size, pix.samples)
    return Image.frombytes("RGB", size, pix.samples)
 def _is_diagram_image(img: Image.Image) -> bool:
    """Heuristically decide whether a raster image is a diagram.

    An image counts as a diagram when, on a 200x200 thumbnail, it uses fewer
    than 32 distinct colors after 64-color quantization and more than 35% of
    its pixels are near-white (gray level above 240).

    Images narrower or shorter than 100 px (logos, icons) are never diagrams.
    """
    if min(img.width, img.height) < 100:
        return False

    thumb = img.resize((200, 200), Image.LANCZOS).convert("RGB")

    # Distinct palette entries actually used after quantizing to 64 colors.
    used_colors = set(thumb.quantize(colors=64).getdata())

    # Share of near-white pixels on the grayscale thumbnail.
    luminance = list(thumb.convert("L").getdata())
    bright = [level for level in luminance if level > 240]
    white_ratio = len(bright) / len(luminance)

    return len(used_colors) < 32 and white_ratio > 0.35
 def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
    """Classify a PDF page for routing.

    Args:
        page: The page to inspect.
        doc: The document owning ``page``; only used to load embedded raster
            images when the page is not text-dominated.

    Returns:
        One of ``'text'``, ``'text-with-photo'``, ``'diagram'`` or
        ``'image-heavy'``.
    """
    text_len = len(page.get_text().strip())
    page_area = page.rect.width * page.rect.height
    images = page.get_images(full=True)

    # 1) Enough text (characters per 10,000 units of page area): text family.
    text_density = text_len / page_area * 10_000
    if text_density > 4:
        # Small images (logos etc.) do not turn a text page into a photo page.
        # Each get_images() entry already carries width and height at index
        # 2 and 3, so the image data does not need to be extracted.
        has_large_image = any(img[2] > 150 and img[3] > 150 for img in images)
        return "text-with-photo" if has_large_image else "text"

    # 2) Many vector drawings (flowcharts, CAD exports): diagram.
    #    Computed only here because text pages return above.
    drawing_density = len(page.get_drawings()) / page_area * 10_000
    if drawing_density > 1.5:
        return "diagram"

    # 3) Raster images present: check whether one of them looks like a diagram.
    if images:
        for img_info in images[:3]:  # inspect at most three images (speed)
            try:
                pix = fitz.Pixmap(doc, img_info[0])
                if pix.colorspace and pix.colorspace.n > 1:
                    if _is_diagram_image(_pix_to_pil(pix)):
                        return "diagram"
            except Exception:
                # Best effort: an image that cannot be loaded or analysed is
                # skipped and must not abort the classification.
                continue
        return "text-with-photo" if text_len > 50 else "image-heavy"

    # 4) Hardly any text and no images.
    return "image-heavy" if not text_len else "text"
 # ── 페이지 PNG 렌더링 ─────────────────────────────────────────────────────────
 def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
    """Render ``page`` to a PNG file at ``output_path``.

    ``scale`` is applied to both axes of the render matrix; the default of
    2.0 doubles the resolution in each direction.
    """
    zoom = fitz.Matrix(scale, scale)
    page.get_pixmap(matrix=zoom).save(str(output_path))
 # ── 메인 변환 함수 ────────────────────────────────────────────────────────────
 def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
    """Convert a PDF to Markdown and return the AGENT_GUIDE.md-spec result dict.

    Steps: classify every page, extract text with marker-pdf (falling back to
    plain PyMuPDF text when marker-pdf is not installed), render diagram and
    image-heavy pages to PNG, then write ``<output_dir>/<stem>.md``.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory receiving the Markdown file and the
            ``<stem>_images`` folder (created if missing).

    Returns:
        dict with the keys
          status        : "ok" | "error"
          input         : str
          output        : str (path of the Markdown file)
          format        : "pdf"
          pages         : list of {n, type, image?}
          has_diagrams  : bool
          diagram_pages : list[int]
          images        : list[str]
          error?        : str (only when status is "error")
        Exceptions are not propagated; they are reported through ``status``
        and ``error`` and their traceback is printed.
    """
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    images_dir = output_dir / f"{pdf_path.stem}_images"
    md_path = output_dir / f"{pdf_path.stem}.md"
    result: dict = {
        "status": "ok",
        "input": str(pdf_path),
        "output": str(md_path),
        "format": "pdf",
        "pages": [],
        "has_diagrams": False,
        "diagram_pages": [],
        "images": [],
    }
    try:
        text_sections: list[str] = []
        # The context manager closes the document even when a later step raises.
        with fitz.open(str(pdf_path)) as doc:
            # ── 1) Classify every page ─────────────────────────────────────
            for page in doc:
                result["pages"].append(
                    {"n": page.number + 1, "type": classify_page(page, doc)}
                )
            diagram_page_nums = [
                entry["n"] for entry in result["pages"]
                if entry["type"] in ("diagram", "image-heavy")
            ]
            result["has_diagrams"] = bool(diagram_page_nums)
            result["diagram_pages"] = diagram_page_nums

            # ── 2) Text extraction (marker-pdf) ────────────────────────────
            try:
                from marker.converters.pdf import PdfConverter
                from marker.models import create_model_dict
                from marker.output import text_from_rendered
                converter = PdfConverter(artifact_dict=create_model_dict())
                rendered = converter(str(pdf_path))
                full_text, _, marker_images = text_from_rendered(rendered)
                # Save the images marker extracted.
                if marker_images:
                    images_dir.mkdir(exist_ok=True)
                    for img_name, img_data in marker_images.items():
                        try:
                            img_dest = images_dir / img_name
                            if isinstance(img_data, Image.Image):
                                img_data.save(str(img_dest))
                            elif isinstance(img_data, bytes) and img_data:
                                img_dest.write_bytes(img_data)
                            else:
                                # Nothing was written, so nothing to report.
                                continue
                        except Exception:
                            # Best effort: one unsaveable image must not
                            # abort the whole conversion.
                            continue
                        # Only list files that were actually written.
                        result["images"].append(str(img_dest))
                # Prefix relative image links with the images folder. A
                # function replacement keeps backslashes in the file stem
                # from being read as template escapes.
                image_prefix = f"{pdf_path.stem}_images/"
                full_text = re.sub(
                    r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
                    lambda m: f"![{m.group(1)}]({image_prefix}{m.group(2)})",
                    full_text,
                )
                text_sections.append(full_text)
            except ImportError:
                # marker-pdf is not installed: fall back to PyMuPDF text.
                pages_text = [
                    page_text
                    for page_text in (page.get_text().strip() for page in doc)
                    if page_text
                ]
                text_sections.append("\n\n---\n\n".join(pages_text))

            # ── 3) Render diagram pages to PNG ─────────────────────────────
            if diagram_page_nums:
                images_dir.mkdir(exist_ok=True)
                diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]
                for page_num in diagram_page_nums:
                    img_name = f"page_{page_num}.png"
                    img_path = images_dir / img_name
                    _render_page_png(doc[page_num - 1], img_path)
                    result["images"].append(str(img_path))
                    diagram_section_lines.append(
                        f"\n### Page {page_num}\n"
                        f"![Page {page_num} — 다이어그램]"
                        f"({pdf_path.stem}_images/{img_name})\n"
                    )
                    # Pages were appended in document order, so the entry for
                    # page_num sits at index page_num - 1.
                    result["pages"][page_num - 1]["image"] = str(img_path)
                text_sections.append("".join(diagram_section_lines))

        # ── 4) Write the Markdown file ─────────────────────────────────────
        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
        md_path.write_text(final_md, encoding="utf-8")
    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        import traceback
        traceback.print_exc()
    return result
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,26 @@
 # doc2md 필수 패키지
 # pip install -r requirements.txt
 # PDF 변환 (텍스트/이미지 혼합)
 marker-pdf>=1.0.0
 # PDF 페이지 분석 + 렌더링
 PyMuPDF>=1.23.0
 # 이미지 처리 (다이어그램 감지)
 Pillow>=10.0.0
 # XML 파싱 (HML, HWPX) — 표준 라이브러리 포함
 lxml>=4.9.0
 # HTML 파싱 (HWP pyhwp fallback)
 beautifulsoup4>=4.12.0
 # HTML → MD
 html2text>=2020.1.16
 # HWP 변환 fallback (한컴오피스 미설치 환경)
 pyhwp>=0.1.0b19
 # Windows 전용: HWP COM 자동화 (한컴오피스 설치 시 자동 사용)
 # pywin32>=306