From 2ec2759a20c5c77894457981e2617168226434fd Mon Sep 17 00:00:00 2001 From: minsung Date: Mon, 20 Apr 2026 09:06:34 +0900 Subject: [PATCH] feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - convert.py: 통합 CLI, --json 출력, --scan 폴더 모드 - converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링 - converters/hwp.py: COM 자동화 + pyhwp fallback - converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출 - converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표 - converters/html.py: html2text (body_width=0) - requirements.txt: 최소 의존성 - .env.example: 환경변수 템플릿 Co-Authored-By: Claude Sonnet 4.6 --- .claude/hooks/token-usage/.gitignore | 1 + .claude/settings.json | 43 +++++ .env.example | 5 + .usage/token/.gitignore | 2 + convert.py | 136 +++++++++++++++ converters/__init__.py | 0 converters/hml.py | 188 +++++++++++++++++++++ converters/html.py | 31 ++++ converters/hwp.py | 212 +++++++++++++++++++++++ converters/hwpx.py | 188 +++++++++++++++++++++ converters/pdf.py | 240 +++++++++++++++++++++++++++ requirements.txt | 26 +++ 12 files changed, 1072 insertions(+) create mode 100644 .claude/hooks/token-usage/.gitignore create mode 100644 .claude/settings.json create mode 100644 .env.example create mode 100644 .usage/token/.gitignore create mode 100644 convert.py create mode 100644 converters/__init__.py create mode 100644 converters/hml.py create mode 100644 converters/html.py create mode 100644 converters/hwp.py create mode 100644 converters/hwpx.py create mode 100644 converters/pdf.py create mode 100644 requirements.txt diff --git a/.claude/hooks/token-usage/.gitignore b/.claude/hooks/token-usage/.gitignore new file mode 100644 index 0000000..bb68236 --- /dev/null +++ b/.claude/hooks/token-usage/.gitignore @@ -0,0 +1 @@ +aptabase.json diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..a8953d8 --- /dev/null +++ 
#!/usr/bin/env python3
"""
doc2md — unified document converter (for AI agents).

Usage:
    python convert.py <file> -o <output_dir> [--json]
    python convert.py --scan <dir> -o <output_dir> [--json]

See AGENT_GUIDE.md for the result-dict specification.
"""

import argparse
import json
import sys
from pathlib import Path

# Extensions this tool can convert (compared lower-case).
SUPPORTED = {'.pdf', '.hwp', '.hwpx', '.hml', '.html', '.htm'}
# Well-known markdown docs that must never be treated as conversion targets.
SKIP_NAMES = {'README.md', 'CLAUDE.md', 'AGENT_GUIDE.md'}


def convert_file(src: Path, output_dir: Path) -> dict:
    """Convert a single file; return an AGENT_GUIDE-spec result dict.

    Converter modules are imported lazily so that one missing optional
    dependency (e.g. PyMuPDF) only disables the formats that need it.
    """
    ext = src.suffix.lower()
    try:
        if ext == '.pdf':
            from converters.pdf import convert_pdf
            return convert_pdf(src, output_dir)
        if ext == '.hwp':
            from converters.hwp import convert_hwp
            return convert_hwp(src, output_dir)
        if ext == '.hwpx':
            from converters.hwpx import convert_hwpx
            return convert_hwpx(src, output_dir)
        if ext == '.hml':
            from converters.hml import convert_hml
            return convert_hml(src, output_dir)
        if ext in {'.html', '.htm'}:
            from converters.html import convert_html
            return convert_html(src, output_dir)
        return {"status": "skipped", "input": str(src), "reason": "unsupported_format"}
    except Exception as e:
        return {"status": "error", "input": str(src), "error": str(e)}


def scan_and_convert(scan_dir: Path, output_dir: Path) -> dict:
    """Recursively scan *scan_dir* and convert every supported file.

    Matching is done on the lower-cased suffix so e.g. ``.PDF`` is found
    too (the previous per-extension ``rglob('*.pdf')`` loop silently
    missed upper-case names).  Returns a summary dict with per-file
    results; status is "ok", "partial" (some failures) or "error".
    """
    targets = sorted(
        p for p in scan_dir.rglob('*')
        if p.is_file() and p.suffix.lower() in SUPPORTED
    )

    results = []
    ok = fail = skipped = 0

    for src in targets:
        if src.name in SKIP_NAMES:
            continue

        # Skip when a sibling .md already exists next to the source file.
        if src.with_suffix('.md').exists():
            results.append({"input": str(src), "output": None,
                            "status": "skipped", "reason": "already_md"})
            skipped += 1
            continue

        # Mirror the source sub-tree under the output directory.
        out_dir = output_dir / src.parent.relative_to(scan_dir)
        r = convert_file(src, out_dir)
        results.append(r)
        if r['status'] == 'ok':
            ok += 1
        elif r['status'] == 'error':
            fail += 1
        else:
            skipped += 1

    return {
        "status": "ok" if fail == 0 else ("error" if ok == 0 else "partial"),
        "total": len(results),
        "converted": ok,
        "skipped": skipped,
        "failed": fail,
        "results": results,
    }


def main():
    """CLI entry point.  Exit codes: 0 ok, 1 partial/usage, 2 error."""
    parser = argparse.ArgumentParser(
        description='doc2md — AI 에이전트용 문서 변환기',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='자세한 사용법: AGENT_GUIDE.md'
    )
    parser.add_argument('file', nargs='?', help='변환할 파일')
    parser.add_argument('-o', '--output', required=True, help='출력 폴더')
    parser.add_argument('--scan', metavar='DIR', help='폴더 일괄 변환 모드')
    parser.add_argument('--json', action='store_true', help='결과를 JSON으로 출력')
    args = parser.parse_args()

    output_dir = Path(args.output)

    if args.scan:
        result = scan_and_convert(Path(args.scan), output_dir)
        exit_code = 0 if result['status'] == 'ok' else (1 if result['status'] == 'partial' else 2)
    elif args.file:
        src = Path(args.file)
        if not src.exists():
            err = {"status": "error", "input": str(src), "error": "파일 없음"}
            if args.json:
                print(json.dumps(err, ensure_ascii=False))
            else:
                print(f"오류: 파일 없음 — {src}", file=sys.stderr)
            sys.exit(2)
        result = convert_file(src, output_dir)
        exit_code = 0 if result['status'] == 'ok' else 2
    else:
        parser.print_help()
        sys.exit(1)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        # Human-readable output (when called without --json).
        status = result.get('status', '')
        if 'results' in result:
            print(f"[doc2md] {result['converted']}개 변환 / {result['skipped']}개 스킵 / {result['failed']}개 실패")
        else:
            output = result.get('output', '')
            print(f"[doc2md] {status.upper()} — {output or result.get('error', '')}")
        if result.get('has_diagrams'):
            pages = result.get('diagram_pages', [])
            print(f"[doc2md] 다이어그램 페이지: {pages} → Vision AI 처리 필요")

    sys.exit(exit_code)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""HML → Markdown converter (direct XML parsing, Base64 image extraction)."""

import base64
import re
import xml.etree.ElementTree as ET
from pathlib import Path


def _extract_images(tree, images_dir: Path) -> tuple:
    """Decode every <BINDATA> blob into a file under *images_dir*.

    Returns (id_to_file, bin_order):
      id_to_file — BINDATA Id → written filename
      bin_order  — BinItem ids of <PICTURE> elements in document order,
                   used to map the i-th picture back to its payload.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    # BINITEM carries the image format (PNG/JPG/…) for each BinData id.
    bin_format = {
        item.get('BinData'): item.get('Format', 'PNG').lower()
        for item in tree.findall('.//BINITEM') if item.get('BinData')
    }
    id_to_file = {}
    for bindata in tree.findall('.//BINDATA'):
        bid = bindata.get('Id')
        raw = (bindata.text or '').strip()
        if not bid or not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
            continue
        fmt = bin_format.get(bid, 'png')
        try:
            # int(bid) inside the try: a non-numeric Id must not abort the
            # whole conversion (it previously raised outside the handler).
            filename = f'BIN{int(bid):04d}.{fmt}'
            (images_dir / filename).write_bytes(base64.b64decode(raw))
            id_to_file[bid] = filename
        except Exception:
            pass  # best effort: skip undecodable blobs

    body = tree.find('.//BODY')
    bin_order = []
    if body is not None:
        for pic in body.findall('.//PICTURE'):
            imgs = pic.findall('.//IMAGE')
            bin_order.append(imgs[0].get('BinItem') if imgs else None)
    return id_to_file, bin_order


def _extract_text(p_elem) -> str:
    """Plain text of a <P>: CHAR contents concatenated, TAB → space."""
    parts = []
    for t in p_elem.findall('TEXT'):
        for child in t:
            if child.tag == 'CHAR' and child.text:
                parts.append(child.text)
            elif child.tag == 'TAB':
                parts.append(' ')
    return ''.join(parts).strip()


def _detect_structure(text: str):
    """Classify a paragraph by Korean document numbering/marker conventions.

    Returns ('heading', level, text) | ('bullet', 0, text) |
    ('paragraph', 0, text).  ※-notes become blockquotes.
    """
    if not text:
        return 'paragraph', 0, text
    if re.match(r'^\d+\.\d+\.\d+\s', text):
        return 'heading', 4, text
    if re.match(r'^\d+\.\d+\s', text):
        return 'heading', 3, text
    if re.match(r'^\d+\.\s.+', text):
        return 'heading', 2, text
    if re.match(r'^[\d가-힣]+\)\s+.+', text):
        return 'heading', 3, text
    if re.match(r'^[□■]\s*.+', text):
        return 'heading', 2, text
    if re.match(r'^[○●◎]\s*.+', text):
        return 'heading', 3, text
    if re.match(r'^[▶▷]\s*.+', text):
        return 'heading', 4, text
    if re.match(r'^[▪▫\-]\s*.+', text):
        return 'bullet', 0, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text


def _extract_table(table_elem) -> str:
    """Render a <TABLE> as a markdown pipe table, or as an HTML <table>
    when any cell is merged (markdown cannot express colspan/rowspan)."""
    col_count = int(table_elem.get('ColCount', 0))
    has_merge = False
    raw_rows = []
    for ri, row_elem in enumerate(table_elem.findall('.//ROW')):
        cells = []
        for cell_elem in row_elem.findall('CELL'):
            cs = int(cell_elem.get('ColSpan', 1))
            rs = int(cell_elem.get('RowSpan', 1))
            ca = int(cell_elem.get('ColAddr', 0))
            if cs > 1 or rs > 1:
                has_merge = True
            parts = [_extract_text(p) for p in cell_elem.findall('.//P')]
            cells.append((ca, cs, rs, '\n'.join(p for p in parts if p)))
        if cells:
            raw_rows.append((ri, cells))
    if not raw_rows:
        return ''

    if has_merge:
        # HTML table: the only portable way to keep merged cells.
        lines = ['<table>']
        for ri, (_, cells) in enumerate(raw_rows):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'  # first row is the header
            for _, cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    # Markdown table: place cells on a grid keyed by ColAddr.
    rows = []
    for _, cells in raw_rows:
        grid = {ca: text for ca, _, _, text in cells}
        n = col_count if col_count > 0 else (max(grid) + 1)
        rows.append([grid.get(i, '') for i in range(n)])
    mc = max(len(r) for r in rows)
    for r in rows:
        r += [''] * (mc - len(r))

    def esc(s):
        # escape pipes and keep each cell on one physical line
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)


def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list:
    """Convert one <P> into markdown lines (tables, pictures, or text)."""
    lines = []
    has_content = False
    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'TABLE':
                has_content = True
                md = _extract_table(child)
                if md:
                    lines.append(md)
            elif child.tag == 'PICTURE':
                has_content = True
                idx = pic_counter[0]
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
                # Reference the extracted file (previously a literal
                # "(unknown)" path was emitted even when known).
                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
                lines.append(f'![그림 {idx+1}]({ref})')
    if not has_content:
        text = _extract_text(p_elem)
        if text:
            kind, level, fmt = _detect_structure(text)
            if kind == 'heading':
                lines.append(f'{"#" * level} {fmt}')
            elif kind == 'bullet':
                # single-backslash \s: the old "\\s*" matched a literal
                # backslash and never stripped the bullet marker
                stripped = re.sub(r'^[▪▫-]\s*', '', fmt)
                lines.append(f'- {stripped}')
            else:
                lines.append(fmt)
    return lines


def convert_hml(hml_path: Path, output_dir: Path) -> dict:
    """HML → MD.  Returns an AGENT_GUIDE-spec result dict."""
    hml_path = Path(hml_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hml_path.stem}.md'
    images_dir = output_dir / f'{hml_path.stem}_images'

    result = {
        "status": "ok", "input": str(hml_path),
        "output": str(md_path), "format": "hml", "images": [],
    }
    try:
        # utf-8-sig also transparently decodes plain UTF-8 (no BOM).
        tree = ET.fromstring(hml_path.read_text(encoding='utf-8-sig'))
        id_to_file, bin_order = _extract_images(tree, images_dir)
        result["images"] = [str(images_dir / f) for f in id_to_file.values()]

        title_elem = tree.find('.//TITLE')
        doc_title = title_elem.text.strip() if (title_elem is not None and title_elem.text) else hml_path.stem
        md_lines = [f'# {doc_title}', '']

        body = tree.find('.//BODY')
        if body is None:
            result['status'] = 'error'
            result['error'] = 'BODY 요소 없음'
            return result

        pic_counter = [0]
        for section in body.findall('.//SECTION'):
            for p_elem in section.findall('P'):
                for line in _process_p(p_elem, pic_counter, bin_order, id_to_file, hml_path.stem):
                    if line.startswith('#'):
                        # blank line before and after headings
                        if md_lines and md_lines[-1] != '':
                            md_lines.append('')
                        md_lines += [line, '']
                    elif line.startswith('|') or line.startswith('<table'):
                        md_lines += [line, '']
                    else:
                        md_lines.append(line)

        md = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip() + '\n'
        md_path.write_text(md, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
#!/usr/bin/env python3
"""HTML → Markdown converter (html2text based)."""

from pathlib import Path


def convert_html(html_path: Path, output_dir: Path) -> dict:
    """HTML → MD.  Returns an AGENT_GUIDE-spec result dict."""
    html_path = Path(html_path)
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f'{html_path.stem}.md'

    result = {
        "status": "ok", "input": str(html_path),
        "output": str(target), "format": "html",
    }
    try:
        import html2text
        converter = html2text.HTML2Text()
        converter.body_width = 0          # disable hard line wrapping
        converter.ignore_links = False
        converter.ignore_images = False
        raw = html_path.read_text(encoding='utf-8', errors='ignore')
        target.write_text(converter.handle(raw), encoding='utf-8')
    except Exception as exc:
        result['status'] = 'error'
        result['error'] = str(exc)
    return result
#!/usr/bin/env python3
"""HWP → Markdown (Hancom COM automation first, pyhwp fallback)."""

import re
import shutil
import tempfile
from pathlib import Path


def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
    """Convert HWP → HML through the Hancom Office COM server (Windows only).

    The COM call runs in a daemon thread so a hung office process cannot
    block the caller for more than *timeout* seconds.  Returns True only
    when the HML file was actually written.
    """
    import threading
    result = [False]

    def _run():
        try:
            import pythoncom, win32com.client
        except ImportError:
            return  # not Windows / pywin32 missing — caller falls back to pyhwp
        hwp = None
        try:
            pythoncom.CoInitialize()
            hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
            try:
                # Suppress the security confirmation dialog when registered.
                hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
            except Exception:
                pass
            ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
            if not ok:
                return
            hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
            result[0] = hml_path.exists()
        except Exception as e:
            print(f'  COM 오류: {e}')
        finally:
            if hwp:
                try:
                    hwp.Quit()
                except Exception:
                    pass
            try:
                pythoncom.CoUninitialize()
            except Exception:
                pass

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        print(f'  COM 타임아웃 ({timeout}초) — pyhwp로 전환')
    return result[0]


def _table_to_md(table_elem) -> str:
    """Render a BeautifulSoup <table> as markdown, or as an HTML <table>
    when any cell is merged (colspan/rowspan)."""
    rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
    if not rows:
        return ''
    has_merge = False
    parsed = []
    for tr in rows:
        cells = []
        for td in tr.find_all(['td', 'th']):
            cs = int(td.get('colspan', 1))
            rs = int(td.get('rowspan', 1))
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, td.get_text(separator='\n', strip=True)))
        if cells:
            parsed.append(cells)
    if not parsed:
        return ''

    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(parsed):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'  # first row is the header
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows_text = [[text for _, _, text in cells] for cells in parsed]
    mc = max(len(r) for r in rows_text)
    for r in rows_text:
        r += [''] * (mc - len(r))

    def esc(s):
        # escape pipes, keep each cell on one physical line
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows_text[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)


def _detect_structure(text: str):
    """Classify a paragraph by Korean document numbering/marker conventions.

    Returns ('heading', level, text) | ('bullet', 0, text) |
    ('paragraph', 0, text).  ※-notes become blockquotes.
    """
    if not text:
        return 'paragraph', 0, text
    if re.match(r'^\d+\.\d+\.\d+\s', text):
        return 'heading', 4, text
    if re.match(r'^\d+\.\d+\s', text):
        return 'heading', 3, text
    if re.match(r'^\d+\.\s.+', text):
        return 'heading', 2, text
    if re.match(r'^[\d가-힣]+\)\s+.+', text):
        return 'heading', 3, text
    if re.match(r'^[□■]\s*.+', text):
        return 'heading', 2, text
    if re.match(r'^[○●◎]\s*.+', text):
        return 'heading', 3, text
    if re.match(r'^[▶▷]\s*.+', text):
        return 'heading', 4, text
    if re.match(r'^[▪▫\-]\s*.+', text):
        return 'bullet', 0, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text


def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
    """Pure-python fallback: hwp5 → XHTML → markdown.  True on success."""
    try:
        from hwp5.hwp5html import HTMLTransform
        from hwp5.xmlmodel import Hwp5File
        from bs4 import BeautifulSoup
    except ImportError as e:
        print(f'  pyhwp/bs4 미설치: {e}')
        return False

    tmp_dir = Path(tempfile.mkdtemp())
    try:
        f = Hwp5File(str(hwp_path))
        HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
        xhtml_path = tmp_dir / 'index.xhtml'
        if not xhtml_path.exists():
            return False

        # Copy embedded binaries (images) next to the output file.
        images_dir = output_path.parent / f'{base_name}_images'
        images_dir.mkdir(exist_ok=True)
        img_map = {}
        bindata_dir = tmp_dir / 'bindata'
        if bindata_dir.exists():
            for img in bindata_dir.iterdir():
                shutil.copy(img, images_dir / img.name)
                img_map[img.name] = img.name

        soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
        # Page headers/footers only add noise to the markdown — drop them.
        for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
            area.decompose()

        md_lines = []
        img_counter = [0]
        for elem in soup.find_all(['p', 'table']):
            if elem.find_parent('table'):
                continue  # nested content is emitted by _table_to_md
            if elem.name == 'table':
                if not elem.find_parent('p'):
                    md = _table_to_md(elem)
                    if md:
                        md_lines += [md, '']
            elif elem.name == 'p':
                for img in elem.find_all('img'):
                    fn = Path(img.get('src', '')).name
                    if fn in img_map:
                        img_counter[0] += 1
                        md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
                inner = elem.find('table')
                if inner:
                    md = _table_to_md(inner)
                    if md:
                        md_lines += [md, '']
                    continue
                text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
                if not text:
                    continue
                kind, level, fmt = _detect_structure(text)
                if kind == 'heading':
                    if md_lines and md_lines[-1] != '':
                        md_lines.append('')
                    md_lines += [f'{"#" * level} {fmt}', '']
                elif kind == 'bullet':
                    # single-backslash \s — the old "\\s*" never matched
                    stripped = re.sub(r'^[▪▫-]\s*', '', fmt)
                    md_lines.append(f'- {stripped}')
                else:
                    md_lines += [fmt, '']

        output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
        return True
    except Exception as e:
        print(f'  pyhwp 오류: {e}')
        return False
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)


def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
    """HWP → MD.  Tries COM automation first, then the pyhwp fallback.
    Returns an AGENT_GUIDE-spec result dict."""
    hwp_path = Path(hwp_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwp_path.stem}.md'

    result = {
        "status": "ok", "input": str(hwp_path),
        "output": str(md_path), "format": "hwp",
    }
    try:
        # 1) COM route: HWP → HML, then reuse the HML converter.
        hml_path = md_path.with_suffix('.hml')
        if _com_hwp_to_hml(hwp_path, hml_path):
            try:
                from converters.hml import convert_hml
                r = convert_hml(hml_path, output_dir)
                hml_path.unlink(missing_ok=True)
                if r['status'] == 'ok':
                    # propagate extracted-image info from the HML pass
                    result['images'] = r.get('images', [])
                    return result
            except Exception:
                hml_path.unlink(missing_ok=True)

        # 2) pyhwp fallback (works without Hancom Office installed).
        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result

        result['status'] = 'error'
        result['error'] = 'COM + pyhwp 모두 실패'
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
#!/usr/bin/env python3
"""HWPX → Markdown (direct ZIP+XML parsing, with image extraction)."""

import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path

# OWPML namespaces used by HWPX section documents.
NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}


def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
    """Copy every BinData/ member out of the archive.

    Returns {binary-item id (member stem) → written filename}.
    """
    images_dir.mkdir(parents=True, exist_ok=True)
    id_to_file = {}
    for name in zf.namelist():
        if not name.startswith('BinData/'):
            continue
        filename = Path(name).name
        if not filename:
            continue  # directory entry
        (images_dir / filename).write_bytes(zf.read(name))
        id_to_file[Path(filename).stem] = filename
    return id_to_file


def _extract_text(p_elem) -> str:
    """Concatenate the text of all runs of a paragraph (tabs → spaces)."""
    parts = []
    for run in p_elem.findall('hp:run', NS):
        for t in run.findall('hp:t', NS):
            if t.text:
                parts.append(t.text)
        if run.findall('hp:tab', NS):
            parts.append(' ')
    return ''.join(parts).strip()


def _detect_structure(text: str):
    """Classify a paragraph by Korean document numbering/marker conventions.

    Returns ('heading', level, text) | ('bullet', 0, text) |
    ('paragraph', 0, text).  ※-notes become blockquotes.
    """
    if not text:
        return 'paragraph', 0, text
    if re.match(r'^\d+\.\d+\.\d+\s', text):
        return 'heading', 4, text
    if re.match(r'^\d+\.\d+\s', text):
        return 'heading', 3, text
    if re.match(r'^\d+\.\s.+', text):
        return 'heading', 2, text
    if re.match(r'^[\d가-힣]+\)\s+.+', text):
        return 'heading', 3, text
    if re.match(r'^[□■]\s*.+', text):
        return 'heading', 2, text
    if re.match(r'^[○●◎]\s*.+', text):
        return 'heading', 3, text
    if re.match(r'^[▶▷]\s*.+', text):
        return 'heading', 4, text
    if re.match(r'^[▪▫\-]\s*.+', text):
        return 'bullet', 0, text
    if re.match(r'^[※]', text):
        return 'paragraph', 0, f'> {text}'
    return 'paragraph', 0, text


def _cell_text(tc_elem) -> str:
    """Text of one table cell; nested tables are skipped."""
    parts = []
    for sub in tc_elem.findall('hp:subList', NS):
        for p in sub.findall('hp:p', NS):
            if p.find('.//hp:tbl', NS) is not None:
                continue
            t = _extract_text(p)
            if t:
                parts.append(t)
    return '\n'.join(parts)


def _get_span(tc_elem):
    """(colspan, rowspan) — from <tc> attributes or a nested <cellSpan>."""
    cs = int(tc_elem.get('colSpan', 1))
    rs = int(tc_elem.get('rowSpan', 1))
    span = tc_elem.find('hp:cellSpan', NS)
    if span is not None:
        cs = int(span.get('colSpan', cs))
        rs = int(span.get('rowSpan', rs))
    return cs, rs


def _extract_table(tbl_elem) -> str:
    """Render an <hp:tbl> as a markdown pipe table, or as an HTML <table>
    when any cell is merged (markdown cannot express colspan/rowspan)."""
    has_merge = False
    raw_rows = []
    for tr in tbl_elem.findall('hp:tr', NS):
        cells = []
        for tc in tr.findall('hp:tc', NS):
            cs, rs = _get_span(tc)
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, _cell_text(tc)))
        if cells:
            raw_rows.append(cells)
    if not raw_rows:
        return ''

    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(raw_rows):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'  # first row is the header
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows = [[t for _, _, t in cells] for cells in raw_rows]
    mc = max(len(r) for r in rows)
    for r in rows:
        r += [''] * (mc - len(r))

    def esc(s):
        # escape pipes and keep each cell on one physical line
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)


def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list:
    """Convert one <hp:p> into markdown lines (table, picture, or text)."""
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            ref_id = img_elem.get('binaryItemIDRef', '')
            filename = id_to_file.get(ref_id, '')
            if filename:
                # point at the extracted file (the old code emitted a
                # literal "(unknown)" path even when the file was known)
                return [f'![그림 {idx+1}]({base_name}_images/{filename})']
        return [f'![그림 {idx+1}](그림_{idx+1}.png)']

    text = _extract_text(p_elem)
    if not text:
        return []
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    if kind == 'bullet':
        # single-backslash \s — the old "\\s*" matched a literal backslash
        stripped = re.sub(r'^[▪▫-]\s*', '', fmt)
        return [f'- {stripped}']
    return [fmt]


def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """HWPX → MD.  Returns an AGENT_GUIDE-spec result dict."""
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'

    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / f) for f in id_to_file.values()]

            section_files = sorted(
                n for n in zf.namelist()
                if re.match(r'Contents/section\d+\.xml', n)
            )
            md_lines = []
            pic_counter = [0]
            for sec_file in section_files:
                root = ET.fromstring(zf.read(sec_file))
                for p_elem in root.findall('hp:p', NS):
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue  # section-properties paragraph, no content
                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
                        # blank line after block-level constructs
                        if line.startswith('#') or line.startswith('<table') or line.startswith('|'):
                            md_lines += [line, '']
                        else:
                            md_lines.append(line)

        md = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip() + '\n'
        md_path.write_text(md, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
#!/usr/bin/env python3
"""PDF → Markdown.

Classifies each page (text / diagram / mixed), extracts text with
marker-pdf when available (PyMuPDF text extraction otherwise), and
renders diagram-like pages to PNG so a Vision model can process them.
"""

import re
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


# ── page classification ─────────────────────────────────────────────────────

def _pix_to_pil(pix: "fitz.Pixmap") -> Image.Image:
    """PyMuPDF Pixmap → PIL Image.

    NOTE(review): assumes RGB/RGBA sample layout; a CMYK pixmap would need
    a colorspace conversion first — callers guard with try/except.
    """
    mode = "RGBA" if pix.alpha else "RGB"
    return Image.frombytes(mode, (pix.width, pix.height), pix.samples)


def _is_diagram_image(img: Image.Image) -> bool:
    """Heuristic diagram test: limited palette + mostly-white background."""
    # Tiny images (logos, icons) are never diagrams.
    if img.width < 100 or img.height < 100:
        return False

    # Distinct colors after quantizing a 200x200 thumbnail to 64 colors.
    small = img.resize((200, 200), Image.LANCZOS).convert("RGB")
    quantized = small.quantize(colors=64)
    color_count = len(set(quantized.getdata()))

    # Fraction of near-white pixels (diagram background).
    gray = small.convert("L")
    pixels = list(gray.getdata())
    white_ratio = sum(1 for p in pixels if p > 240) / len(pixels)

    return color_count < 32 and white_ratio > 0.35


def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
    """Classify a page.

    Returns 'text' | 'text-with-photo' | 'diagram' | 'image-heavy'.
    """
    text = page.get_text().strip()
    text_len = len(text)
    page_area = page.rect.width * page.rect.height

    drawings = page.get_drawings()
    images = page.get_images(full=True)

    text_density = text_len / page_area * 10_000      # chars per unit area
    # Vector-drawing density (flowcharts/CAD exports have hundreds).
    drawing_density = len(drawings) / page_area * 10_000

    # 1) Enough text → text family.
    if text_density > 4:
        if not images:
            return "text"
        # Small embedded images (logos etc.) do not change the class.
        # extract_image is called once per image — the old code called it
        # twice (width and height separately) and let failures propagate.
        has_large_image = False
        for img in images:
            try:
                meta = doc.extract_image(img[0])
            except Exception:
                continue
            if meta["width"] > 150 and meta["height"] > 150:
                has_large_image = True
                break
        return "text-with-photo" if has_large_image else "text"

    # 2) Many vector drawings → diagram.
    if drawing_density > 1.5:
        return "diagram"

    # 3) Raster images present: sample up to three (for speed) and test
    #    whether any looks like a diagram.
    if images:
        for img_info in images[:3]:
            try:
                pix = fitz.Pixmap(doc, img_info[0])
                if pix.colorspace and pix.colorspace.n > 1:
                    if _is_diagram_image(_pix_to_pil(pix)):
                        return "diagram"
            except Exception:
                pass
        return "text-with-photo" if text_len > 50 else "image-heavy"

    # 4) Almost no text and no images.
    return "image-heavy" if not text_len else "text"


# ── page PNG rendering ──────────────────────────────────────────────────────

def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
    """Render the page to a high-resolution PNG (scale 2.0 ≈ 144 dpi)."""
    mat = fitz.Matrix(scale, scale)
    pix = page.get_pixmap(matrix=mat)
    pix.save(str(output_path))


# ── main conversion entry point ─────────────────────────────────────────────

def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
    """PDF → MD conversion.  Returns the AGENT_GUIDE.md JSON structure:

        status        : "ok" | "error"
        input         : str
        output        : str (md file path)
        format        : "pdf"
        pages         : list of {n, type, image?}
        has_diagrams  : bool
        diagram_pages : list[int]
        images        : list[str]
        error?        : str
    """
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    images_dir = output_dir / f"{pdf_path.stem}_images"
    md_path = output_dir / f"{pdf_path.stem}.md"

    result: dict = {
        "status": "ok",
        "input": str(pdf_path),
        "output": str(md_path),
        "format": "pdf",
        "pages": [],
        "has_diagrams": False,
        "diagram_pages": [],
        "images": [],
    }

    try:
        doc = fitz.open(str(pdf_path))

        # ── 1) classify each page ───────────────────────────────────────
        page_types: list = []
        for page in doc:
            ptype = classify_page(page, doc)
            page_types.append(ptype)
            result["pages"].append({"n": page.number + 1, "type": ptype})

        diagram_page_nums = [
            i + 1 for i, t in enumerate(page_types)
            if t in ("diagram", "image-heavy")
        ]
        result["has_diagrams"] = bool(diagram_page_nums)
        result["diagram_pages"] = diagram_page_nums

        # ── 2) text extraction (marker-pdf preferred) ───────────────────
        text_sections: list = []
        try:
            from marker.converters.pdf import PdfConverter
            from marker.models import create_model_dict
            from marker.output import text_from_rendered

            converter = PdfConverter(artifact_dict=create_model_dict())
            rendered = converter(str(pdf_path))
            full_text, _, marker_images = text_from_rendered(rendered)

            # Save images extracted by marker.
            if marker_images:
                images_dir.mkdir(exist_ok=True)
                for img_name, img_data in marker_images.items():
                    try:
                        img_dest = images_dir / img_name
                        if isinstance(img_data, Image.Image):
                            img_data.save(str(img_dest))
                        elif isinstance(img_data, bytes) and img_data:
                            img_dest.write_bytes(img_data)
                        result["images"].append(str(img_dest))
                    except Exception:
                        pass  # best effort: a bad image must not fail the run

            # Rewrite relative image links to the images directory.
            full_text = re.sub(
                r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
                rf'![\1]({pdf_path.stem}_images/\2)',
                full_text,
            )
            text_sections.append(full_text)

        except ImportError:
            # marker-pdf absent → plain PyMuPDF text extraction.
            pages_text = []
            for page in doc:
                t = page.get_text().strip()
                if t:
                    pages_text.append(t)
            text_sections.append("\n\n---\n\n".join(pages_text))

        # ── 3) render diagram pages to PNG ──────────────────────────────
        if diagram_page_nums:
            images_dir.mkdir(exist_ok=True)
            diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]

            for page_num in diagram_page_nums:
                page = doc[page_num - 1]
                img_name = f"page_{page_num}.png"
                img_path = images_dir / img_name
                _render_page_png(page, img_path)
                result["images"].append(str(img_path))
                diagram_section_lines.append(
                    f"\n### Page {page_num}\n"
                    f"![Page {page_num} — 다이어그램]"
                    f"({pdf_path.stem}_images/{img_name})\n"
                )
                # Record the rendered image on the matching pages entry.
                for p in result["pages"]:
                    if p["n"] == page_num:
                        p["image"] = str(img_path)

            text_sections.append("".join(diagram_section_lines))

        doc.close()

        # ── 4) write the markdown file ──────────────────────────────────
        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
        md_path.write_text(final_md, encoding="utf-8")

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        import traceback
        traceback.print_exc()

    return result