feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)

- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드
- converters/pdf.py: 페이지별 분류(text/text-with-photo/diagram/image-heavy) + marker-pdf + PNG 렌더링
- converters/hwp.py: COM 자동화 + pyhwp fallback
- converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출
- converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표
- converters/html.py: html2text (body_width=0)
- requirements.txt: 최소 의존성
- .env.example: 환경변수 템플릿

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
minsung
2026-04-20 09:06:34 +09:00
parent 6f365018f5
commit 2ec2759a20
12 changed files with 1072 additions and 0 deletions

1
.claude/hooks/token-usage/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
aptabase.json

43
.claude/settings.json Normal file
View File

@@ -0,0 +1,43 @@
{
"hooks": {
"UserPromptSubmit": [
{
"hooks": [
{
"type": "command",
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" session-context \"$t\";rm -f \"$t\"",
"timeout": 5
}
]
}
],
"Stop": [
{
"hooks": [
{
"type": "command",
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" stop-record \"$t\";rm -f \"$t\"",
"timeout": 5
}
]
}
],
"PostToolUse": [
{
"matcher": "Bash",
"hooks": [
{
"type": "command",
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" aptabase-commit \"$t\";rm -f \"$t\"",
"timeout": 15
}
]
}
]
},
"permissions": {
"allow": [
"mcp__gitea__issue_write"
]
}
}

5
.env.example Normal file
View File

@@ -0,0 +1,5 @@
# doc2md 환경변수 예시
# 이 파일을 .env로 복사 후 값 수정
# ParaWiki 등 외부 프로젝트에서 이 도구를 subprocess로 호출할 때 사용
# DOCU_CONVERTER_PATH=D:\MYCLAUDE_PROJECT\doc2md

2
.usage/token/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
job-commit-pool.json
job-send-pool.json

136
convert.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
doc2md — 통합 문서 변환기 (AI 에이전트용)
사용법: python convert.py <file> -o <output_dir> [--json]
python convert.py --scan <dir> -o <output_dir> [--json]
자세한 사용법: AGENT_GUIDE.md 참조
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
SUPPORTED = {'.pdf', '.hwp', '.hwpx', '.hml', '.html', '.htm'}
SKIP_NAMES = {'README.md', 'CLAUDE.md', 'AGENT_GUIDE.md'}
def convert_file(src: Path, output_dir: Path) -> dict:
    """Convert a single file and return the AGENT_GUIDE-style result dict.

    The converter is chosen from the lower-cased file suffix. Converter
    modules are imported lazily so that a missing optional dependency only
    affects the format that needs it. Any exception (including a failed
    import) is reported as an ``"error"`` result instead of propagating.
    """
    suffix = src.suffix.lower()
    try:
        if suffix == '.pdf':
            from converters.pdf import convert_pdf as run
        elif suffix == '.hwp':
            from converters.hwp import convert_hwp as run
        elif suffix == '.hwpx':
            from converters.hwpx import convert_hwpx as run
        elif suffix == '.hml':
            from converters.hml import convert_hml as run
        elif suffix in {'.html', '.htm'}:
            from converters.html import convert_html as run
        else:
            return {"status": "skipped", "input": str(src), "reason": "unsupported_format"}
        return run(src, output_dir)
    except Exception as exc:
        return {"status": "error", "input": str(src), "error": str(exc)}
def scan_and_convert(scan_dir: Path, output_dir: Path) -> dict:
    """Recursively convert every supported document under ``scan_dir``.

    Files are matched by lower-cased suffix so that ``REPORT.PDF`` is picked
    up on case-sensitive file systems as well (``convert_file`` already
    lower-cases the suffix when dispatching). Only regular files are
    considered. A source file that already has a sibling ``.md`` next to it
    is reported as skipped with reason ``"already_md"``.

    Output is written to ``output_dir`` mirroring the directory layout found
    below ``scan_dir``.

    Returns a summary dict with ``status`` (``"ok"`` when nothing failed,
    ``"error"`` when nothing succeeded and something failed, otherwise
    ``"partial"``), the counters ``total`` / ``converted`` / ``skipped`` /
    ``failed`` and the per-file ``results`` list.
    """
    targets = sorted(
        path for path in scan_dir.rglob('*')
        if path.is_file() and path.suffix.lower() in SUPPORTED
    )

    results = []
    ok = fail = skipped = 0
    for src in targets:
        if src.name in SKIP_NAMES:
            continue
        # Skip sources that already have a Markdown file beside them.
        if src.with_suffix('.md').exists():
            results.append({"input": str(src), "output": None,
                            "status": "skipped", "reason": "already_md"})
            skipped += 1
            continue
        out_dir = output_dir / src.parent.relative_to(scan_dir)
        r = convert_file(src, out_dir)
        results.append(r)
        if r['status'] == 'ok':
            ok += 1
        elif r['status'] == 'error':
            fail += 1
        else:
            skipped += 1

    return {
        "status": "ok" if fail == 0 else ("error" if ok == 0 else "partial"),
        "total": len(results),
        "converted": ok,
        "skipped": skipped,
        "failed": fail,
        "results": results,
    }
def main():
    """Command-line entry point.

    Modes:
      * ``--scan DIR``: batch-convert a folder; exit code 0 (all ok),
        1 (partial) or 2 (error).
      * ``<file>``: convert one file; exit code 0 on success, otherwise 2.
      * neither: print help and exit with code 1.

    With ``--json`` the result dict is printed as JSON; otherwise a short
    human-readable summary is printed.
    """
    parser = argparse.ArgumentParser(
        description='doc2md — AI 에이전트용 문서 변환기',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='자세한 사용법: AGENT_GUIDE.md'
    )
    parser.add_argument('file', nargs='?', help='변환할 파일')
    parser.add_argument('-o', '--output', required=True, help='출력 폴더')
    parser.add_argument('--scan', metavar='DIR', help='폴더 일괄 변환 모드')
    parser.add_argument('--json', action='store_true', help='결과를 JSON으로 출력')
    args = parser.parse_args()

    output_dir = Path(args.output)

    if args.scan:
        result = scan_and_convert(Path(args.scan), output_dir)
        exit_code = 0 if result['status'] == 'ok' else (1 if result['status'] == 'partial' else 2)
    elif args.file:
        src = Path(args.file)
        if not src.exists():
            err = {"status": "error", "input": str(src), "error": "파일 없음"}
            if args.json:
                print(json.dumps(err, ensure_ascii=False))
            else:
                print(f"오류: 파일 없음 — {src}", file=sys.stderr)
            sys.exit(2)
        result = convert_file(src, output_dir)
        exit_code = 0 if result['status'] == 'ok' else 2
    else:
        parser.print_help()
        sys.exit(1)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        # Human-readable output (used when called without --json).
        status = result.get('status', '')
        if 'results' in result:
            print(f"[doc2md] {result['converted']}개 변환 / {result['skipped']}개 스킵 / {result['failed']}개 실패")
        else:
            # Converters pre-fill "output" even when they fail, so the detail
            # must be chosen by status; otherwise the error text is hidden
            # behind the (unwritten) output path.
            if status == 'ok':
                detail = result.get('output', '')
            else:
                detail = (result.get('error') or result.get('reason')
                          or result.get('output', ''))
            print(f"[doc2md] {status.upper()} {detail}")
        if result.get('has_diagrams'):
            pages = result.get('diagram_pages', [])
            print(f"[doc2md] 다이어그램 페이지: {pages} → Vision AI 처리 필요")

    sys.exit(exit_code)


if __name__ == '__main__':
    main()

0
converters/__init__.py Normal file
View File

188
converters/hml.py Normal file
View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
from __future__ import annotations
import base64
import re
import xml.etree.ElementTree as ET
from pathlib import Path
def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
images_dir.mkdir(parents=True, exist_ok=True)
bin_format = {
item.get('BinData'): item.get('Format', 'PNG').lower()
for item in tree.findall('.//BINITEM') if item.get('BinData')
}
id_to_file = {}
for bindata in tree.findall('.//BINDATA'):
bid = bindata.get('Id')
raw = (bindata.text or '').strip()
if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
continue
fmt = bin_format.get(bid, 'png')
filename = f'BIN{int(bid):04d}.{fmt}'
try:
(images_dir / filename).write_bytes(base64.b64decode(raw))
id_to_file[bid] = filename
except Exception:
pass
body = tree.find('.//BODY')
bin_order = []
if body is not None:
for pic in body.findall('.//PICTURE'):
imgs = pic.findall('.//IMAGE')
bin_order.append(imgs[0].get('BinItem') if imgs else None)
return id_to_file, bin_order
def _extract_text(p_elem) -> str:
parts = []
for t in p_elem.findall('TEXT'):
for child in t:
if child.tag == 'CHAR' and child.text:
parts.append(child.text)
elif child.tag == 'TAB':
parts.append(' ')
return ''.join(parts).strip()
def _detect_structure(text: str):
if not text: return 'paragraph', 0, text
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
return 'paragraph', 0, text
def _extract_table(table_elem) -> str:
    """Render an HML ``TABLE`` element as Markdown or HTML.

    Tables without merged cells become a pipe table whose first row is the
    header. As soon as any cell has ``ColSpan`` or ``RowSpan`` greater than 1
    the table is emitted as raw ``<table>`` HTML with ``colspan``/``rowspan``
    attributes, because pipe tables cannot express merges.

    Cell text is the text of every ``P`` inside the cell joined with
    ``<br>``. Returns ``''`` when the table has no cells.
    """
    col_count = int(table_elem.get('ColCount', 0))
    has_merge = False
    raw_rows = []

    for row_elem in table_elem.findall('.//ROW'):
        cells = []
        for pos, cell_elem in enumerate(row_elem.findall('CELL')):
            cs = int(cell_elem.get('ColSpan', 1))
            rs = int(cell_elem.get('RowSpan', 1))
            # Fall back to the cell's position when ColAddr is absent;
            # defaulting to 0 would make all cells of a row overwrite
            # each other in the grid built below.
            ca = int(cell_elem.get('ColAddr', pos))
            if cs > 1 or rs > 1:
                has_merge = True
            parts = [_extract_text(p) for p in cell_elem.findall('.//P')]
            cells.append((ca, cs, rs, '<br>'.join(p for p in parts if p)))
        if cells:
            raw_rows.append(cells)

    if not raw_rows:
        return ''

    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(raw_rows):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for _, cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows = []
    for cells in raw_rows:
        grid = {ca: text for ca, _, _, text in cells}
        # Never make the row narrower than its right-most cell, even when
        # ColCount under-reports the width; otherwise cells would be dropped.
        n = max(col_count, max(grid) + 1)
        rows.append([grid.get(i, '') for i in range(n)])
    mc = max(len(r) for r in rows)
    for r in rows:
        r += [''] * (mc - len(r))

    def esc(s):
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)
def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one HML ``P`` element into Markdown lines.

    Args:
        p_elem: The paragraph element.
        pic_counter: One-element list used as a mutable running picture
            index shared across calls.
        bin_order: BinItem ids in document picture order.
        id_to_file: Mapping of BinItem id to the extracted image file name.
        base_name: Document stem; images are referenced under
            ``{base_name}_images/``.

    Tables and pictures found directly under the paragraph's ``TEXT``
    children are emitted first. Only when the paragraph holds neither is its
    text classified and emitted (text sharing a paragraph with a table or
    picture is not emitted).
    """
    lines = []
    has_content = False

    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'TABLE':
                has_content = True
                md = _extract_table(child)
                if md:
                    lines.append(md)
            elif child.tag == 'PICTURE':
                has_content = True
                idx = pic_counter[0]
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx+1}.png'
                lines.append(f'![그림 {idx+1}]({ref})')

    if not has_content:
        text = _extract_text(p_elem)
        if text:
            kind, level, fmt = _detect_structure(text)
            if kind == 'heading':
                lines.append(f'{"#" * level} {fmt}')
            elif kind == 'bullet':
                # Strip the source bullet marker outside the f-string: a
                # backslash inside an f-string expression is a SyntaxError
                # before Python 3.12, and the previous pattern (``\\s`` in a
                # raw string) required a literal backslash, so it never
                # removed the marker.
                item = re.sub(r'^[▪▫\-]\s*', '', fmt)
                lines.append(f'- {item}')
            else:
                lines.append(fmt)
    return lines
def convert_hml(hml_path: Path, output_dir: Path) -> dict:
    """Convert an HML file to Markdown and return the AGENT_GUIDE-style dict.

    The Markdown file is written to ``output_dir/<stem>.md`` and embedded
    images to ``output_dir/<stem>_images``. The result always carries
    ``status``, ``input``, ``output``, ``format`` and ``images``; on failure
    ``status`` is ``"error"`` and ``error`` holds the message.
    """
    hml_path = Path(hml_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = hml_path.stem
    md_path = output_dir / f'{stem}.md'
    images_dir = output_dir / f'{stem}_images'

    result = {
        "status": "ok", "input": str(hml_path),
        "output": str(md_path), "format": "hml", "images": [],
    }

    try:
        tree = ET.fromstring(hml_path.read_text(encoding='utf-8-sig'))
        id_to_file, bin_order = _extract_images(tree, images_dir)
        result["images"] = [str(images_dir / name) for name in id_to_file.values()]

        # Prefer the document's TITLE; fall back to the file stem.
        title_elem = tree.find('.//TITLE')
        if title_elem is not None and title_elem.text:
            doc_title = title_elem.text.strip()
        else:
            doc_title = stem
        md_lines = [f'# {doc_title}', '']

        body = tree.find('.//BODY')
        if body is None:
            result['status'] = 'error'
            result['error'] = 'BODY 요소 없음'
            return result

        pic_counter = [0]
        for section in body.findall('.//SECTION'):
            for p_elem in section.findall('P'):
                produced = _process_p(p_elem, pic_counter, bin_order, id_to_file, stem)
                for line in produced:
                    if not line:
                        continue
                    # Headings get a blank line in front unless one is there.
                    if line.startswith('#') and md_lines and md_lines[-1] != '':
                        md_lines.append('')
                    md_lines.extend((line, ''))

        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as exc:
        result['status'] = 'error'
        result['error'] = str(exc)

    return result

31
converters/html.py Normal file
View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""HTML / HTM → Markdown (html2text, body_width=0)"""
from __future__ import annotations
from pathlib import Path
def _read_html_text(html_path: Path) -> str:
    """Decode an HTML file, trying UTF-8 (BOM-aware) and then CP949."""
    raw = html_path.read_bytes()
    for encoding in ('utf-8-sig', 'cp949'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Last resort mirrors the previous behaviour: drop undecodable bytes.
    return raw.decode('utf-8', errors='ignore')


def convert_html(html_path: Path, output_dir: Path) -> dict:
    """Convert an HTML/HTM file to Markdown with html2text.

    Lines are not wrapped (``body_width = 0``) and links and images are
    kept. The input is decoded as UTF-8 when valid, otherwise as CP949, so
    legacy Korean pages are not silently emptied by dropping every
    undecodable byte.

    Returns the AGENT_GUIDE-style dict (``status``, ``input``, ``output``,
    ``format``); on failure ``status`` is ``"error"`` and ``error`` holds the
    message (this includes html2text not being installed).
    """
    html_path = Path(html_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{html_path.stem}.md'

    result = {
        "status": "ok", "input": str(html_path),
        "output": str(md_path), "format": "html",
    }

    try:
        import html2text
        h = html2text.HTML2Text()
        h.body_width = 0
        h.ignore_links = False
        h.ignore_images = False
        md = h.handle(_read_html_text(html_path))
        md_path.write_text(md, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)

    return result

212
converters/hwp.py Normal file
View File

@@ -0,0 +1,212 @@
#!/usr/bin/env python3
"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
from __future__ import annotations
import re
import shutil
import tempfile
from pathlib import Path
def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
import threading
result = [False]
def _run():
try:
import pythoncom, win32com.client
except ImportError:
return
hwp = None
try:
pythoncom.CoInitialize()
hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
try:
hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
except Exception:
pass
ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
if not ok:
return
hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
result[0] = hml_path.exists()
except Exception as e:
print(f' COM 오류: {e}')
finally:
if hwp:
try: hwp.Quit()
except Exception: pass
try: pythoncom.CoUninitialize()
except Exception: pass
t = threading.Thread(target=_run, daemon=True)
t.start()
t.join(timeout)
if t.is_alive():
print(f' COM 타임아웃 ({timeout}초) — pyhwp로 전환')
return result[0]
def _table_to_md(table_elem) -> str:
    """Render a BeautifulSoup ``table`` element as Markdown or HTML.

    Tables without merged cells become a pipe table whose first row is the
    header; tables with any ``colspan``/``rowspan`` greater than 1 are
    emitted as raw ``<table>`` HTML. Returns ``''`` for a table without
    cells.

    Only rows and cells belonging to this table are used. A nested table's
    text still appears inside its parent cell (via ``get_text``), but its
    rows and cells are not repeated as rows/cells of the outer table.
    """
    # Covers both direct <tr> children and rows wrapped in <tbody>, while
    # excluding rows that belong to a table nested in one of the cells.
    rows = [
        tr for tr in table_elem.find_all('tr')
        if tr.find_parent('table') is table_elem
    ]
    if not rows:
        return ''

    has_merge = False
    parsed = []
    for tr in rows:
        cells = []
        for td in tr.find_all(['td', 'th'], recursive=False):
            cs = int(td.get('colspan', 1))
            rs = int(td.get('rowspan', 1))
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
        if cells:
            parsed.append(cells)

    if not parsed:
        return ''

    if has_merge:
        lines = ['<table>']
        for ri, cells in enumerate(parsed):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)

    rows_text = [[text for _, _, text in cells] for cells in parsed]
    mc = max(len(r) for r in rows_text)
    for r in rows_text:
        r += [''] * (mc - len(r))

    def esc(s):
        # A newline inside a cell would end the pipe-table row, so it is
        # flattened exactly as the HML/HWPX table writers do.
        return s.replace('|', '\\|').replace('\n', ' ')

    lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
             '| ' + ' | '.join(['---'] * mc) + ' |']
    for row in rows_text[1:]:
        lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(lines)
def _detect_structure(text: str):
if not text: return 'paragraph', 0, text
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
return 'paragraph', 0, text
def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
try:
from hwp5.hwp5html import HTMLTransform
from hwp5.xmlmodel import Hwp5File
from bs4 import BeautifulSoup
except ImportError as e:
print(f' pyhwp/bs4 미설치: {e}')
return False
tmp_dir = Path(tempfile.mkdtemp())
try:
f = Hwp5File(str(hwp_path))
HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
xhtml_path = tmp_dir / 'index.xhtml'
if not xhtml_path.exists():
return False
images_dir = output_path.parent / f'{base_name}_images'
images_dir.mkdir(exist_ok=True)
img_map = {}
bindata_dir = tmp_dir / 'bindata'
if bindata_dir.exists():
for img in bindata_dir.iterdir():
shutil.copy(img, images_dir / img.name)
img_map[img.name] = img.name
soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
area.decompose()
md_lines = []
img_counter = [0]
for elem in soup.find_all(['p', 'table']):
if elem.find_parent('table'):
continue
if elem.name == 'table':
if not elem.find_parent('p'):
md = _table_to_md(elem)
if md:
md_lines += [md, '']
elif elem.name == 'p':
for img in elem.find_all('img'):
fn = Path(img.get('src', '')).name
if fn in img_map:
img_counter[0] += 1
md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
inner = elem.find('table')
if inner:
md = _table_to_md(inner)
if md:
md_lines += [md, '']
continue
text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
if not text:
continue
kind, level, fmt = _detect_structure(text)
if kind == 'heading':
if md_lines and md_lines[-1] != '':
md_lines.append('')
md_lines += [f'{"#" * level} {fmt}', '']
elif kind == 'bullet':
md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}')
else:
md_lines += [fmt, '']
output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
return True
except Exception as e:
print(f' pyhwp 오류: {e}')
return False
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)
def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
    """Convert an HWP file to Markdown and return the AGENT_GUIDE-style dict.

    Strategy: first try Hancom COM automation (HWP -> temporary HML ->
    Markdown via ``convert_hml``); if that is unavailable or fails, fall
    back to pyhwp. When the HML route succeeds, the extracted image paths it
    reports are passed through as ``images``.

    On failure ``status`` is ``"error"`` and ``error`` holds the message.
    """
    hwp_path = Path(hwp_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwp_path.stem}.md'

    result = {
        "status": "ok", "input": str(hwp_path),
        "output": str(md_path), "format": "hwp",
    }

    try:
        hml_path = md_path.with_suffix('.hml')
        if _com_hwp_to_hml(hwp_path, hml_path):
            try:
                from converters.hml import convert_hml
                r = convert_hml(hml_path, output_dir)
                if r['status'] == 'ok':
                    # Keep the image list produced by the HML conversion
                    # instead of silently dropping it.
                    result['images'] = r.get('images', [])
                    return result
            except Exception:
                # Deliberate best effort: fall through to the pyhwp path.
                pass
            finally:
                # The intermediate HML is never part of the output.
                hml_path.unlink(missing_ok=True)

        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result

        result['status'] = 'error'
        result['error'] = 'COM + pyhwp 모두 실패'
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)

    return result

188
converters/hwpx.py Normal file
View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
from __future__ import annotations
import re
import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path
NS = {
'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}
def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
images_dir.mkdir(parents=True, exist_ok=True)
id_to_file = {}
for name in zf.namelist():
if not name.startswith('BinData/'):
continue
filename = Path(name).name
if not filename:
continue
out_path = images_dir / filename
out_path.write_bytes(zf.read(name))
id_to_file[Path(filename).stem] = filename
return id_to_file
def _extract_text(p_elem) -> str:
    """Return the visible text of an ``hp:p`` paragraph.

    Text is collected from every ``hp:t`` of the paragraph's direct
    ``hp:run`` children. ``hp:t`` is read as mixed content: besides its
    leading text, the tail text after each inline child is collected, so
    text following an inline element is not lost. An inline ``hp:tab`` or
    ``hp:lineBreak`` contributes one space (element names as assumed from
    the OWPML schema — TODO confirm against real documents). A run that
    directly contains ``hp:tab`` also contributes one trailing space.
    """
    hp = NS['hp']
    spacing_tags = {f'{{{hp}}}tab', f'{{{hp}}}lineBreak'}

    parts = []
    for run in p_elem.findall('hp:run', NS):
        for t in run.findall('hp:t', NS):
            if t.text:
                parts.append(t.text)
            for child in t:
                if child.tag in spacing_tags:
                    parts.append(' ')
                if child.tail:
                    parts.append(child.tail)
        if run.findall('hp:tab', NS):
            parts.append(' ')
    return ''.join(parts).strip()
def _detect_structure(text: str):
if not text: return 'paragraph', 0, text
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
return 'paragraph', 0, text
def _cell_text(tc_elem) -> str:
    """Return the text of a table cell, paragraphs joined with ``<br>``.

    Only paragraphs that are direct children of the cell's ``hp:subList``
    elements are read; paragraphs containing a nested table and paragraphs
    without text are left out.
    """
    texts = (
        _extract_text(p)
        for sub in tc_elem.findall('hp:subList', NS)
        for p in sub.findall('hp:p', NS)
        if p.find('.//hp:tbl', NS) is None
    )
    return '<br>'.join(t for t in texts if t)
def _get_span(tc_elem):
    """Return ``(col_span, row_span)`` for a table cell.

    The cell's own ``colSpan``/``rowSpan`` attributes (default 1) are the
    baseline; values on a child ``hp:cellSpan`` element, when present,
    take precedence.
    """
    col_span = int(tc_elem.get('colSpan', 1))
    row_span = int(tc_elem.get('rowSpan', 1))
    span_elem = tc_elem.find('hp:cellSpan', NS)
    if span_elem is None:
        return col_span, row_span
    col_span = int(span_elem.get('colSpan', col_span))
    row_span = int(span_elem.get('rowSpan', row_span))
    return col_span, row_span
def _extract_table(tbl_elem) -> str:
    """Render an ``hp:tbl`` element as Markdown or HTML.

    A table without merged cells becomes a pipe table (first row is the
    header, short rows are padded with empty cells). If any cell spans more
    than one column or row the table is emitted as raw ``<table>`` HTML with
    ``colspan``/``rowspan`` attributes. Returns ``''`` for a table without
    cells.
    """
    table = []
    merged = False
    for tr in tbl_elem.findall('hp:tr', NS):
        row = []
        for tc in tr.findall('hp:tc', NS):
            col_span, row_span = _get_span(tc)
            if col_span > 1 or row_span > 1:
                merged = True
            row.append((col_span, row_span, _cell_text(tc)))
        if row:
            table.append(row)

    if not table:
        return ''

    if merged:
        html = ['<table>']
        for row_idx, row in enumerate(table):
            tag = 'td' if row_idx else 'th'
            html.append('<tr>')
            for col_span, row_span, text in row:
                attrs = ''
                if col_span > 1:
                    attrs += f' colspan="{col_span}"'
                if row_span > 1:
                    attrs += f' rowspan="{row_span}"'
                html.append(f'<{tag}{attrs}>{text}</{tag}>')
            html.append('</tr>')
        html.append('</table>')
        return '\n'.join(html)

    width = max(len(row) for row in table)
    grid = [
        [text for _, _, text in row] + [''] * (width - len(row))
        for row in table
    ]

    def render(cells):
        escaped = (c.replace('|', '\\|').replace('\n', ' ') for c in cells)
        return '| ' + ' | '.join(escaped) + ' |'

    out = [render(grid[0]), '| ' + ' | '.join(['---'] * width) + ' |']
    out.extend(render(cells) for cells in grid[1:])
    return '\n'.join(out)
def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one ``hp:p`` paragraph into Markdown lines.

    Args:
        p_elem: The paragraph element.
        pic_counter: One-element list used as a mutable running picture
            index shared across calls.
        id_to_file: Mapping of binary item id to extracted image file name.
        base_name: Document stem; images are referenced under
            ``{base_name}_images/``.

    Precedence: a paragraph containing a table yields only the table; else
    a paragraph containing a picture yields only the image reference (a
    ``그림_N.png`` placeholder when the image cannot be resolved); otherwise
    the text is classified as heading, bullet or plain paragraph.
    """
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            ref_id = img_elem.get('binaryItemIDRef', '')
            filename = id_to_file.get(ref_id, '')
            if filename:
                return [f'![그림 {idx+1}]({base_name}_images/{filename})']
        return [f'![그림 {idx+1}](그림_{idx+1}.png)']

    text = _extract_text(p_elem)
    if not text:
        return []

    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    if kind == 'bullet':
        # Done outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12, and the previous
        # pattern (``\\s`` in a raw string) required a literal backslash,
        # so the source bullet marker was never removed.
        item = re.sub(r'^[▪▫\-]\s*', '', fmt)
        return [f'- {item}']
    return [fmt]
def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """Convert an HWPX file to Markdown and return the AGENT_GUIDE-style dict.

    The archive's ``BinData/`` files are extracted to
    ``output_dir/<stem>_images`` and every ``Contents/section<N>.xml`` is
    converted in ascending numeric order of ``N``. A plain string sort would
    place ``section10.xml`` before ``section2.xml`` and scramble long
    documents.

    On failure ``status`` is ``"error"`` and ``error`` holds the message.
    """
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'

    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }

    try:
        md_lines: list[str] = []
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / f) for f in id_to_file.values()]

            section_files = sorted(
                (n for n in zf.namelist()
                 if re.fullmatch(r'Contents/section\d+\.xml', n)),
                key=lambda n: int(re.search(r'\d+', n).group()),
            )

            pic_counter = [0]
            for sec_file in section_files:
                root = ET.fromstring(zf.read(sec_file))
                for p_elem in root.findall('hp:p', NS):
                    # Paragraphs carrying section properties are skipped.
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue
                    for line in _process_para(p_elem, pic_counter, id_to_file, hwpx_path.stem):
                        if not line:
                            continue
                        # Every emitted block is followed by a blank line;
                        # add one in front only if it is missing.
                        if md_lines and md_lines[-1] != '':
                            md_lines.append('')
                        md_lines.append(line)
                        md_lines.append('')

        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)

    return result

240
converters/pdf.py Normal file
View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
PDF → Markdown 변환기 (페이지별 분류 + 라우팅)
페이지 타입:
text - 텍스트 위주 → marker-pdf 추출
text-with-photo - 텍스트 + 사진 → marker-pdf + 이미지 크롭
diagram - 다이어그램/도면 → 페이지 PNG 렌더링 (에이전트가 Vision으로 처리)
image-heavy - 텍스트 거의 없음 → 페이지 PNG 렌더링
"""
from __future__ import annotations
import io
import re
from pathlib import Path
import fitz # PyMuPDF
from PIL import Image
# ── 페이지 분류 ───────────────────────────────────────────────────────────────
def _pix_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Convert a PyMuPDF ``Pixmap`` to a PIL ``Image``.

    ``Image.frombytes`` below is only given the modes ``RGB``/``RGBA``, so
    the sample buffer must hold exactly three colour components per pixel.
    Pixmaps in another colourspace (e.g. grayscale or CMYK) are converted to
    RGB first; otherwise their buffer would be misread or rejected.
    """
    if pix.colorspace is not None and pix.colorspace.n != 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    mode = "RGBA" if pix.alpha else "RGB"
    return Image.frombytes(mode, (pix.width, pix.height), pix.samples)
def _is_diagram_image(img: Image.Image) -> bool:
    """Heuristically decide whether a raster image is a diagram.

    The image is reduced to 200x200 and two measures are taken: how many of
    64 quantised colours are actually used, and the share of near-white
    pixels (gray value above 240). An image counts as a diagram when it uses
    fewer than 32 colours and more than 35% of it is near-white. Images
    smaller than 100 px in either dimension (logos, icons) are never
    diagrams.
    """
    if img.width < 100 or img.height < 100:
        return False

    thumb = img.resize((200, 200), Image.LANCZOS).convert("RGB")

    # Number of distinct palette entries in use after quantising to 64.
    used_colors = len(thumb.quantize(colors=64).getcolors())

    # Share of pixels brighter than 240, read from the gray histogram.
    histogram = thumb.convert("L").histogram()
    white_share = sum(histogram[241:]) / sum(histogram)

    return used_colors < 32 and white_share > 0.35
def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
    """Classify a PDF page.

    Returns one of ``'text'``, ``'text-with-photo'``, ``'diagram'`` or
    ``'image-heavy'``.

    Decision order:
      1. Enough text (more than 4 characters per 10,000 units of page
         area): ``'text'``, or ``'text-with-photo'`` when at least one
         embedded image is larger than 150x150.
      2. Many vector drawings (density above 1.5): ``'diagram'``.
      3. Raster images present: ``'diagram'`` if one of the first three
         looks like a diagram, else ``'text-with-photo'`` (more than 50
         characters of text) or ``'image-heavy'``.
      4. Otherwise ``'image-heavy'`` for a page without text, else
         ``'text'``.
    """
    text = page.get_text().strip()
    text_len = len(text)
    page_area = page.rect.width * page.rect.height
    images = page.get_images(full=True)

    text_density = text_len / page_area * 10_000  # characters per area

    # 1) Text pages.
    if text_density > 4:
        if not images:
            return "text"
        # Small images (logos etc.) do not make a photo page. Each image is
        # extracted once and the scan stops at the first large one.
        for img in images:
            info = doc.extract_image(img[0])
            if info["width"] > 150 and info["height"] > 150:
                return "text-with-photo"
        return "text"

    # 2) Vector-drawing density, computed only once it is needed:
    #    get_drawings() is not used for pages already classified as text.
    drawing_density = len(page.get_drawings()) / page_area * 10_000
    if drawing_density > 1.5:
        return "diagram"

    # 3) Raster images: inspect at most three of them (speed).
    if images:
        for img_info in images[:3]:
            try:
                pix = fitz.Pixmap(doc, img_info[0])
                if pix.colorspace and pix.colorspace.n > 1:
                    if _is_diagram_image(_pix_to_pil(pix)):
                        return "diagram"
            except Exception:
                # Best effort: an unreadable image just is not evidence.
                continue
        return "text-with-photo" if text_len > 50 else "image-heavy"

    # 4) Hardly any text or images.
    return "image-heavy" if not text_len else "text"
# ── 페이지 PNG 렌더링 ─────────────────────────────────────────────────────────
def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
    """Render ``page`` to ``output_path``, magnified by ``scale`` on both axes."""
    zoom = fitz.Matrix(scale, scale)
    page.get_pixmap(matrix=zoom).save(str(output_path))
# ── 메인 변환 함수 ────────────────────────────────────────────────────────────
def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
    """Convert a PDF to Markdown and return the AGENT_GUIDE-style dict.

    Steps:
      1. Classify every page.
      2. Extract text with marker-pdf; when marker-pdf is not installed,
         fall back to PyMuPDF plain-text extraction.
      3. Render ``diagram`` / ``image-heavy`` pages to PNG and append a
         section referencing them.
      4. Write the Markdown file.

    Returned dict keys:
        status        : "ok" | "error"
        input         : str
        output        : str (path of the Markdown file)
        format        : "pdf"
        pages         : list of {n, type, image?}
        has_diagrams  : bool
        diagram_pages : list[int]
        images        : list[str]
        error?        : str

    The document is closed on every path, including failures.
    """
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    images_dir = output_dir / f"{pdf_path.stem}_images"
    md_path = output_dir / f"{pdf_path.stem}.md"

    result: dict = {
        "status": "ok",
        "input": str(pdf_path),
        "output": str(md_path),
        "format": "pdf",
        "pages": [],
        "has_diagrams": False,
        "diagram_pages": [],
        "images": [],
    }

    doc = None
    try:
        doc = fitz.open(str(pdf_path))

        # 1) Classify every page.
        page_types: list[str] = []
        for page in doc:
            ptype = classify_page(page, doc)
            page_types.append(ptype)
            result["pages"].append({"n": page.number + 1, "type": ptype})

        diagram_page_nums = [
            i + 1 for i, t in enumerate(page_types)
            if t in ("diagram", "image-heavy")
        ]
        result["has_diagrams"] = bool(diagram_page_nums)
        result["diagram_pages"] = diagram_page_nums

        # 2) Text extraction (marker-pdf).
        text_sections: list[str] = []
        try:
            from marker.converters.pdf import PdfConverter
            from marker.models import create_model_dict
            from marker.output import text_from_rendered

            converter = PdfConverter(artifact_dict=create_model_dict())
            rendered = converter(str(pdf_path))
            full_text, _, marker_images = text_from_rendered(rendered)

            # Save the images extracted by marker.
            if marker_images:
                images_dir.mkdir(exist_ok=True)
                for img_name, img_data in marker_images.items():
                    try:
                        img_dest = images_dir / img_name
                        if isinstance(img_data, Image.Image):
                            img_data.save(str(img_dest))
                        elif isinstance(img_data, bytes) and img_data:
                            img_dest.write_bytes(img_data)
                        else:
                            # Nothing written: do not report a file.
                            continue
                        result["images"].append(str(img_dest))
                    except Exception:
                        # Best effort: one bad image must not fail the PDF.
                        pass

                # Point relative image links at the images folder. A
                # function replacement keeps backslashes in the file stem
                # from being read as regex template escapes.
                prefix = f"{pdf_path.stem}_images/"
                full_text = re.sub(
                    r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
                    lambda m: f'![{m.group(1)}]({prefix}{m.group(2)})',
                    full_text,
                )

            text_sections.append(full_text)

        except ImportError:
            # marker-pdf missing: fall back to PyMuPDF text extraction.
            pages_text = []
            for page in doc:
                t = page.get_text().strip()
                if t:
                    pages_text.append(t)
            text_sections.append("\n\n---\n\n".join(pages_text))

        # 3) Render diagram pages to PNG.
        if diagram_page_nums:
            images_dir.mkdir(exist_ok=True)
            diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]

            for page_num in diagram_page_nums:
                page = doc[page_num - 1]
                img_name = f"page_{page_num}.png"
                img_path = images_dir / img_name

                _render_page_png(page, img_path)
                result["images"].append(str(img_path))

                diagram_section_lines.append(
                    f"\n### Page {page_num}\n"
                    f"![Page {page_num} — 다이어그램]"
                    f"({pdf_path.stem}_images/{img_name})\n"
                )

                # "pages" was filled in page order, so the entry for page
                # N sits at index N - 1.
                result["pages"][page_num - 1]["image"] = str(img_path)

            text_sections.append("".join(diagram_section_lines))

        # 4) Write the Markdown file.
        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
        md_path.write_text(final_md, encoding="utf-8")

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        import traceback
        traceback.print_exc()
    finally:
        if doc is not None:
            doc.close()

    return result

26
requirements.txt Normal file
View File

@@ -0,0 +1,26 @@
# doc2md 필수 패키지
# pip install -r requirements.txt
# PDF 변환 (텍스트/이미지 혼합)
marker-pdf>=1.0.0
# PDF 페이지 분석 + 렌더링
PyMuPDF>=1.23.0
# 이미지 처리 (다이어그램 감지)
Pillow>=10.0.0
# BeautifulSoup 'lxml-xml' 파서 백엔드 (HWP pyhwp fallback) — HML/HWPX 파싱은 표준 라이브러리 xml.etree 사용
lxml>=4.9.0
# HTML 파싱 (HWP pyhwp fallback)
beautifulsoup4>=4.12.0
# HTML → MD
html2text>=2020.1.16
# HWP 변환 fallback (한컴오피스 미설치 환경)
pyhwp>=0.1.0b19
# Windows 전용: HWP COM 자동화 (한컴오피스 설치 시 자동 사용)
# pywin32>=306