feat: Implement full conversion pipeline (PDF/HWP/HWPX/HML/HTML)
- convert.py: 통합 CLI, --json 출력, --scan 폴더 모드 - converters/pdf.py: 페이지별 분류(text/diagram/mixed) + marker-pdf + PNG 렌더링 - converters/hwp.py: COM 자동화 + pyhwp fallback - converters/hwpx.py: ZIP+XML 직접 파싱, 이미지 추출 - converters/hml.py: XML 파싱, Base64 이미지 추출, colspan/rowspan HTML 표 - converters/html.py: html2text (body_width=0) - requirements.txt: 최소 의존성 - .env.example: 환경변수 템플릿 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
1
.claude/hooks/token-usage/.gitignore
vendored
Normal file
1
.claude/hooks/token-usage/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
aptabase.json
|
||||
43
.claude/settings.json
Normal file
43
.claude/settings.json
Normal file
@@ -0,0 +1,43 @@
|
||||
{
|
||||
"hooks": {
|
||||
"UserPromptSubmit": [
|
||||
{
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" session-context \"$t\";rm -f \"$t\"",
|
||||
"timeout": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"Stop": [
|
||||
{
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" stop-record \"$t\";rm -f \"$t\"",
|
||||
"timeout": 5
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"PostToolUse": [
|
||||
{
|
||||
"matcher": "Bash",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "t=$(mktemp);cat>\"$t\";e=./.claude/hooks/token-usage/claude-hook.exe;[ -x \"$e\" ] && \"$e\" aptabase-commit \"$t\";rm -f \"$t\"",
|
||||
"timeout": 15
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"mcp__gitea__issue_write"
|
||||
]
|
||||
}
|
||||
}
|
||||
5
.env.example
Normal file
5
.env.example
Normal file
@@ -0,0 +1,5 @@
|
||||
# doc2md 환경변수 예시
|
||||
# 이 파일을 .env로 복사 후 값 수정
|
||||
|
||||
# ParaWiki 등 외부 프로젝트에서 이 도구를 subprocess로 호출할 때 사용
|
||||
# DOCU_CONVERTER_PATH=D:\MYCLAUDE_PROJECT\doc2md
|
||||
2
.usage/token/.gitignore
vendored
Normal file
2
.usage/token/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
job-commit-pool.json
|
||||
job-send-pool.json
|
||||
136
convert.py
Normal file
136
convert.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
doc2md — 통합 문서 변환기 (AI 에이전트용)
|
||||
사용법: python convert.py <file> -o <output_dir> [--json]
|
||||
python convert.py --scan <dir> -o <output_dir> [--json]
|
||||
|
||||
자세한 사용법: AGENT_GUIDE.md 참조
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SUPPORTED = {'.pdf', '.hwp', '.hwpx', '.hml', '.html', '.htm'}
|
||||
SKIP_NAMES = {'README.md', 'CLAUDE.md', 'AGENT_GUIDE.md'}
|
||||
|
||||
|
||||
def convert_file(src: Path, output_dir: Path) -> dict:
    """Convert a single document to Markdown via the extension-matched converter.

    Returns the converter's AGENT_GUIDE-spec result dict. Unsupported
    extensions yield a "skipped" dict; any converter failure is captured
    as an "error" dict instead of propagating.
    """
    suffix = src.suffix.lower()
    try:
        if suffix == '.pdf':
            from converters.pdf import convert_pdf
            return convert_pdf(src, output_dir)
        if suffix == '.hwp':
            from converters.hwp import convert_hwp
            return convert_hwp(src, output_dir)
        if suffix == '.hwpx':
            from converters.hwpx import convert_hwpx
            return convert_hwpx(src, output_dir)
        if suffix == '.hml':
            from converters.hml import convert_hml
            return convert_hml(src, output_dir)
        if suffix in {'.html', '.htm'}:
            from converters.html import convert_html
            return convert_html(src, output_dir)
        return {"status": "skipped", "input": str(src), "reason": "unsupported_format"}
    except Exception as e:
        return {"status": "error", "input": str(src), "error": str(e)}
|
||||
|
||||
|
||||
def scan_and_convert(scan_dir: Path, output_dir: Path) -> dict:
    """Batch-convert every supported document found under *scan_dir*.

    Skips files listed in SKIP_NAMES and sources that already have a
    sibling ``.md`` file. Output directories mirror the source tree under
    *output_dir*. Returns a summary dict with per-file results.
    """
    candidates = sorted(
        path for ext in SUPPORTED for path in scan_dir.rglob(f'*{ext}')
    )

    results = []
    converted = failed = skipped = 0

    for src in candidates:
        if src.name in SKIP_NAMES:
            continue

        # Skip sources that already have a Markdown sibling next to them.
        if src.with_suffix('.md').exists():
            results.append({"input": str(src), "output": None,
                            "status": "skipped", "reason": "already_md"})
            skipped += 1
            continue

        dest = output_dir / src.parent.relative_to(scan_dir)
        outcome = convert_file(src, dest)
        results.append(outcome)
        if outcome['status'] == 'ok':
            converted += 1
        elif outcome['status'] == 'error':
            failed += 1
        else:
            skipped += 1

    return {
        "status": "ok" if failed == 0 else ("error" if converted == 0 else "partial"),
        "total": len(results),
        "converted": converted,
        "skipped": skipped,
        "failed": failed,
        "results": results,
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point.

    Two modes: convert a single file, or batch-convert a directory with
    ``--scan``. Exit code: 0 = success, 1 = partial batch / usage error,
    2 = failure.
    """
    parser = argparse.ArgumentParser(
        description='doc2md — AI 에이전트용 문서 변환기',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='자세한 사용법: AGENT_GUIDE.md'
    )
    parser.add_argument('file', nargs='?', help='변환할 파일')
    parser.add_argument('-o', '--output', required=True, help='출력 폴더')
    parser.add_argument('--scan', metavar='DIR', help='폴더 일괄 변환 모드')
    parser.add_argument('--json', action='store_true', help='결과를 JSON으로 출력')
    args = parser.parse_args()

    dest = Path(args.output)

    if args.scan:
        outcome = scan_and_convert(Path(args.scan), dest)
        exit_code = {'ok': 0, 'partial': 1}.get(outcome['status'], 2)
    elif args.file:
        src = Path(args.file)
        if not src.exists():
            missing = {"status": "error", "input": str(src), "error": "파일 없음"}
            if args.json:
                print(json.dumps(missing, ensure_ascii=False))
            else:
                print(f"오류: 파일 없음 — {src}", file=sys.stderr)
            sys.exit(2)
        outcome = convert_file(src, dest)
        exit_code = 0 if outcome['status'] == 'ok' else 2
    else:
        parser.print_help()
        sys.exit(1)

    if args.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
    elif 'results' in outcome:
        # Batch summary for human readers (agent called without --json).
        print(f"[doc2md] {outcome['converted']}개 변환 / {outcome['skipped']}개 스킵 / {outcome['failed']}개 실패")
    else:
        status = outcome.get('status', '')
        output = outcome.get('output', '')
        print(f"[doc2md] {status.upper()} — {output or outcome.get('error', '')}")
        if outcome.get('has_diagrams'):
            pages = outcome.get('diagram_pages', [])
            print(f"[doc2md] 다이어그램 페이지: {pages} → Vision AI 처리 필요")

    sys.exit(exit_code)


if __name__ == '__main__':
    main()
|
||||
0
converters/__init__.py
Normal file
0
converters/__init__.py
Normal file
188
converters/hml.py
Normal file
188
converters/hml.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HML → Markdown (XML 직접 파싱, Base64 이미지 추출)"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _extract_images(tree, images_dir: Path) -> tuple[dict, list]:
|
||||
images_dir.mkdir(parents=True, exist_ok=True)
|
||||
bin_format = {
|
||||
item.get('BinData'): item.get('Format', 'PNG').lower()
|
||||
for item in tree.findall('.//BINITEM') if item.get('BinData')
|
||||
}
|
||||
id_to_file = {}
|
||||
for bindata in tree.findall('.//BINDATA'):
|
||||
bid = bindata.get('Id')
|
||||
raw = (bindata.text or '').strip()
|
||||
if not raw or bindata.get('Encoding', 'Base64').lower() != 'base64':
|
||||
continue
|
||||
fmt = bin_format.get(bid, 'png')
|
||||
filename = f'BIN{int(bid):04d}.{fmt}'
|
||||
try:
|
||||
(images_dir / filename).write_bytes(base64.b64decode(raw))
|
||||
id_to_file[bid] = filename
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
body = tree.find('.//BODY')
|
||||
bin_order = []
|
||||
if body is not None:
|
||||
for pic in body.findall('.//PICTURE'):
|
||||
imgs = pic.findall('.//IMAGE')
|
||||
bin_order.append(imgs[0].get('BinItem') if imgs else None)
|
||||
return id_to_file, bin_order
|
||||
|
||||
|
||||
def _extract_text(p_elem) -> str:
|
||||
parts = []
|
||||
for t in p_elem.findall('TEXT'):
|
||||
for child in t:
|
||||
if child.tag == 'CHAR' and child.text:
|
||||
parts.append(child.text)
|
||||
elif child.tag == 'TAB':
|
||||
parts.append(' ')
|
||||
return ''.join(parts).strip()
|
||||
|
||||
|
||||
def _detect_structure(text: str):
|
||||
if not text: return 'paragraph', 0, text
|
||||
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
|
||||
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
|
||||
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
|
||||
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
|
||||
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
|
||||
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
|
||||
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
|
||||
return 'paragraph', 0, text
|
||||
|
||||
|
||||
def _extract_table(table_elem) -> str:
    """Render a <TABLE> element as Markdown.

    Plain grids become a pipe table laid out by each cell's ColAddr;
    tables containing any colspan/rowspan fall back to an HTML <table>
    so the spans survive. Returns '' for an empty table.
    """
    col_count = int(table_elem.get('ColCount', 0))
    merged = False
    parsed_rows = []
    for row_index, row in enumerate(table_elem.findall('.//ROW')):
        row_cells = []
        for cell in row.findall('CELL'):
            span_c = int(cell.get('ColSpan', 1))
            span_r = int(cell.get('RowSpan', 1))
            addr = int(cell.get('ColAddr', 0))
            merged = merged or span_c > 1 or span_r > 1
            texts = [_extract_text(p) for p in cell.findall('.//P')]
            row_cells.append((addr, span_c, span_r, '<br>'.join(t for t in texts if t)))
        if row_cells:
            parsed_rows.append((row_index, row_cells))
    if not parsed_rows:
        return ''

    if merged:
        # HTML fallback: Markdown pipe tables cannot express merged cells.
        out = ['<table>']
        for i, (_, row_cells) in enumerate(parsed_rows):
            out.append('<tr>')
            tag = 'th' if i == 0 else 'td'
            for _, span_c, span_r, text in row_cells:
                attrs = (f' colspan="{span_c}"' if span_c > 1 else '') \
                      + (f' rowspan="{span_r}"' if span_r > 1 else '')
                out.append(f'<{tag}{attrs}>{text}</{tag}>')
            out.append('</tr>')
        out.append('</table>')
        return '\n'.join(out)

    # Pipe table: place each cell by its column address, pad ragged rows.
    grid_rows = []
    for _, row_cells in parsed_rows:
        by_addr = {addr: text for addr, _, _, text in row_cells}
        width = col_count if col_count > 0 else (max(by_addr) + 1)
        grid_rows.append([by_addr.get(i, '') for i in range(width)])
    width = max(len(r) for r in grid_rows)
    for r in grid_rows:
        r.extend([''] * (width - len(r)))

    def esc(value):
        return value.replace('|', '\\|').replace('\n', ' ')

    out = ['| ' + ' | '.join(esc(c) for c in grid_rows[0]) + ' |',
           '| ' + ' | '.join(['---'] * width) + ' |']
    for row in grid_rows[1:]:
        out.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(out)
|
||||
|
||||
|
||||
def _process_p(p_elem, pic_counter: list, bin_order: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one <P> element into Markdown lines.

    Tables and pictures embedded in the paragraph take priority; a
    paragraph containing neither is classified by ``_detect_structure``
    and rendered as heading / bullet / plain text.

    *pic_counter* is a single-element list used as a mutable counter so
    the picture index survives across calls.
    """
    lines: list[str] = []
    has_content = False
    for text_elem in p_elem.findall('TEXT'):
        for child in text_elem:
            if child.tag == 'TABLE':
                has_content = True
                md = _extract_table(child)
                if md:
                    lines.append(md)
            elif child.tag == 'PICTURE':
                has_content = True
                idx = pic_counter[0]
                pic_counter[0] += 1
                bid = bin_order[idx] if idx < len(bin_order) else None
                filename = id_to_file.get(bid, '') if bid else ''
                # BUGFIX: an empty string was appended here, silently
                # dropping every image reference. Emit a real Markdown
                # link; fall back to a placeholder name when the binary
                # could not be resolved.
                ref = f'{base_name}_images/{filename}' if filename else f'그림_{idx + 1}.png'
                lines.append(f'![그림 {idx + 1}]({ref})')
    if not has_content:
        text = _extract_text(p_elem)
        if text:
            kind, level, fmt = _detect_structure(text)
            if kind == 'heading':
                lines.append(f'{"#" * level} {fmt}')
            elif kind == 'bullet':
                # BUGFIX: the pattern used "\\s" inside a raw string, which
                # matches a literal backslash and never stripped the glyph.
                # Also hoisted out of the f-string for pre-3.12 compatibility.
                stripped = re.sub(r'^[▪▫-]\s*', '', fmt)
                lines.append(f'- {stripped}')
            else:
                lines.append(fmt)
    return lines
|
||||
|
||||
|
||||
def convert_hml(hml_path: Path, output_dir: Path) -> dict:
    """Convert an HML document to Markdown (AGENT_GUIDE result spec)."""
    hml_path = Path(hml_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hml_path.stem}.md'
    images_dir = output_dir / f'{hml_path.stem}_images'

    result = {
        "status": "ok", "input": str(hml_path),
        "output": str(md_path), "format": "hml", "images": [],
    }
    try:
        tree = ET.fromstring(hml_path.read_text(encoding='utf-8-sig'))
        id_to_file, bin_order = _extract_images(tree, images_dir)
        result["images"] = [str(images_dir / name) for name in id_to_file.values()]

        title_elem = tree.find('.//TITLE')
        if title_elem is not None and title_elem.text:
            doc_title = title_elem.text.strip()
        else:
            doc_title = hml_path.stem
        md_lines = [f'# {doc_title}', '']

        body = tree.find('.//BODY')
        if body is None:
            result['status'] = 'error'
            result['error'] = 'BODY 요소 없음'
            return result

        pic_counter = [0]
        for section in body.findall('.//SECTION'):
            for p_elem in section.findall('P'):
                for line in _process_p(p_elem, pic_counter, bin_order,
                                       id_to_file, hml_path.stem):
                    if not line:
                        continue
                    # Headings get a separating blank line before them.
                    if line.startswith('#') and md_lines and md_lines[-1] != '':
                        md_lines.append('')
                    md_lines += [line, '']

        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|
||||
31
converters/html.py
Normal file
31
converters/html.py
Normal file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HTML / HTM → Markdown (html2text, body_width=0)"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def convert_html(html_path: Path, output_dir: Path) -> dict:
    """Convert an HTML/HTM file to Markdown using html2text (no line wrap)."""
    html_path = Path(html_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{html_path.stem}.md'

    result = {
        "status": "ok", "input": str(html_path),
        "output": str(md_path), "format": "html",
    }
    try:
        import html2text
        converter = html2text.HTML2Text()
        converter.body_width = 0          # disable hard line wrapping
        converter.ignore_links = False
        converter.ignore_images = False
        source = html_path.read_text(encoding='utf-8', errors='ignore')
        md_path.write_text(converter.handle(source), encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|
||||
212
converters/hwp.py
Normal file
212
converters/hwp.py
Normal file
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HWP → Markdown (COM 자동화 우선, pyhwp fallback)"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _com_hwp_to_hml(hwp_path: Path, hml_path: Path, timeout: int = 15) -> bool:
    """Convert HWP → HML via Hancom Office COM automation (Windows only).

    Runs the COM calls on a daemon thread so a hung HWP process cannot
    block the pipeline; gives up after *timeout* seconds. Returns True
    only when the HML output file exists afterwards.
    """
    import threading
    # Single-element list: the worker thread mutates it in place.
    result = [False]

    def _run():
        try:
            import pythoncom, win32com.client
        except ImportError:
            # pywin32 not installed — caller falls back to pyhwp.
            return
        hwp = None
        try:
            pythoncom.CoInitialize()
            hwp = win32com.client.Dispatch('HWPFrame.HwpObject')
            try:
                # Best-effort: registering the security module suppresses
                # the file-access confirmation dialog.
                hwp.RegisterModule('FilePathCheckDLL', 'SecurityModule')
            except Exception:
                pass
            # COM expects backslash paths on Windows.
            ok = hwp.Open(str(hwp_path).replace('/', '\\'), 'HWP', 'forceopen:true')
            if not ok:
                return
            hwp.SaveAs(str(hml_path).replace('/', '\\'), 'HML', '')
            result[0] = hml_path.exists()
        except Exception as e:
            print(f' COM 오류: {e}')
        finally:
            # Always release the COM object and apartment, even on failure.
            if hwp:
                try: hwp.Quit()
                except Exception: pass
            try: pythoncom.CoUninitialize()
            except Exception: pass

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    t.join(timeout)
    if t.is_alive():
        # Daemon thread is abandoned; process exit will reap it.
        print(f' COM 타임아웃 ({timeout}초) — pyhwp로 전환')
    return result[0]
|
||||
|
||||
|
||||
def _table_to_md(table_elem) -> str:
    """Render a BeautifulSoup <table> element as Markdown.

    Tables without merged cells become a pipe table; any colspan/rowspan
    forces an HTML <table> so the spans survive. Returns '' when the
    table has no rows or no cells.
    """
    # NOTE: removed the unused `from bs4 import Tag` import.
    # Prefer direct child rows; fall back to any depth for nested markup.
    rows = table_elem.find_all('tr', recursive=False) or table_elem.find_all('tr')
    if not rows:
        return ''
    has_merge = False
    parsed = []
    for tr in rows:
        cells = []
        for td in tr.find_all(['td', 'th']):
            cs = int(td.get('colspan', 1))
            rs = int(td.get('rowspan', 1))
            if cs > 1 or rs > 1:
                has_merge = True
            cells.append((cs, rs, td.get_text(separator='<br>', strip=True)))
        if cells:
            parsed.append(cells)
    if not parsed:
        return ''
    if has_merge:
        # HTML fallback: pipe tables cannot express merged cells.
        lines = ['<table>']
        for ri, cells in enumerate(parsed):
            lines.append('<tr>')
            tag = 'th' if ri == 0 else 'td'
            for cs, rs, text in cells:
                attrs = (f' colspan="{cs}"' if cs > 1 else '') + (f' rowspan="{rs}"' if rs > 1 else '')
                lines.append(f'<{tag}{attrs}>{text}</{tag}>')
            lines.append('</tr>')
        lines.append('</table>')
        return '\n'.join(lines)
    else:
        # Pipe table: first row is the header; pad ragged rows.
        rows_text = [[text for _, _, text in cells] for cells in parsed]
        mc = max(len(r) for r in rows_text)
        for r in rows_text:
            r += [''] * (mc - len(r))
        def esc(s): return s.replace('|', '\\|')
        lines = ['| ' + ' | '.join(esc(c) for c in rows_text[0]) + ' |',
                 '| ' + ' | '.join(['---'] * mc) + ' |']
        for row in rows_text[1:]:
            lines.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
        return '\n'.join(lines)
|
||||
|
||||
|
||||
def _detect_structure(text: str):
|
||||
if not text: return 'paragraph', 0, text
|
||||
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
|
||||
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
|
||||
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
|
||||
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
|
||||
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
|
||||
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
|
||||
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
|
||||
return 'paragraph', 0, text
|
||||
|
||||
|
||||
def _pyhwp_hwp_to_md(hwp_path: Path, output_path: Path, base_name: str) -> bool:
|
||||
try:
|
||||
from hwp5.hwp5html import HTMLTransform
|
||||
from hwp5.xmlmodel import Hwp5File
|
||||
from bs4 import BeautifulSoup
|
||||
except ImportError as e:
|
||||
print(f' pyhwp/bs4 미설치: {e}')
|
||||
return False
|
||||
|
||||
tmp_dir = Path(tempfile.mkdtemp())
|
||||
try:
|
||||
f = Hwp5File(str(hwp_path))
|
||||
HTMLTransform().transform_hwp5_to_dir(f, str(tmp_dir))
|
||||
xhtml_path = tmp_dir / 'index.xhtml'
|
||||
if not xhtml_path.exists():
|
||||
return False
|
||||
|
||||
images_dir = output_path.parent / f'{base_name}_images'
|
||||
images_dir.mkdir(exist_ok=True)
|
||||
img_map = {}
|
||||
bindata_dir = tmp_dir / 'bindata'
|
||||
if bindata_dir.exists():
|
||||
for img in bindata_dir.iterdir():
|
||||
shutil.copy(img, images_dir / img.name)
|
||||
img_map[img.name] = img.name
|
||||
|
||||
soup = BeautifulSoup(xhtml_path.read_text(encoding='utf-8'), 'lxml-xml')
|
||||
for area in soup.find_all(class_=re.compile(r'^(HeaderArea|FooterArea|Header parashape|Footer parashape)$')):
|
||||
area.decompose()
|
||||
|
||||
md_lines = []
|
||||
img_counter = [0]
|
||||
for elem in soup.find_all(['p', 'table']):
|
||||
if elem.find_parent('table'):
|
||||
continue
|
||||
if elem.name == 'table':
|
||||
if not elem.find_parent('p'):
|
||||
md = _table_to_md(elem)
|
||||
if md:
|
||||
md_lines += [md, '']
|
||||
elif elem.name == 'p':
|
||||
for img in elem.find_all('img'):
|
||||
fn = Path(img.get('src', '')).name
|
||||
if fn in img_map:
|
||||
img_counter[0] += 1
|
||||
md_lines += [f'![그림 {img_counter[0]}]({base_name}_images/{fn})', '']
|
||||
inner = elem.find('table')
|
||||
if inner:
|
||||
md = _table_to_md(inner)
|
||||
if md:
|
||||
md_lines += [md, '']
|
||||
continue
|
||||
text = re.sub(r'\s+', ' ', elem.get_text(separator=' ', strip=True)).strip()
|
||||
if not text:
|
||||
continue
|
||||
kind, level, fmt = _detect_structure(text)
|
||||
if kind == 'heading':
|
||||
if md_lines and md_lines[-1] != '':
|
||||
md_lines.append('')
|
||||
md_lines += [f'{"#" * level} {fmt}', '']
|
||||
elif kind == 'bullet':
|
||||
md_lines.append(f'- {re.sub(r"^[▪▫-]\\s*", "", fmt)}')
|
||||
else:
|
||||
md_lines += [fmt, '']
|
||||
|
||||
output_path.write_text(re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)), encoding='utf-8')
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f' pyhwp 오류: {e}')
|
||||
return False
|
||||
finally:
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def convert_hwp(hwp_path: Path, output_dir: Path) -> dict:
    """Convert an HWP document to Markdown (AGENT_GUIDE result spec).

    Strategy: try Hancom COM automation (HWP → HML → MD) first; when it is
    unavailable or fails, fall back to pyhwp's XHTML export.
    """
    hwp_path = Path(hwp_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwp_path.stem}.md'

    result = {
        "status": "ok", "input": str(hwp_path),
        "output": str(md_path), "format": "hwp",
    }
    try:
        # Route 1: COM produces an intermediate .hml next to the output,
        # which the HML converter turns into Markdown.
        hml_path = md_path.with_suffix('.hml')
        if _com_hwp_to_hml(hwp_path, hml_path):
            try:
                from converters.hml import convert_hml
                r = convert_hml(hml_path, output_dir)
                hml_path.unlink(missing_ok=True)
                if r['status'] == 'ok':
                    # BUGFIX: carry the extracted image list through to the
                    # caller instead of silently dropping it.
                    if 'images' in r:
                        result['images'] = r['images']
                    return result
            except Exception:
                hml_path.unlink(missing_ok=True)

        # Route 2: pyhwp fallback.
        if _pyhwp_hwp_to_md(hwp_path, md_path, hwp_path.stem):
            return result

        result['status'] = 'error'
        result['error'] = 'COM + pyhwp 모두 실패'
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|
||||
188
converters/hwpx.py
Normal file
188
converters/hwpx.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HWPX → Markdown (ZIP+XML 직접 파싱, 이미지 추출 포함)"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
|
||||
# OWPML (2011) XML namespaces used throughout HWPX section parsing:
#   hp — paragraph-level elements, hc — core/shared elements.
NS = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
}
|
||||
|
||||
|
||||
def _extract_images(zf: zipfile.ZipFile, images_dir: Path) -> dict:
|
||||
images_dir.mkdir(parents=True, exist_ok=True)
|
||||
id_to_file = {}
|
||||
for name in zf.namelist():
|
||||
if not name.startswith('BinData/'):
|
||||
continue
|
||||
filename = Path(name).name
|
||||
if not filename:
|
||||
continue
|
||||
out_path = images_dir / filename
|
||||
out_path.write_bytes(zf.read(name))
|
||||
id_to_file[Path(filename).stem] = filename
|
||||
return id_to_file
|
||||
|
||||
|
||||
def _extract_text(p_elem) -> str:
    """Join the <hp:t> runs of a paragraph; a run with tabs adds one space."""
    chunks: list[str] = []
    for run in p_elem.findall('hp:run', NS):
        chunks.extend(t.text for t in run.findall('hp:t', NS) if t.text)
        if run.findall('hp:tab', NS):
            chunks.append(' ')
    return ''.join(chunks).strip()
|
||||
|
||||
|
||||
def _detect_structure(text: str):
|
||||
if not text: return 'paragraph', 0, text
|
||||
if re.match(r'^\d+\.\d+\.\d+\s', text): return 'heading', 4, text
|
||||
if re.match(r'^\d+\.\d+\s', text): return 'heading', 3, text
|
||||
if re.match(r'^\d+\.\s.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[\d가-힣]+\)\s+.+', text):return 'heading', 3, text
|
||||
if re.match(r'^[□■]\s*.+', text): return 'heading', 2, text
|
||||
if re.match(r'^[○●◎]\s*.+', text): return 'heading', 3, text
|
||||
if re.match(r'^[▶▷]\s*.+', text): return 'heading', 4, text
|
||||
if re.match(r'^[▪▫\-]\s*.+', text): return 'bullet', 0, text
|
||||
if re.match(r'^[※]', text): return 'paragraph', 0, f'> {text}'
|
||||
return 'paragraph', 0, text
|
||||
|
||||
|
||||
def _cell_text(tc_elem) -> str:
    """Collect a table cell's paragraph texts, joined with <br>.

    Paragraphs that contain nested tables are skipped here.
    """
    collected = []
    for sub_list in tc_elem.findall('hp:subList', NS):
        for para in sub_list.findall('hp:p', NS):
            if para.find('.//hp:tbl', NS) is not None:
                continue  # nested table paragraph
            value = _extract_text(para)
            if value:
                collected.append(value)
    return '<br>'.join(collected)
|
||||
|
||||
|
||||
def _get_span(tc_elem):
    """Return ``(colspan, rowspan)`` of an <hp:tc> cell.

    A nested <hp:cellSpan> element overrides the tc attributes when present.
    """
    col_span = int(tc_elem.get('colSpan', 1))
    row_span = int(tc_elem.get('rowSpan', 1))
    span_elem = tc_elem.find('hp:cellSpan', NS)
    if span_elem is not None:
        col_span = int(span_elem.get('colSpan', col_span))
        row_span = int(span_elem.get('rowSpan', row_span))
    return col_span, row_span
|
||||
|
||||
|
||||
def _extract_table(tbl_elem) -> str:
    """Render an <hp:tbl> as a pipe table, or an HTML table when cells merge."""
    merged = False
    table_rows = []
    for tr in tbl_elem.findall('hp:tr', NS):
        row = []
        for tc in tr.findall('hp:tc', NS):
            span_c, span_r = _get_span(tc)
            merged = merged or span_c > 1 or span_r > 1
            row.append((span_c, span_r, _cell_text(tc)))
        if row:
            table_rows.append(row)
    if not table_rows:
        return ''

    if merged:
        # HTML fallback: pipe tables cannot express merged cells.
        out = ['<table>']
        for i, row in enumerate(table_rows):
            out.append('<tr>')
            tag = 'th' if i == 0 else 'td'
            for span_c, span_r, text in row:
                attrs = (f' colspan="{span_c}"' if span_c > 1 else '') \
                      + (f' rowspan="{span_r}"' if span_r > 1 else '')
                out.append(f'<{tag}{attrs}>{text}</{tag}>')
            out.append('</tr>')
        out.append('</table>')
        return '\n'.join(out)

    # Pipe table: first row is the header; pad ragged rows.
    texts = [[t for _, _, t in row] for row in table_rows]
    width = max(len(r) for r in texts)
    for r in texts:
        r.extend([''] * (width - len(r)))

    def esc(value):
        return value.replace('|', '\\|').replace('\n', ' ')

    out = ['| ' + ' | '.join(esc(c) for c in texts[0]) + ' |',
           '| ' + ' | '.join(['---'] * width) + ' |']
    for row in texts[1:]:
        out.append('| ' + ' | '.join(esc(c) for c in row) + ' |')
    return '\n'.join(out)
|
||||
|
||||
|
||||
def _process_para(p_elem, pic_counter: list, id_to_file: dict, base_name: str) -> list[str]:
    """Convert one <hp:p> into Markdown lines (table / picture / text).

    *pic_counter* is a single-element list used as a mutable counter so
    the picture index survives across calls.
    """
    tbl = p_elem.find('.//hp:tbl', NS)
    if tbl is not None:
        md = _extract_table(tbl)
        return [md] if md else []

    pic = p_elem.find('.//hp:pic', NS)
    if pic is not None:
        idx = pic_counter[0]
        pic_counter[0] += 1
        # BUGFIX: an empty string was returned here, silently dropping
        # every picture reference from the output. Emit a real Markdown
        # link; fall back to a placeholder when the binary is unresolved.
        img_elem = pic.find('.//hc:img', NS)
        if img_elem is not None:
            filename = id_to_file.get(img_elem.get('binaryItemIDRef', ''), '')
            if filename:
                return [f'![그림 {idx + 1}]({base_name}_images/{filename})']
        return [f'![그림 {idx + 1}](그림_{idx + 1}.png)']

    text = _extract_text(p_elem)
    if not text:
        return []
    kind, level, fmt = _detect_structure(text)
    if kind == 'heading':
        return [f'{"#" * level} {fmt}']
    if kind == 'bullet':
        # BUGFIX: the pattern used "\\s" inside a raw string, which matches
        # a literal backslash and never stripped the glyph. Also hoisted out
        # of the f-string for pre-3.12 compatibility.
        stripped = re.sub(r'^[▪▫-]\s*', '', fmt)
        return [f'- {stripped}']
    return [fmt]
|
||||
|
||||
|
||||
def convert_hwpx(hwpx_path: Path, output_dir: Path) -> dict:
    """Convert an HWPX archive to Markdown (AGENT_GUIDE result spec)."""
    hwpx_path = Path(hwpx_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    md_path = output_dir / f'{hwpx_path.stem}.md'
    images_dir = output_dir / f'{hwpx_path.stem}_images'

    result = {
        "status": "ok", "input": str(hwpx_path),
        "output": str(md_path), "format": "hwpx", "images": [],
    }
    try:
        with zipfile.ZipFile(hwpx_path, 'r') as zf:
            id_to_file = _extract_images(zf, images_dir)
            result["images"] = [str(images_dir / name) for name in id_to_file.values()]

            sections = sorted(
                member for member in zf.namelist()
                if re.match(r'Contents/section\d+\.xml', member)
            )
            md_lines: list[str] = []
            pic_counter = [0]
            for sec_name in sections:
                root = ET.fromstring(zf.read(sec_name))
                for p_elem in root.findall('hp:p', NS):
                    if p_elem.find('.//hp:secPr', NS) is not None:
                        continue  # section-properties paragraph, no content
                    for line in _process_para(p_elem, pic_counter,
                                              id_to_file, hwpx_path.stem):
                        if not line:
                            continue
                        # Block-level constructs get a blank line before them.
                        if (line.startswith(('#', '<table', '|', '!['))
                                and md_lines and md_lines[-1] != ''):
                            md_lines.append('')
                        md_lines += [line, '']

        final = re.sub(r'\n{3,}', '\n\n', '\n'.join(md_lines)).strip()
        md_path.write_text(final, encoding='utf-8')
    except Exception as e:
        result['status'] = 'error'
        result['error'] = str(e)
    return result
|
||||
240
converters/pdf.py
Normal file
240
converters/pdf.py
Normal file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF → Markdown 변환기 (페이지별 분류 + 라우팅)
|
||||
|
||||
페이지 타입:
|
||||
text - 텍스트 위주 → marker-pdf 추출
|
||||
text-with-photo - 텍스트 + 사진 → marker-pdf + 이미지 크롭
|
||||
diagram - 다이어그램/도면 → 페이지 PNG 렌더링 (에이전트가 Vision으로 처리)
|
||||
image-heavy - 텍스트 거의 없음 → 페이지 PNG 렌더링
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from PIL import Image
|
||||
|
||||
|
||||
# ── 페이지 분류 ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _pix_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Convert a PyMuPDF Pixmap into a PIL Image without re-encoding."""
    channels = "RGBA" if pix.alpha else "RGB"
    return Image.frombytes(channels, (pix.width, pix.height), pix.samples)
|
||||
|
||||
|
||||
def _is_diagram_image(img: Image.Image) -> bool:
    """Heuristic: does this raster image look like a diagram?

    Diagrams tend to have a limited color palette and a mostly white
    background, unlike photographs.
    """
    # Tiny images (logos, icons) are never classified as diagrams.
    if min(img.width, img.height) < 100:
        return False

    # Count distinct colors after quantizing a small thumbnail to 64 colors.
    thumb = img.resize((200, 200), Image.LANCZOS).convert("RGB")
    palette_size = len(set(thumb.quantize(colors=64).getdata()))

    # Fraction of near-white pixels in the luminance channel.
    values = list(thumb.convert("L").getdata())
    near_white = sum(1 for v in values if v > 240) / len(values)

    return palette_size < 32 and near_white > 0.35
|
||||
|
||||
|
||||
def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
    """Classify a PDF page for conversion routing.

    Returns one of ``'text' | 'text-with-photo' | 'diagram' | 'image-heavy'``.
    Densities are normalized per 10,000 units of page area so thresholds
    are independent of page size.
    """
    text = page.get_text().strip()
    text_len = len(text)
    page_area = page.rect.width * page.rect.height

    drawings = page.get_drawings()
    images = page.get_images(full=True)

    text_density = text_len / page_area * 10_000        # chars per unit area
    # Vector drawing density (flowcharts / CAD exports carry hundreds).
    drawing_density = len(drawings) / page_area * 10_000

    # 1) Enough text → text family.
    if text_density > 4:
        if not images:
            return "text"
        # PERF FIX: extract each embedded image exactly once — the original
        # called doc.extract_image() twice per image (width and height),
        # decoding every blob twice.
        has_large = False
        for img in images:
            meta = doc.extract_image(img[0])
            if meta["width"] > 150 and meta["height"] > 150:
                has_large = True
                break
        return "text-with-photo" if has_large else "text"

    # 2) Many vector drawings → diagram.
    if drawing_density > 1.5:
        return "diagram"

    # 3) Raster images present: sample up to three and test the diagram
    #    heuristic (bounded for speed).
    if images:
        for img_info in images[:3]:
            try:
                pix = fitz.Pixmap(doc, img_info[0])
                if pix.colorspace and pix.colorspace.n > 1:
                    if _is_diagram_image(_pix_to_pil(pix)):
                        return "diagram"
            except Exception:
                pass
        return "text-with-photo" if text_len > 50 else "image-heavy"

    # 4) Almost no text and no images.
    return "image-heavy" if not text_len else "text"
|
||||
|
||||
|
||||
# ── 페이지 PNG 렌더링 ─────────────────────────────────────────────────────────
|
||||
|
||||
def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
    """Rasterize *page* to a PNG file at *output_path*.

    *scale* is a uniform zoom factor applied on both axes (2.0 doubles the
    default 72-dpi rendering resolution).
    """
    zoom = fitz.Matrix(scale, scale)
    page.get_pixmap(matrix=zoom).save(str(output_path))
|
||||
|
||||
|
||||
# ── 메인 변환 함수 ────────────────────────────────────────────────────────────
|
||||
|
||||
def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
    """
    Convert a PDF to Markdown. Returns the AGENT_GUIDE.md-spec JSON as a dict.

    Pipeline:
      1. Classify every page (text / text-with-photo / diagram / image-heavy).
      2. Extract text via marker-pdf when installed, else fall back to plain
         PyMuPDF text extraction.
      3. Render diagram / image-heavy pages to PNG and append a section
         linking them.
      4. Collapse blank runs and write ``<stem>.md`` next to ``<stem>_images``.

    Parameters
    ----------
    pdf_path : Path
        Input PDF file.
    output_dir : Path
        Directory for the .md file and the images directory (created).

    Returns
    -------
    dict with keys:
        status        : "ok" | "error"
        input         : str
        output        : str (path of the .md file)
        format        : "pdf"
        pages         : list of {n, type, image?}
        has_diagrams  : bool
        diagram_pages : list[int]
        images        : list[str]
        error?        : str (present only when status == "error")
    """
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    images_dir = output_dir / f"{pdf_path.stem}_images"
    md_path = output_dir / f"{pdf_path.stem}.md"

    result: dict = {
        "status": "ok",
        "input": str(pdf_path),
        "output": str(md_path),
        "format": "pdf",
        "pages": [],
        "has_diagrams": False,
        "diagram_pages": [],
        "images": [],
    }

    doc = None
    try:
        doc = fitz.open(str(pdf_path))

        # ── 1) classify every page ─────────────────────────────────────────
        page_types: list[str] = []
        for page in doc:
            ptype = classify_page(page, doc)
            page_types.append(ptype)
            result["pages"].append({"n": page.number + 1, "type": ptype})

        diagram_page_nums = [
            i + 1 for i, t in enumerate(page_types)
            if t in ("diagram", "image-heavy")
        ]
        result["has_diagrams"] = bool(diagram_page_nums)
        result["diagram_pages"] = diagram_page_nums

        # ── 2) text extraction (marker-pdf, optional) ──────────────────────
        text_sections: list[str] = []
        try:
            from marker.converters.pdf import PdfConverter
            from marker.models import create_model_dict
            from marker.output import text_from_rendered

            converter = PdfConverter(artifact_dict=create_model_dict())
            rendered = converter(str(pdf_path))
            full_text, _, marker_images = text_from_rendered(rendered)

            # Save images extracted by marker (best-effort per image).
            if marker_images:
                images_dir.mkdir(exist_ok=True)
                for img_name, img_data in marker_images.items():
                    try:
                        img_dest = images_dir / img_name
                        if isinstance(img_data, Image.Image):
                            img_data.save(str(img_dest))
                        elif isinstance(img_data, bytes) and img_data:
                            img_dest.write_bytes(img_data)
                        result["images"].append(str(img_dest))
                    except Exception:
                        pass  # one broken image must not abort conversion

            # BUG FIX: the replacement string used to be empty (rf''), which
            # silently *deleted* every local image link instead of fixing its
            # path. Prefix relative links with the images directory so they
            # resolve from the .md file's location.
            full_text = re.sub(
                r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
                rf'![\1]({pdf_path.stem}_images/\2)',
                full_text,
            )
            text_sections.append(full_text)

        except ImportError:
            # marker-pdf not installed: plain PyMuPDF text extraction.
            pages_text = [t for t in (p.get_text().strip() for p in doc) if t]
            text_sections.append("\n\n---\n\n".join(pages_text))

        # ── 3) render diagram pages to PNG ─────────────────────────────────
        if diagram_page_nums:
            images_dir.mkdir(exist_ok=True)
            diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]

            for page_num in diagram_page_nums:
                page = doc[page_num - 1]
                img_name = f"page_{page_num}.png"
                img_path = images_dir / img_name
                _render_page_png(page, img_path)
                result["images"].append(str(img_path))
                diagram_section_lines.append(
                    f"\n### Page {page_num}\n"
                    f"![Page {page_num} — 다이어그램]"
                    f"({pdf_path.stem}_images/{img_name})\n"
                )
                # Attach the rendered image path to the page entry.
                for p in result["pages"]:
                    if p["n"] == page_num:
                        p["image"] = str(img_path)

            text_sections.append("".join(diagram_section_lines))

        # ── 4) write the markdown file ─────────────────────────────────────
        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
        md_path.write_text(final_md, encoding="utf-8")

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        import traceback
        traceback.print_exc()
    finally:
        # FIX: the original only closed the document on the success path,
        # leaking the handle whenever an exception fired after fitz.open().
        if doc is not None:
            doc.close()

    return result
|
||||
26
requirements.txt
Normal file
26
requirements.txt
Normal file
@@ -0,0 +1,26 @@
|
||||
# doc2md 필수 패키지
|
||||
# pip install -r requirements.txt
|
||||
|
||||
# PDF 변환 (텍스트/이미지 혼합)
|
||||
marker-pdf>=1.0.0
|
||||
|
||||
# PDF 페이지 분석 + 렌더링
|
||||
PyMuPDF>=1.23.0
|
||||
|
||||
# 이미지 처리 (다이어그램 감지)
|
||||
Pillow>=10.0.0
|
||||
|
||||
# XML 파싱 (HML, HWPX) — 표준 라이브러리 xml.etree로도 대체 가능
|
||||
lxml>=4.9.0
|
||||
|
||||
# HTML 파싱 (HWP pyhwp fallback)
|
||||
beautifulsoup4>=4.12.0
|
||||
|
||||
# HTML → MD
|
||||
html2text>=2020.1.16
|
||||
|
||||
# HWP 변환 fallback (한컴오피스 미설치 환경)
|
||||
pyhwp>=0.1.0b19
|
||||
|
||||
# Windows 전용: HWP COM 자동화 (한컴오피스 설치 시 자동 사용)
|
||||
# pywin32>=306
|
||||
Reference in New Issue
Block a user