feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
minsung
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions

111
convert_pdfs.py Normal file
View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""
Batch PDF to Markdown converter using marker-pdf library
"""
import os
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
    """
    Convert a single PDF file to Markdown.

    The Markdown text is written to ``<output_dir>/<stem>.md``. Extracted
    images, if any, are written to ``<output_dir>/<stem>_images/`` and
    metadata, if any, to ``<output_dir>/<stem>_metadata.json``.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save the output (default: "output")

    Returns:
        True when every output was written; False when any conversion or
        write step raised (the error is printed instead of propagated so a
        batch run can continue with the next file).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name} to Markdown...")

    try:
        # Initialize the converter with model dictionary
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert the PDF file
        rendered = converter(pdf_path)

        # Extract text and images from rendered output
        text, metadata, images = text_from_rendered(rendered)
        if isinstance(metadata, str):
            # NOTE(review): marker's text_from_rendered appears to return the
            # output extension (e.g. "md") in the second slot, with the real
            # metadata on the rendered object - confirm against the installed
            # marker version. Without this, the JSON file would hold just "md".
            metadata = getattr(rendered, "metadata", None)

        # Save as markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" OK Output saved to: {output_path}")

        # Save images if any
        if images:
            images_dir = os.path.join(output_dir, f"{base_name}_images")
            os.makedirs(images_dir, exist_ok=True)
            for img_name, img_data in images.items():
                _write_image(img_data, os.path.join(images_dir, img_name))
            print(f" OK {len(images)} images saved to: {images_dir}")

        # Save metadata if available
        if metadata:
            import json

            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            print(f" OK Metadata saved to: {metadata_path}")

        return True
    except Exception as e:
        # Batch boundary: report the failure and let the caller count it.
        print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
        return False


def _write_image(image, path: str) -> None:
    """Write one extracted image to *path* (raw bytes or a PIL-style image)."""
    if isinstance(image, (bytes, bytearray, memoryview)):
        with open(path, "wb") as f:
            f.write(image)
        return
    # Image objects cannot be passed to file.write(); they must be encoded
    # through their own save() method. JPEG cannot store alpha or palette
    # modes, so those are flattened to RGB before saving.
    is_jpeg = path.lower().endswith((".jpg", ".jpeg"))
    if is_jpeg and getattr(image, "mode", None) in ("RGBA", "LA", "P"):
        image = image.convert("RGB")
    image.save(path)
def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
    """
    Run the PDF-to-Markdown conversion over every ``*.pdf`` in a directory.

    Files are processed in sorted path order and a success/failure summary
    is printed at the end. Nothing is returned.

    Args:
        input_dir: Directory that is searched (non-recursively) for PDFs
        output_dir: Directory handed to each single-file conversion
    """
    sources = glob.glob(os.path.join(input_dir, "*.pdf"))
    sources.sort()
    total = len(sources)

    # Nothing to do: report it and leave before printing any banner.
    if total == 0:
        print(f"No PDF files found in {input_dir}")
        return

    divider = "=" * 60
    print(f"Found {total} PDF files to convert")
    print(divider)

    # Tally outcomes keyed by the truthiness of each conversion result.
    tally = {True: 0, False: 0}
    for source in sources:
        tally[bool(convert_pdf_to_markdown(source, output_dir))] += 1

    print("\n" + divider)
    print("Conversion complete!")
    print(f" Successful: {tally[True]}")
    print(f" Failed: {tally[False]}")
    print(f" Total: {total}")
# Script entry point: when run directly (not imported), convert every PDF
# using convert_all_pdfs() with its default "input" and "output" directories.
if __name__ == "__main__":
    convert_all_pdfs()