feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions
--- a/convert_pdfs.py
+++ b/convert_pdfs.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Batch PDF to Markdown converter using marker-pdf library
+"""
+
+import os
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+
+
+# Cache for the result of create_model_dict() so that a batch run builds it
+# once instead of once per PDF.
+_MODEL_DICT = None
+
+
+def _get_model_dict():
+    """Return the shared model dictionary, creating it on first use."""
+    global _MODEL_DICT
+    if _MODEL_DICT is None:
+        _MODEL_DICT = create_model_dict()
+    return _MODEL_DICT
+
+
+def _save_images(images, images_dir: str) -> None:
+    """Write every extracted image into ``images_dir``.
+
+    Args:
+        images: Mapping of image file name to image payload.
+        images_dir: Directory that receives the image files (created if needed).
+    """
+    os.makedirs(images_dir, exist_ok=True)
+    for img_name, img_data in images.items():
+        img_path = os.path.join(images_dir, img_name)
+        if hasattr(img_data, "save"):
+            # Image objects (e.g. PIL images) cannot be passed to f.write();
+            # let the object encode itself, using the file extension as format.
+            img_data.save(img_path)
+        else:
+            # Payload is already encoded bytes.
+            with open(img_path, "wb") as f:
+                f.write(img_data)
+
+
+def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output") -> bool:
+    """
+    Convert a single PDF file to Markdown.
+
+    Writes ``<stem>.md`` into ``output_dir`` and, when present, the extracted
+    images into ``<stem>_images/`` and the metadata into
+    ``<stem>_metadata.json``.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save the output (default: "output")
+
+    Returns:
+        True when every output was written, False when any step raised.
+        Errors are printed rather than propagated so that a batch run can
+        continue with the next file.
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name} to Markdown...")
+
+    try:
+        # A converter is built per file, but on top of the cached model dict.
+        converter = PdfConverter(
+            artifact_dict=_get_model_dict(),
+        )
+
+        # Convert the PDF file
+        rendered = converter(pdf_path)
+
+        # Extract text and images from rendered output
+        text, metadata, images = text_from_rendered(rendered)
+
+        # Save as markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        print(f"  OK Output saved to: {output_path}")
+
+        # Save images if any
+        if images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            _save_images(images, images_dir)
+            print(f"  OK {len(images)} images saved to: {images_dir}")
+
+        # Save metadata if available
+        # NOTE(review): recent marker releases appear to return the output
+        # format string (e.g. "md") in this slot rather than document
+        # metadata -- confirm against the installed version and consider
+        # reading rendered.metadata instead.
+        if metadata:
+            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+            import json
+            with open(metadata_path, "w", encoding="utf-8") as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+            print(f"  OK Metadata saved to: {metadata_path}")
+
+        return True
+
+    except Exception as e:
+        # Batch boundary: report the failure and let the caller move on.
+        print(f"  ERROR: Failed to convert {pdf_file.name}: {e}")
+        return False
+
+
+def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
+    """
+    Run the PDF-to-Markdown conversion over every ``*.pdf`` in a directory.
+
+    Files are processed in sorted order and a success/failure summary is
+    printed at the end. When the directory holds no PDF files, a notice is
+    printed and nothing else happens.
+
+    Args:
+        input_dir: Directory that is searched (non-recursively) for PDFs
+        output_dir: Directory handed to the per-file converter for its output
+    """
+    sources = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
+
+    # Guard clause: nothing to convert.
+    if not sources:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    separator = "=" * 60
+    print(f"Found {len(sources)} PDF files to convert")
+    print(separator)
+
+    # One truthy/falsy outcome per input file, in processing order.
+    outcomes = [convert_pdf_to_markdown(source, output_dir) for source in sources]
+    succeeded = sum(1 for outcome in outcomes if outcome)
+    failures = len(outcomes) - succeeded
+
+    print("\n" + separator)
+    print("Conversion complete!")
+    print(f"  Successful: {succeeded}")
+    print(f"  Failed: {failures}")
+    print(f"  Total: {len(sources)}")
+
+# Script entry point: run the batch conversion with its default directories
+# ("input" -> "output") when this file is executed directly.
+if __name__ == "__main__":
+    convert_all_pdfs()