feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions
--- a/convert_pdfs_fast.py
+++ b/convert_pdfs_fast.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+Fast PDF to Markdown converter - optimized for text-heavy documents
+"""
+
+import argparse
+import os
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from marker.config.parser import ConfigParser
+
+
+# Loading the marker models is by far the slowest step, so the model
+# dictionary is created lazily on first use and then shared by every
+# conversion performed in this process.
+_MODEL_CACHE = {}
+
+
+def _load_models_once():
+    """Return the marker model dictionary, creating it on the first call only."""
+    if "models" not in _MODEL_CACHE:
+        # Only a successful load is cached; a failure propagates to the caller
+        # and the next call simply tries again.
+        _MODEL_CACHE["models"] = create_model_dict()
+    return _MODEL_CACHE["models"]
+
+
+def _save_image(image, image_path: str) -> None:
+    """Write one extracted image, given either raw bytes or a PIL-style image object."""
+    if isinstance(image, (bytes, bytearray, memoryview)):
+        with open(image_path, "wb") as f:
+            f.write(image)
+        return
+
+    # NOTE(review): marker appears to hand back PIL Image objects here (its own
+    # save_output helper calls img.save); writing them with f.write() raises
+    # TypeError, so they are saved through the image API instead.
+    is_jpeg = image_path.lower().endswith((".jpg", ".jpeg"))
+    if is_jpeg and getattr(image, "mode", "RGB") != "RGB":
+        # JPEG cannot store alpha/palette data; convert before saving.
+        image = image.convert("RGB")
+    image.save(image_path)
+
+
+def convert_pdf_to_markdown_fast(pdf_path: str, output_dir: str = "output", languages: str = None):
+    """
+    Convert one PDF file to Markdown and save it (plus any extracted images).
+
+    The Markdown is written to ``<output_dir>/<pdf stem>.md``; images, if any,
+    are written to ``<output_dir>/<pdf stem>_images/``. Models are loaded once
+    per process and reused across calls.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save the output (default: "output")
+        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en");
+            surrounding whitespace and empty entries are ignored
+
+    Returns:
+        A ``(success, file_name)`` tuple. Any exception raised during the
+        conversion or while saving is reported on stdout and yields
+        ``(False, file_name)`` so that a batch run can continue.
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name} to Markdown...")
+    if languages:
+        print(f"  Languages: {languages}")
+
+    try:
+        # Text-focused configuration. Add "disable_image_extraction": True
+        # here to skip image extraction entirely.
+        config = {
+            "output_format": "markdown",
+        }
+
+        if languages:
+            # Tolerate input such as "ko, en" or a trailing comma.
+            codes = [code.strip() for code in languages.split(",") if code.strip()]
+            if codes:
+                config["languages"] = codes
+
+        config_parser = ConfigParser(config)
+
+        converter = PdfConverter(
+            config=config_parser.generate_config_dict(),
+            artifact_dict=_load_models_once(),
+            processor_list=config_parser.get_processors(),
+            renderer=config_parser.get_renderer(),
+        )
+
+        # Convert the PDF file
+        rendered = converter(pdf_path)
+
+        # NOTE(review): the second value looks like the output extension rather
+        # than document metadata - confirm against marker.output. It is unused.
+        text, _ext, images = text_from_rendered(rendered)
+
+        # Save as markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        print(f"  OK Output saved to: {output_path}")
+
+        # Save images
+        if images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            os.makedirs(images_dir, exist_ok=True)
+            for img_name, img_data in images.items():
+                _save_image(img_data, os.path.join(images_dir, img_name))
+            print(f"  OK {len(images)} images saved to: {images_dir}")
+
+        # Document metadata is intentionally not written, for speed.
+
+        return (True, pdf_file.name)
+
+    except Exception as e:
+        # Batch boundary: report the failure and let the caller move on.
+        print(f"  ERROR: Failed to convert {pdf_file.name}: {e}")
+        return (False, pdf_file.name)
+
+
+def convert_all_pdfs_fast(input_dir: str = "input", output_dir: str = "output", languages: str = None):
+    """
+    Convert every ``*.pdf`` directly inside ``input_dir`` to Markdown, one at a time.
+
+    Files are processed in sorted path order. Progress and a final summary
+    (successful / failed / total, plus the names of failed files) are printed
+    to stdout. Returns ``None``.
+
+    Args:
+        input_dir: Directory containing PDF files
+        output_dir: Directory to save the output
+        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
+    """
+    pdf_paths = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
+    total = len(pdf_paths)
+
+    # Nothing to do: say so and stop before printing any banner.
+    if total == 0:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    separator = "=" * 60
+    print(f"Found {total} PDF files to convert")
+    print("Mode: FAST (text-focused, sequential processing)")
+    if languages:
+        print(f"Languages: {languages}")
+    print(separator)
+
+    # Only failures are recorded; the success count is derived from the total.
+    failures = []
+    for position, pdf_path in enumerate(pdf_paths, start=1):
+        print(f"\n[{position}/{total}]", end=" ")
+        ok, name = convert_pdf_to_markdown_fast(pdf_path, output_dir, languages)
+        if not ok:
+            failures.append(name)
+
+    print("\n" + separator)
+    print("Conversion complete!")
+    print(f"  Successful: {total - len(failures)}")
+    print(f"  Failed: {len(failures)}")
+    print(f"  Total: {total}")
+
+    if failures:
+        print("\nFailed files:")
+        for name in failures:
+            print(f"  - {name}")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: every option is an optional string flag.
+    cli = argparse.ArgumentParser(description="Fast PDF to Markdown converter")
+    option_specs = (
+        ("--input_dir", "input", "Input directory containing PDF files"),
+        ("--output_dir", "output", "Output directory for markdown files"),
+        ("--languages", None, "Comma-separated language codes for OCR (e.g. ko, ko,en)"),
+    )
+    for flag, default_value, help_text in option_specs:
+        cli.add_argument(flag, default=default_value, help=help_text)
+    options = cli.parse_args()
+
+    convert_all_pdfs_fast(options.input_dir, options.output_dir, options.languages)