feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
minsung
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions

150
convert_pdfs_fast.py Normal file
View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Fast PDF to Markdown converter - optimized for text-heavy documents
"""
import argparse
import os
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
def _save_image(img_data, img_path: str) -> None:
    """Write one extracted image to ``img_path``.

    Objects exposing a ``save()`` method (PIL-style images) are saved through
    it; anything else is treated as a bytes-like payload and written verbatim.
    """
    if hasattr(img_data, "save"):
        # NOTE(review): marker's markdown renderer is expected to hand back
        # PIL image objects rather than raw bytes - confirm against the
        # installed marker version.
        is_jpeg = img_path.lower().endswith((".jpg", ".jpeg"))
        if is_jpeg and getattr(img_data, "mode", "RGB") != "RGB":
            # JPEG cannot store alpha/palette modes, so normalise first.
            img_data = img_data.convert("RGB")
        img_data.save(img_path)
    else:
        with open(img_path, "wb") as f:
            f.write(img_data)


def convert_pdf_to_markdown_fast(pdf_path: str, output_dir: str = "output", languages: str = None):
    """
    Convert a single PDF file to Markdown and write the result to disk.

    The markdown is written to ``<output_dir>/<pdf stem>.md``; any extracted
    images go to ``<output_dir>/<pdf stem>_images/``. Metadata is not written.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save the output (default: "output")
        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en").
            Whitespace around each code and empty entries are ignored.

    Returns:
        A ``(success, filename)`` tuple. Any exception raised while converting
        or saving is caught, reported on stdout, and turned into
        ``(False, filename)`` so a batch run can continue with the next file.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name} to Markdown...")
    if languages:
        print(f" Languages: {languages}")

    try:
        # Configure for speed - text-focused processing.
        # Add "disable_image_extraction": True here to skip all images.
        config = {
            "output_format": "markdown",
        }
        if languages:
            # Tolerate input such as "ko, en" or a trailing comma.
            lang_codes = [code.strip() for code in languages.split(",") if code.strip()]
            if lang_codes:
                config["languages"] = lang_codes
        config_parser = ConfigParser(config)

        # Initialize the converter with optimized settings
        converter = PdfConverter(
            config=config_parser.generate_config_dict(),
            artifact_dict=create_model_dict(),
            processor_list=config_parser.get_processors(),
            renderer=config_parser.get_renderer(),
        )

        # Convert the PDF file
        rendered = converter(pdf_path)

        # Extract text and images; the middle element is not needed here.
        text, _unused, images = text_from_rendered(rendered)

        # Save as markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" OK Output saved to: {output_path}")

        # Save images
        if images:
            images_dir = os.path.join(output_dir, f"{base_name}_images")
            os.makedirs(images_dir, exist_ok=True)
            for img_name, img_data in images.items():
                _save_image(img_data, os.path.join(images_dir, img_name))
            print(f" OK {len(images)} images saved to: {images_dir}")

        # Metadata is intentionally not saved, for speed.
        return (True, pdf_file.name)
    except Exception as e:
        # Per-file boundary: report the failure and let the caller move on.
        print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
        return (False, pdf_file.name)
def convert_all_pdfs_fast(input_dir: str = "input", output_dir: str = "output", languages: str = None):
    """
    Convert every ``*.pdf`` file in a directory to Markdown, one at a time.

    Files are processed in sorted path order. Progress and a final summary
    (success/failure counts plus the names of failed files) are printed to
    stdout. Nothing is returned.

    Args:
        input_dir: Directory containing PDF files
        output_dir: Directory to save the output
        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
    """
    pdf_files = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    total = len(pdf_files)
    divider = "=" * 60

    print(f"Found {total} PDF files to convert")
    print("Mode: FAST (text-focused, sequential processing)")
    if languages:
        print(f"Languages: {languages}")
    print(divider)

    # Only failures are tracked; the success count is derived from the total.
    failed_files = []
    for position, pdf_file in enumerate(pdf_files, 1):
        print(f"\n[{position}/{total}]", end=" ")
        ok, filename = convert_pdf_to_markdown_fast(pdf_file, output_dir, languages)
        if not ok:
            failed_files.append(filename)

    failed = len(failed_files)
    print("\n" + divider)
    print("Conversion complete!")
    print(f" Successful: {total - failed}")
    print(f" Failed: {failed}")
    print(f" Total: {total}")

    if failed_files:
        print("\nFailed files:")
        for filename in failed_files:
            print(f" - {filename}")
if __name__ == "__main__":
    # Command-line entry point: read the directories and optional OCR
    # languages from the arguments, then run the batch conversion.
    cli = argparse.ArgumentParser(description="Fast PDF to Markdown converter")
    cli.add_argument(
        "--input_dir",
        default="input",
        help="Input directory containing PDF files",
    )
    cli.add_argument(
        "--output_dir",
        default="output",
        help="Output directory for markdown files",
    )
    cli.add_argument(
        "--languages",
        default=None,
        help="Comma-separated language codes for OCR (e.g. ko, ko,en)",
    )
    options = cli.parse_args()
    convert_all_pdfs_fast(
        input_dir=options.input_dir,
        output_dir=options.output_dir,
        languages=options.languages,
    )