#!/usr/bin/env python3
"""
PDF to Markdown converter with cropped figure extraction.

Uses marker-pdf to detect figures, then crops them from page images.
Supports 2-column (multi-column) → single-column reordering.
"""

import glob
import io
import json
import os
import re
import subprocess
import sys
import traceback
from pathlib import Path

from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from PIL import Image
import fitz  # PyMuPDF

# Fewer stripped characters than this across the sampled pages => scanned PDF.
_MIN_TEXT_CHARS = 50

# Tolerance (in PDF points) around the page midline used for column detection.
_COLUMN_TOLERANCE = 30

# Zoom factor applied when rasterising a page before cropping figures.
_RENDER_SCALE = 2

# Markdown image references whose target does not start with "/" or ")".
_IMAGE_REF_RE = re.compile(r'!\[([^\]]*)\]\(([^/)][^)]*)\)')


def is_scanned_pdf(pdf_path: str, sample_pages: int = 3) -> bool:
    """Return True when the PDF appears to be a scan (no selectable text).

    Args:
        pdf_path: Path to the PDF file.
        sample_pages: Maximum number of leading pages to inspect.

    Returns:
        True if the inspected pages together contain fewer than
        ``_MIN_TEXT_CHARS`` characters of stripped text.
    """
    doc = fitz.open(pdf_path)
    try:
        total = min(sample_pages, len(doc))
        text_chars = sum(len(doc[i].get_text().strip()) for i in range(total))
    finally:
        # Close the document even if text extraction raises.
        doc.close()
    return text_chars < _MIN_TEXT_CHARS


def _order_blocks_for_reading(text_blocks, page_width):
    """Return the text blocks of one page in single-column reading order.

    Each block is a PyMuPDF "blocks" tuple; indices 0-3 are used as
    x0, y0, x1, y1. A page is treated as two-column when no block crosses
    the tolerance band around the midline and each side has at least two
    blocks. Every block is assigned to exactly one column (by its horizontal
    centre) so that a narrow block sitting on the midline is not emitted
    twice.
    """
    mid_x = page_width / 2
    band_left = mid_x - _COLUMN_TOLERANCE
    band_right = mid_x + _COLUMN_TOLERANCE

    left, right = [], []
    has_spanning_block = False
    for block in text_blocks:
        if block[0] < band_left and block[2] > band_right:
            # A block wider than the gutter band means a single-column layout.
            has_spanning_block = True
            break
        if (block[0] + block[2]) / 2 < mid_x:
            left.append(block)
        else:
            right.append(block)

    if not has_spanning_block and len(left) >= 2 and len(right) >= 2:
        left.sort(key=lambda b: b[1])
        right.sort(key=lambda b: b[1])
        return left + right

    # Single column (or undecidable): top-to-bottom, then left-to-right.
    return sorted(text_blocks, key=lambda b: (b[1], b[0]))


def reorder_text_by_columns(pdf_path: str) -> str:
    """Extract text from a text-based PDF, flattening two columns into one.

    Uses PyMuPDF block coordinates. On a two-column page the whole left
    column is read first, then the whole right column. Pages without any
    non-empty text block are skipped.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text of all pages; blocks are separated by blank lines and pages
        by a ``---`` line.
    """
    pages_text = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            blocks = page.get_text("blocks", sort=False)
            # Keep blocks whose type field (index 6) is 0 and whose text
            # (index 4) is not blank.
            text_blocks = [b for b in blocks if b[6] == 0 and b[4].strip()]
            if not text_blocks:
                continue
            ordered = _order_blocks_for_reading(text_blocks, page.rect.width)
            pages_text.append("\n\n".join(b[4].strip() for b in ordered))
    finally:
        doc.close()
    return "\n\n---\n\n".join(pages_text)


def extract_figure_images(pdf_path: str, rendered, output_dir: str, base_name: str):
    """Crop figures out of rendered page images using marker's detections.

    Args:
        pdf_path: Path to PDF file.
        rendered: Marker's rendered output. Only ``rendered.pages`` is read;
            each page's ``images`` entries are expected to carry a ``bbox``
            of ``(x0, y0, x1, y1)``.
        output_dir: Output directory (unused here; kept for the interface).
        base_name: Base filename (unused here; kept for the interface).

    Returns:
        dict: Mapping of image names to PNG-encoded image bytes. Empty when
        ``rendered`` is None or exposes no page/figure information.
    """
    # NOTE(review): whether marker's rendered object exposes `pages` depends on
    # the installed marker version - confirm. When it does not, nothing is
    # cropped and the caller falls back to marker's own images.
    images_dict = {}

    pages = getattr(rendered, 'pages', None)
    if not pages:
        print(" No page information in rendered output")
        return images_dict

    doc = fitz.open(pdf_path)
    try:
        print(f" Processing {len(pages)} pages for figure extraction...")
        page_count = len(doc)

        for page_idx, page_data in enumerate(pages):
            page_num = page_idx + 1

            figures = getattr(page_data, 'images', None)
            if not figures:
                continue

            if page_idx >= page_count:
                # Rendered output lists more pages than the PDF contains.
                print(f" Warning: Page {page_num} is not in the PDF; skipping remaining pages")
                break

            print(f" Page {page_num}: Found {len(figures)} figure(s)")

            # Rasterise the page at a higher resolution for better crops.
            matrix = fitz.Matrix(_RENDER_SCALE, _RENDER_SCALE)
            pix = doc[page_idx].get_pixmap(matrix=matrix)
            page_img = Image.open(io.BytesIO(pix.tobytes("png")))
            img_w, img_h = page_img.size

            for fig_idx, fig_info in enumerate(figures):
                try:
                    if not hasattr(fig_info, 'bbox'):
                        continue

                    # assumes bbox is in PDF page coordinates - TODO confirm.
                    # Scale to the rendered image and clamp to its bounds.
                    x0, y0, x1, y1 = fig_info.bbox
                    x0 = max(0, int(x0 * _RENDER_SCALE))
                    y0 = max(0, int(y0 * _RENDER_SCALE))
                    x1 = min(img_w, int(x1 * _RENDER_SCALE))
                    y1 = min(img_h, int(y1 * _RENDER_SCALE))
                    if x1 <= x0 or y1 <= y0:
                        print(f" Warning: Could not crop figure {fig_idx + 1}: empty bounding box")
                        continue

                    cropped = page_img.crop((x0, y0, x1, y1))

                    img_bytes = io.BytesIO()
                    cropped.save(img_bytes, format='PNG')

                    img_name = f"_page_{page_num}_Figure_{fig_idx + 1}.png"
                    images_dict[img_name] = img_bytes.getvalue()

                    print(f" Cropped figure {fig_idx + 1}: {x1-x0}x{y1-y0}px")
                except Exception as e:
                    # Best effort: one bad figure must not abort the others.
                    print(f" Warning: Could not crop figure {fig_idx + 1}: {e}")
    finally:
        doc.close()

    return images_dict


def _prefix_image_paths(text: str, base_name: str) -> str:
    """Prefix markdown image targets with the ``{base_name}_images/`` folder.

    Spaces in the folder name are encoded as ``%20`` to avoid path-parsing
    errors in Obsidian (CommonMark). A replacement function is used so that
    backslashes in ``base_name`` are not interpreted as regex escapes.
    """
    folder = base_name.replace(' ', '%20') + "_images"

    def _rewrite(match):
        return f"![{match.group(1)}]({folder}/{match.group(2)})"

    return _IMAGE_REF_RE.sub(_rewrite, text)


def _save_marker_images(images_dir: str, marker_images) -> int:
    """Write marker's own images into ``images_dir``; return how many were saved."""
    saved_count = 0
    for img_name, img_data in marker_images.items():
        try:
            if isinstance(img_data, Image.Image):
                buffer = io.BytesIO()
                img_data.save(buffer, format='PNG')
                img_bytes = buffer.getvalue()
            else:
                img_bytes = img_data

            if img_bytes:
                img_path = os.path.join(images_dir, img_name)
                with open(img_path, "wb") as f:
                    f.write(img_bytes)
                saved_count += 1
        except Exception as e:
            # Best effort: keep going with the remaining images.
            print(f" Warning: Could not save {img_name}: {e}")
    return saved_count


def convert_pdf_with_cropped_images(pdf_path: str, output_dir: str = "output"):
    """Convert one PDF to Markdown with cropped figure images.

    - Scanned PDF: marker-pdf OCR + layout detection (columns handled by marker).
    - Text PDF: PyMuPDF block-coordinate based 2-column → 1-column reordering.

    Writes ``{stem}.md``, optionally a ``{stem}_images/`` folder and
    ``{stem}_metadata.json`` into ``output_dir``.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Directory that receives the outputs (created if missing).

    Returns:
        True on success, False if any step raised (the traceback is printed).
    """
    os.makedirs(output_dir, exist_ok=True)

    pdf_file = Path(pdf_path)
    base_name = pdf_file.stem

    print(f"\nConverting {pdf_file.name}...")

    scanned = is_scanned_pdf(pdf_path)
    print(f" PDF type: {'scanned (OCR)' if scanned else 'text-based (PyMuPDF column reorder)'}")

    try:
        if not scanned:
            # Text-based PDF: extract with PyMuPDF and reorder the columns.
            print(" Extracting text with column reordering...")
            text = reorder_text_by_columns(pdf_path)
            metadata = None
            marker_images = {}
            # No marker run on this path; must still be bound because figure
            # extraction below receives it (previously an UnboundLocalError).
            rendered = None
        else:
            # Scanned PDF: marker-pdf performs OCR and layout detection.
            converter = PdfConverter(
                artifact_dict=create_model_dict(),
            )
            print(" Running marker-pdf OCR and layout detection...")
            rendered = converter(pdf_path)
            text, metadata, marker_images = text_from_rendered(rendered)

        # Point image references at the {base_name}_images/ folder.
        text = _prefix_image_paths(text, base_name)

        # Save markdown
        output_path = os.path.join(output_dir, f"{base_name}.md")
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f" OK Markdown saved: {output_path}")

        # Extract cropped figure images
        print(" Extracting figures from pages...")
        cropped_images = extract_figure_images(pdf_path, rendered, output_dir, base_name)

        images_dir = os.path.join(output_dir, f"{base_name}_images")
        if cropped_images:
            os.makedirs(images_dir, exist_ok=True)
            for img_name, img_data in cropped_images.items():
                img_path = os.path.join(images_dir, img_name)
                with open(img_path, "wb") as f:
                    f.write(img_data)
            print(f" OK {len(cropped_images)} figures saved to: {images_dir}")
        else:
            print(" ! No figures extracted (trying alternative method...)")
            # Fallback: use marker's images if available
            if marker_images:
                os.makedirs(images_dir, exist_ok=True)
                saved_count = _save_marker_images(images_dir, marker_images)
                if saved_count > 0:
                    print(f" OK {saved_count} images from marker saved")
                else:
                    print(" ! No valid images to save")

        # Save metadata
        if metadata:
            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
            with open(metadata_path, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

        return True

    except Exception as e:
        # Top-level boundary for a single file: report and signal failure.
        print(f" ERROR: {e}")
        traceback.print_exc()
        return False


def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
    """Convert every ``*.pdf`` in ``input_dir`` with cropped figure extraction.

    Each PDF is converted in a separate process (this script re-invoked with
    ``--single``) to avoid multiprocessing issues. A summary is printed at
    the end.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory passed to each child conversion.
    """
    pdf_pattern = os.path.join(input_dir, "*.pdf")
    pdf_files = sorted(glob.glob(pdf_pattern))

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    print(f"Found {len(pdf_files)} PDF file(s)")
    print("=" * 60)

    successful = 0
    failed = 0

    # Absolute path so the child finds the script regardless of how this
    # process was launched.
    script_path = os.path.abspath(__file__)

    for pdf_file in pdf_files:
        print(f"\nStarting conversion of: {os.path.basename(pdf_file)}")
        result = subprocess.run(
            [sys.executable, script_path, "--single", pdf_file, output_dir],
            capture_output=False
        )
        if result.returncode == 0:
            successful += 1
        else:
            failed += 1
            print(f" FAILED: {os.path.basename(pdf_file)}")

    print("\n" + "=" * 60)
    print("Conversion complete!")
    print(f" Successful: {successful}")
    print(f" Failed: {failed}")
    print(f" Total: {len(pdf_files)}")


if __name__ == "__main__":
    # Single-file mode is how convert_all_pdfs() invokes this script.
    if len(sys.argv) >= 4 and sys.argv[1] == "--single":
        pdf_file = sys.argv[2]
        output_dir = sys.argv[3]
        success = convert_pdf_with_cropped_images(pdf_file, output_dir)
        sys.exit(0 if success else 1)
    else:
        # Normal batch mode
        convert_all_pdfs()