Files
documan/debug_marker_images.py
minsung 8d4339302e feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00

96 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Debug marker-pdf image extraction
"""
import os
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
def _payload_size(payload):
    """Return ``len(payload)``, ``0`` for ``None``, or ``'N/A'`` if unsized."""
    if payload is None:
        return 0
    try:
        return len(payload)
    except TypeError:
        # Image values may be image objects rather than raw bytes; an object
        # without __len__ must not abort the rest of the report.
        return "N/A"


def _report_returned_images(images):
    """Print name, type, size and a short preview for every returned image."""
    print("\n Detailed image info:")
    for position, (img_name, img_data) in enumerate(images.items(), start=1):
        size = _payload_size(img_data)
        print(f" {position}. Name: {img_name}")
        print(f" Data type: {type(img_data)}")
        print(f" Data size: {size} bytes")
        if size == 0:
            print(" WARNING: Empty data!")
            continue
        try:
            preview = img_data[:20]
        except TypeError:
            # Not sliceable (e.g. an image object): report it and carry on.
            print(" No byte preview: payload is not sliceable")
        else:
            print(f" First 20 bytes: {preview}")


def _report_rendered_structure(rendered):
    """Print every instance attribute of *rendered* whose name mentions 'image'."""
    print("\n Checking rendered object structure:")
    if not hasattr(rendered, '__dict__'):
        return
    for key, val in rendered.__dict__.items():
        if 'image' in key.lower():
            print(f" {key}: {type(val)}, length: {len(val) if hasattr(val, '__len__') else 'N/A'}")


def _report_direct_images(rendered):
    """Inspect up to three entries of ``rendered.images`` directly."""
    print("\n Attempting direct image access:")
    print(f" Total images in rendered: {len(rendered.images)}")
    for idx, (img_name, img_obj) in enumerate(list(rendered.images.items())[:3]):
        print(f"\n Image {idx + 1}: {img_name}")
        print(f" Type: {type(img_obj)}")
        print(f" Attributes: {dir(img_obj)}")
        if hasattr(img_obj, 'tobytes'):
            img_bytes = img_obj.tobytes()
            print(f" Bytes: {len(img_bytes)}")
        elif hasattr(img_obj, 'save'):
            print(" Has save method (PIL Image?)")


def debug_image_extraction(pdf_path: str):
    """Print diagnostics about the images produced when converting one PDF.

    The PDF is run through ``PdfConverter``; the function then prints the
    rendered object's type and attributes, up to three entries of
    ``rendered.images``, the values returned by ``text_from_rendered`` and,
    when present, details of the image objects themselves.

    Size and preview lookups tolerate image payloads that are not sized or
    sliceable, so one unexpected payload type no longer cuts the report short.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        None. All results are written to stdout. Any exception raised during
        conversion or inspection is printed with its traceback and is not
        re-raised.
    """
    pdf_file = Path(pdf_path)
    print(f"Debugging image extraction for: {pdf_file.name}")
    print("=" * 60)
    try:
        # Initialize converter
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )

        # Convert
        print("\nConverting PDF...")
        rendered = converter(pdf_path)
        print(f" Rendered type: {type(rendered)}")
        print(f" Rendered attributes: {dir(rendered)}")

        # Check what's in rendered
        if hasattr(rendered, 'images'):
            print(f"\n rendered.images exists: {len(rendered.images) if rendered.images else 0} images")
            if rendered.images:
                for idx, (key, val) in enumerate(list(rendered.images.items())[:3]):
                    print(f" Image {idx}: {key}, data size: {_payload_size(val)}")

        # Extract text and images
        print("\nExtracting text and images...")
        # NOTE(review): marker's README unpacks this call as
        # `text, _, images`; confirm the second value really is metadata.
        text, metadata, images = text_from_rendered(rendered)
        print(f"\n Text length: {len(text)} characters")
        print(f" Metadata: {type(metadata)}")
        print(f" Images dict: {len(images) if images else 0} items")

        if images:
            _report_returned_images(images)
        else:
            print("\n WARNING: No images returned!")
            # Nothing came back, so look for image data on the rendered object.
            _report_rendered_structure(rendered)

        # Try to access images directly from rendered
        if hasattr(rendered, 'images') and rendered.images:
            _report_direct_images(rendered)
    except Exception as e:
        # Debug entry point: report the failure instead of propagating it.
        print(f"\n ERROR: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Debug the first PDF in the input folder.
    import glob

    # glob.glob() returns matches in arbitrary order; sort them so the
    # "first" PDF is the same file on every run.
    pdf_files = sorted(glob.glob("input/*.pdf"))
    if pdf_files:
        debug_image_extraction(pdf_files[0])
    else:
        print("No PDF files found in input folder")