feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
99
debug_single_page.py
Normal file
99
debug_single_page.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug image extraction for a single page
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from marker.converters.pdf import PdfConverter
|
||||
from marker.models import create_model_dict
|
||||
from marker.output import text_from_rendered
|
||||
import pypdfium2 as pdfium
|
||||
|
||||
|
||||
def _find_image_object_type():
    """Return pypdfium2's page-object type constant for images, or None.

    The constant is looked up on the top-level package first and then on the
    ``raw`` submodule.
    NOTE(review): assumes newer pypdfium2 releases expose the constant as
    ``pypdfium2.raw.FPDF_PAGEOBJ_IMAGE`` - confirm against the installed version.
    """
    for namespace in (pdfium, getattr(pdfium, "raw", None)):
        image_type = getattr(namespace, "FPDF_PAGEOBJ_IMAGE", None)
        if image_type is not None:
            return image_type
    return None


def _is_empty_image(img_data) -> bool:
    """Return True when *img_data* carries no pixel or byte content."""
    if img_data is None:
        return True
    size = getattr(img_data, "size", None)
    if isinstance(size, tuple):
        # Image-like object: empty when any dimension is zero.
        return 0 in size
    try:
        return len(img_data) == 0
    except TypeError:
        # Unknown object without a length: do not report it as empty.
        return False


def _describe_image(img_data) -> str:
    """Return a short size description for raw bytes or an image-like object."""
    if img_data is None:
        return "0 bytes"
    size = getattr(img_data, "size", None)
    if isinstance(size, tuple) and len(size) == 2:
        return f"{size[0]}x{size[1]} px"
    try:
        return f"{len(img_data)} bytes"
    except TypeError:
        return f"<{type(img_data).__name__}>"


def _save_image(img_data, img_path: str) -> None:
    """Write *img_data* to *img_path*, whether it is bytes or has ``save()``."""
    if hasattr(img_data, "save"):
        # Image-like object (presumably a PIL image - verify against marker).
        img_data.save(img_path)
    else:
        with open(img_path, "wb") as f:
            f.write(img_data)


def _inspect_page_objects(pdf_path: str, page_num: int) -> None:
    """Print the page objects PyPDFium2 reports for the 1-indexed *page_num*."""
    print("\n1. Checking with PyPDFium2:")
    try:
        pdf = pdfium.PdfDocument(pdf_path)
        try:
            page = pdf[page_num - 1]  # 0-indexed
            # Resolve the constant once instead of once per object.
            image_type = _find_image_object_type()

            print(f" Page {page_num} objects:")
            obj_count = 0
            for obj in page.get_objects():
                obj_count += 1
                if image_type is not None and obj.type == image_type:
                    print(" - Image object found")
                else:
                    print(f" - Object type: {obj.type}")

            print(f" Total objects on page: {obj_count}")
        finally:
            # Close the document even when inspecting the page fails.
            pdf.close()
    except Exception as e:
        # Best-effort debug output: report and continue with the next stage.
        print(f" PyPDFium2 error: {e}")


def _inspect_marker_output(pdf_path: str, page_num: int) -> None:
    """Convert the 1-indexed *page_num* with marker-pdf and report its images."""
    import traceback

    print("\n2. Checking with marker-pdf:")
    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
            # Restrict conversion to the requested page (marker uses 0-indexed
            # page numbers) instead of converting the whole document.
            config={"page_range": [page_num - 1]},
        )

        print(" Converting...")
        rendered = converter(pdf_path)

        # Check rendered object
        print(f"\n Rendered type: {type(rendered)}")

        if hasattr(rendered, "images"):
            rendered_images = rendered.images
            print(f" rendered.images: {len(rendered_images) if rendered_images else 0} images")
            if rendered_images:
                # Only the first five entries, to keep the output short.
                for img_name, img_data in list(rendered_images.items())[:5]:
                    print(f" - {img_name}: {_describe_image(img_data)}")

        # Extract using text_from_rendered
        print("\n3. Extracting with text_from_rendered:")
        _text, _second, images = text_from_rendered(rendered)

        print(f" Extracted images: {len(images) if images else 0}")
        if not images:
            return

        for img_name, img_data in images.items():
            print(f" - {img_name}: {_describe_image(img_data)}")
            if _is_empty_image(img_data):
                print(" ⚠️ WARNING: Empty image data!")

        # Save the first non-empty image as a test artifact
        output_dir = "output/debug_test"
        os.makedirs(output_dir, exist_ok=True)

        for img_name, img_data in images.items():
            if _is_empty_image(img_data):
                continue
            img_path = os.path.join(output_dir, img_name)
            _save_image(img_data, img_path)
            print(f"\n ✓ Saved test image: {img_path}")
            break

    except Exception as e:
        # Best-effort debug output: show the failure with its traceback.
        print(f" marker-pdf error: {e}")
        traceback.print_exc()


def debug_single_page(pdf_path: str, page_num: int = 1):
    """Debug image extraction for a specific page of a PDF.

    Prints a report in three stages: the page objects PyPDFium2 sees, the
    images on the marker-pdf render result, and the images returned by
    ``text_from_rendered``. The first non-empty extracted image is written to
    ``output/debug_test``. Errors in a stage are printed, not raised.

    Args:
        pdf_path: Path to the PDF file to inspect.
        page_num: Page to inspect, 1-indexed. Values below 1 are rejected
            with a printed message.

    Returns:
        None. All results are reported on stdout.
    """
    pdf_file = Path(pdf_path)
    print(f"Debugging page {page_num} of: {pdf_file.name}")
    print("=" * 60)

    if page_num < 1:
        # A value below 1 would become a negative 0-based index.
        print(f" Invalid page_num {page_num}: pages are 1-indexed, so it must be >= 1")
        return

    _inspect_page_objects(pdf_path, page_num)
    _inspect_marker_output(pdf_path, page_num)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import glob

    # glob returns matches in arbitrary order; sort so the same PDF is
    # picked on every run.
    pdf_files = sorted(glob.glob("input/*.pdf"))
    if pdf_files:
        # Test page 2 (should have Figure 1.2, 1.3 according to the markdown)
        debug_single_page(pdf_files[0], page_num=2)
    else:
        print("No PDF files found in input folder")
|
||||
Reference in New Issue
Block a user