feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
minsung
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions

175
extract_images.py Normal file
View File

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Extract embedded images from PDF files
"""
import os
import glob
from pathlib import Path
def extract_images_pypdfium2(pdf_path: str, output_dir: str = "output") -> bool:
    """
    Extract embedded images from a PDF using pypdfium2 and save them as PNG.

    Images are written to ``<output_dir>/<pdf stem>_extracted_images`` and
    named ``page_<page>_img_<object index>.png`` (both 1-based). Images
    narrower or shorter than 50 pixels are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory under which the per-PDF image folder is created.

    Returns:
        True when the PDF was processed, even if no image was found.
        False when an import or the processing itself failed; the error is
        printed instead of being raised, so the caller can try a fallback.
    """
    try:
        import pypdfium2 as pdfium
        # ``Image`` is not referenced below; the import is kept so that a
        # missing Pillow is reported once by the outer handler.
        # NOTE(review): presumably bitmap.to_pil() depends on Pillow - confirm.
        from PIL import Image  # noqa: F401

        pdf_file = Path(pdf_path)
        images_dir = os.path.join(output_dir, f"{pdf_file.stem}_extracted_images")
        os.makedirs(images_dir, exist_ok=True)
        print(f"\nExtracting images from {pdf_file.name}...")

        # NOTE(review): newer pypdfium2 releases appear to expose the raw
        # FPDF_* constants only under ``pypdfium2.raw``; try the top-level
        # name first so older releases keep working - confirm against the
        # installed version.
        image_type = getattr(pdfium, "FPDF_PAGEOBJ_IMAGE", None)
        if image_type is None:
            image_type = pdfium.raw.FPDF_PAGEOBJ_IMAGE

        pdf = pdfium.PdfDocument(pdf_path)
        image_count = 0
        try:
            for page_num in range(len(pdf)):
                page = pdf[page_num]
                for obj_index, obj in enumerate(page.get_objects()):
                    if obj.type != image_type:
                        continue
                    try:
                        pil_image = obj.get_bitmap().to_pil()
                        # Skip very small images (likely noise or artifacts)
                        if pil_image.width < 50 or pil_image.height < 50:
                            continue
                        img_filename = f"page_{page_num + 1}_img_{obj_index + 1}.png"
                        pil_image.save(os.path.join(images_dir, img_filename))
                        # Count only after the file was actually written.
                        image_count += 1
                        print(f" Saved: {img_filename} ({pil_image.width}x{pil_image.height})")
                    except Exception as e:
                        # Best effort: one broken image must not abort the PDF.
                        print(f" Warning: Could not extract image {obj_index} from page {page_num + 1}: {e}")
        finally:
            # Release the document even if iteration fails part-way.
            pdf.close()

        if image_count > 0:
            print(f" OK Total {image_count} images extracted to: {images_dir}")
        else:
            print(f" INFO: No images found in {pdf_file.name}")
        return True
    except Exception as e:
        print(f" ERROR: Failed with pypdfium2: {e}")
        return False
def extract_images_pymupdf(pdf_path: str, output_dir: str = "output") -> bool:
    """
    Extract embedded images from a PDF using PyMuPDF (fitz) - fallback method.

    Image bytes are written unchanged to
    ``<output_dir>/<pdf stem>_extracted_images`` as
    ``page_<page>_img_<index>.<ext>`` (page and index are 1-based; ``ext`` is
    the extension reported by ``doc.extract_image``). Images whose payload is
    smaller than 1000 bytes are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory under which the per-PDF image folder is created.

    Returns:
        True when the PDF was processed, even if no image was found.
        False when PyMuPDF is not installed or processing failed; the error
        is printed instead of being raised.
    """
    try:
        import fitz  # PyMuPDF

        pdf_file = Path(pdf_path)
        images_dir = os.path.join(output_dir, f"{pdf_file.stem}_extracted_images")
        os.makedirs(images_dir, exist_ok=True)
        print(f"\nExtracting images from {pdf_file.name} using PyMuPDF...")

        doc = fitz.open(pdf_path)
        image_count = 0
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                for img_index, img_info in enumerate(page.get_images(full=True)):
                    # The first field of each entry is passed to extract_image.
                    xref = img_info[0]
                    try:
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        # Skip very small images
                        if len(image_bytes) < 1000:  # Less than 1KB
                            continue
                        img_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
                        with open(os.path.join(images_dir, img_filename), "wb") as f:
                            f.write(image_bytes)
                        # Count only after the file was actually written.
                        image_count += 1
                        print(f" Saved: {img_filename} ({len(image_bytes)} bytes)")
                    except Exception as e:
                        # Best effort: one broken image must not abort the PDF.
                        print(f" Warning: Could not extract image {img_index} from page {page_num + 1}: {e}")
        finally:
            # Release the document even if iteration fails part-way.
            doc.close()

        if image_count > 0:
            print(f" OK Total {image_count} images extracted to: {images_dir}")
        else:
            print(f" INFO: No images found in {pdf_file.name}")
        return True
    except ImportError:
        print(" ERROR: PyMuPDF not installed. Install with: pip install PyMuPDF")
        return False
    except Exception as e:
        print(f" ERROR: Failed with PyMuPDF: {e}")
        return False
def extract_images_from_pdf(pdf_path: str, output_dir: str = "output"):
    """
    Extract images from one PDF, falling back to a second backend on failure.

    pypdfium2 is attempted first; PyMuPDF is only tried when the first
    attempt reports failure. The result of whichever attempt ran last is
    returned.
    """
    # Try pypdfium2 first (already installed)
    primary_result = extract_images_pypdfium2(pdf_path, output_dir)
    if primary_result:
        return primary_result

    print("\nTrying PyMuPDF as fallback...")
    return extract_images_pymupdf(pdf_path, output_dir)
def extract_all_images(input_dir: str = "input", output_dir: str = "output"):
    """
    Run image extraction for every ``*.pdf`` file directly inside input_dir.

    Files are processed in sorted path order and a success/failure summary is
    printed at the end. When no PDF matches, a notice is printed and the
    function returns without doing anything else.
    """
    pdf_files = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return

    divider = "=" * 60
    print(f"Found {len(pdf_files)} PDF files")
    print(divider)

    # One truthiness flag per PDF, evaluated in order.
    outcomes = [bool(extract_images_from_pdf(path, output_dir)) for path in pdf_files]
    successful = sum(outcomes)
    failed = len(outcomes) - successful

    print("\n" + divider)
    print("Image extraction complete!")
    print(f" Successful: {successful}")
    print(f" Failed: {failed}")
    print(f" Total: {len(pdf_files)}")
# Script entry point: when executed directly (not imported), process every
# PDF using the default "input" and "output" directories.
if __name__ == "__main__":
    extract_all_images()