- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
95 lines
2.8 KiB
Python
95 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Update image paths in markdown files to point to extracted images
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import glob
|
|
from pathlib import Path
|
|
|
|
|
|
def update_markdown_image_paths(md_path: str, output_dir: str = "output"):
    """Rewrite ``.jpeg`` image links in a markdown file to extracted PNGs.

    Every ``![](<path>.jpeg)`` reference whose path contains ``_page_<N>_``
    is replaced with ``![](<stem>_extracted_images/page_<N>_img_1.png)``,
    where ``<stem>`` is the markdown file name without its extension.
    References without a page number are left untouched.  The result is
    written next to the input as ``<stem>_updated<ext>``; the input file
    itself is never modified.

    Known limitation: every figure on page N is mapped to that page's first
    extracted image (``page_<N>_img_1.png``); the figure index in the old
    path is not used.

    Args:
        md_path: Path of the markdown file to process.
        output_dir: Directory expected to contain the
            ``<stem>_extracted_images`` folder.

    Returns:
        ``False`` if the extracted-images folder does not exist, ``True``
        otherwise (including when no reference needed rewriting, in which
        case no file is written).
    """
    md_file = Path(md_path)

    # Folder name is also the relative prefix used inside the new links.
    extracted_images_dir = f"{md_file.stem}_extracted_images"

    extracted_images_path = os.path.join(output_dir, extracted_images_dir)
    if not os.path.isdir(extracted_images_path):
        print(f"No extracted images folder found: {extracted_images_path}")
        return False

    with open(md_path, 'r', encoding='utf-8') as f:
        original_content = f.read()

    # Counted inside the callback so that pre-existing .png links and
    # references left untouched are not reported as replacements.
    total_refs = 0
    replaced = 0

    def replace_image_path(match):
        nonlocal total_refs, replaced
        total_refs += 1

        # e.g. "_page_1_Figure_1.jpeg" -> page number "1"
        page_match = re.search(r'_page_(\d+)_', match.group(1))
        if page_match is None:
            return match.group(0)  # No page number: keep the reference as is

        replaced += 1
        new_path = f"{extracted_images_dir}/page_{page_match.group(1)}_img_1.png"
        # Emit a complete markdown image link; returning an empty string here
        # would silently delete the image from the document.
        return f'![]({new_path})'

    content = re.sub(r'\!\[\]\(([^)]+\.jpeg)\)', replace_image_path, original_content)

    if content == original_content:
        print(f"No changes needed for {md_file.name}")
        return True

    # Only touch the final extension: str.replace('.md', ...) would also
    # rewrite a ".md" appearing in a directory name, and would leave the path
    # unchanged (overwriting the input) for any other extension.
    root, ext = os.path.splitext(md_path)
    output_path = f"{root}_updated{ext}"
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Updated markdown saved to: {output_path}")
    print(f" Replaced {replaced} image paths (out of {total_refs} references)")

    return True
|
|
|
|
|
|
def update_all_markdown_files(output_dir: str = "output"):
    """Run the image-path rewrite over each markdown file in *output_dir*.

    Only files matching ``*.md`` directly inside *output_dir* are considered,
    and names ending in ``_updated.md`` are skipped.  Progress is reported on
    stdout; nothing is returned.
    """
    rule = "=" * 60

    pending = []
    for candidate in glob.glob(os.path.join(output_dir, "*.md")):
        # Leave out files carrying the "_updated.md" ending.
        if candidate.endswith('_updated.md'):
            continue
        pending.append(candidate)

    if pending:
        print(f"Found {len(pending)} markdown files")
        print(rule)

        for markdown_path in pending:
            update_markdown_image_paths(markdown_path, output_dir)
            print()  # Blank line between per-file reports

        print(rule)
        print("Done!")
    else:
        print(f"No markdown files found in {output_dir}")
|
|
|
|
|
|
# Script entry point: process the default output directory when this file is
# executed directly; importing the module runs nothing.
if __name__ == "__main__":
    update_all_markdown_files()
|