feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
minsung
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions

94
update_image_paths.py Normal file
View File

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Update image paths in markdown files to point to extracted images
"""
import os
import re
import glob
from pathlib import Path
def update_markdown_image_paths(md_path: str, output_dir: str = "output"):
    """
    Rewrite ``.jpeg`` image references in a markdown file to extracted images.

    The function looks for ``<output_dir>/<stem>_extracted_images``. When that
    folder exists, every ``![](..._page_N_....jpeg)`` reference is rewritten
    to ``<stem>_extracted_images/page_N_img_1.png`` and the result is written
    next to the source file as ``<stem>_updated.md``. The source file itself
    is never modified.

    Args:
        md_path: Path to the markdown file to process.
        output_dir: Directory expected to contain the extracted-images folder.

    Returns:
        False when the extracted-images folder does not exist; True otherwise,
        including the case where no reference needed rewriting.
    """
    md_file = Path(md_path)
    base_name = md_file.stem

    # The folder name doubles as the relative prefix written into the markdown.
    extracted_images_dir = f"{base_name}_extracted_images"
    extracted_images_path = os.path.join(output_dir, extracted_images_dir)
    if not os.path.exists(extracted_images_path):
        print(f"No extracted images folder found: {extracted_images_path}")
        return False

    with open(md_file, 'r', encoding='utf-8') as f:
        original_content = f.read()

    replaced = 0

    def replace_image_path(match):
        """Map one ``![](..._page_N_....jpeg)`` match to its extracted image."""
        nonlocal replaced
        page_match = re.search(r'_page_(\d+)_', match.group(1))
        if page_match is None:
            # No page number in the path: leave the reference untouched.
            return match.group(0)
        replaced += 1
        page_num = page_match.group(1)
        # NOTE(review): every reference on a page maps to img_1, so several
        # figures on the same page end up pointing at the same file --
        # confirm against the extractor's naming scheme.
        new_path = f"{extracted_images_dir}/page_{page_num}_img_1.png"
        return f'![]({new_path})'

    # subn's count is the number of .jpeg references seen, rewritten or not.
    content, old_count = re.subn(
        r'\!\[\]\(([^)]+\.jpeg)\)', replace_image_path, original_content
    )

    if replaced == 0:
        print(f"No changes needed for {md_file.name}")
        return True

    # Build the sibling path from the file name only; a plain
    # str.replace('.md', ...) would also rewrite '.md' inside directory names.
    output_path = md_file.with_name(f"{base_name}_updated.md")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Updated markdown saved to: {output_path}")

    # Report actual rewrites, not every .png reference present in the result.
    print(f" Replaced {replaced} image paths (out of {old_count} references)")
    return True
def update_all_markdown_files(output_dir: str = "output"):
    """
    Run the image-path update over each markdown file found in ``output_dir``.

    Files whose name ends with ``_updated.md`` are skipped. A short progress
    report is printed; nothing is returned.
    """
    separator = "=" * 60

    candidates = []
    for found in glob.glob(os.path.join(output_dir, "*.md")):
        # Skip results written by a previous run.
        if found.endswith('_updated.md'):
            continue
        candidates.append(found)

    if len(candidates) == 0:
        print(f"No markdown files found in {output_dir}")
        return

    print(f"Found {len(candidates)} markdown files")
    print(separator)
    for path in candidates:
        update_markdown_image_paths(path, output_dir)
        print()
    print(separator)
    print("Done!")
# Script entry point: process every markdown file in the default "output" directory.
if __name__ == "__main__":
    update_all_markdown_files()