feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
119
merge_markdown.py
Normal file
119
merge_markdown.py
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge multiple Markdown files into a single file,
|
||||
consolidating all images into a single images/ folder with unique names.
|
||||
|
||||
Image rename rule (the page prefix reuses the digits of the source file name, e.g. Manual-06.md -> p06):
{stem}_images/_page_0_Figure_3.jpeg -> images/p06_Figure_3.jpeg
{stem}_images/_page_0_Picture_12.jpeg -> images/p06_Picture_12.jpeg
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge per-page Markdown files into one file and unify their images.

    Every ``*.md`` file directly inside ``input_dir`` whose name ends in
    ``-<digits>.md`` is read in sorted (lexicographic) path order. Each local
    image referenced as ``![alt](path)`` is copied into
    ``input_dir/images_subdir`` under the name ``p<digits>_<name>``, where
    ``<digits>`` is the digit run from the Markdown file name and ``<name>``
    is the image file name with any leading ``_page_<N>_`` removed. The
    reference in the text is rewritten to point at the copy. The rewritten
    pages are joined with a ``---`` separator and written to
    ``input_dir/output_file``.

    Args:
        input_dir: Directory holding the page files; also receives the
            merged file and the unified image folder.
        output_file: File name of the merged Markdown, relative to
            ``input_dir``.
        images_subdir: Name of the unified image folder, relative to
            ``input_dir``.
        file_range: Optional inclusive ``(start, end)`` pair of page numbers;
            only files whose number falls inside it are merged.

    Returns:
        None. Progress, warnings and a summary are printed to stdout. When no
        file matches, a message is printed and nothing is written.
    """
    # Compiled once: both patterns are applied to every file.
    page_pattern = re.compile(r'-(\d+)\.md$')
    image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    # Only include files matching Manual-NN pattern (skip merged outputs)
    all_md_files = [
        f for f in sorted(glob.glob(os.path.join(input_dir, "*.md")))
        if page_pattern.search(f)
    ]

    # Filter by page number range if given
    if file_range:
        start, end = file_range
        md_files = [
            f for f in all_md_files
            if start <= int(page_pattern.search(f).group(1)) <= end
        ]
    else:
        md_files = all_md_files

    if not md_files:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(md_files)}")
    for f in md_files:
        print(f" {Path(f).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    merged_content = []

    for md_file in md_files:
        file_path = Path(md_file)

        # Digit run from the file name, kept verbatim (e.g. "06" from
        # "Manual-06.md") so the prefix sorts like the page files do.
        m = page_pattern.search(str(file_path))
        page_num = m.group(1) if m else "000"

        print(f"Processing [{page_num}] {file_path.name} ...")

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        def replace_image(match, page_num=page_num):
            """Copy one referenced image and return its rewritten reference."""
            alt = match.group(1)
            old_path = match.group(2)

            # Remote images and inline data URIs have no file to copy.
            if "://" in old_path or old_path.startswith("data:"):
                return match.group(0)

            # Decode %20 -> space for filesystem access
            old_path_decoded = old_path.replace("%20", " ")

            # Filename only: _page_0_Figure_3.jpeg
            img_filename = Path(old_path_decoded).name

            # Strip leading _page_N_ to get: Figure_3.jpeg or Picture_12.jpeg
            clean_name = re.sub(r'^_page_\d+_', '', img_filename)

            # New unique name: p06_Figure_3.jpeg
            new_name = f"p{page_num}_{clean_name}"

            # Copy image to unified folder
            src = os.path.join(input_dir, old_path_decoded)
            dst = os.path.join(unified_images_path, new_name)
            if not os.path.isfile(src):
                print(f" WARNING: image not found: {src}")
                # Keep the reference as written instead of pointing it at a
                # file that was never copied.
                return match.group(0)
            shutil.copy2(src, dst)

            # The merged file lives in input_dir, so the link is relative to
            # it. Re-encode spaces, mirroring the decode step above.
            new_ref = f"{images_subdir}/{new_name}".replace(" ", "%20")
            return f"![{alt}]({new_ref})"

        merged_content.append(image_pattern.sub(replace_image, content))

    final_content = "\n\n---\n\n".join(merged_content)

    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f" Files merged : {len(merged_content)}")
    print(f" Total chars : {len(final_content):,}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: merge everything under output/ into
    # output/merged_all.md, collecting images in output/images/.
    merge_markdown_files(input_dir="output", output_file="merged_all.md", images_subdir="images")
|
||||
Reference in New Issue
Block a user