- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge multiple Markdown files into a single file,
|
|
consolidating all images into a single images/ folder with unique names.
|
|
|
|
Image rename rule:
|
|
{stem}_images/_page_0_Figure_3.jpeg -> images/p006_Figure_3.jpeg
|
|
{stem}_images/_page_0_Picture_12.jpeg -> images/p006_Picture_12.jpeg
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import glob
|
|
import shutil
|
|
from pathlib import Path
|
|
|
|
|
|
# Page files end in "-<digits>.md" (e.g. "Manual-06.md"); group 1 is the page number.
_PAGE_FILE_RE = re.compile(r'-(\d+)\.md$')
# Markdown image reference: ![alt](path) -> group 1 = alt text, group 2 = path.
_IMAGE_REF_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')


def _relocate_page_images(content, page_num, input_dir, images_subdir, unified_images_path):
    """Copy the images referenced by one page and rewrite their links.

    Each ``![alt](path)`` reference whose file exists under *input_dir* is
    copied to *unified_images_path* as ``p{page_num}_{name}`` (with any leading
    ``_page_N_`` stripped from the name) and the link is rewritten to
    ``{images_subdir}/p{page_num}_{name}``.

    References whose file cannot be found (missing files, remote URLs, links
    carrying a title) are reported and left exactly as written, so the merged
    document never points at a file that was not copied.

    Returns the page content with the rewritten references.
    """

    def replace_image(match):
        alt = match.group(1)
        old_path = match.group(2)

        # Decode %20 -> space for filesystem access
        old_path_decoded = old_path.replace("%20", " ")

        # Filename only, then strip the leading _page_N_ marker:
        # _page_0_Figure_3.jpeg -> Figure_3.jpeg
        img_filename = Path(old_path_decoded).name
        clean_name = re.sub(r'^_page_\d+_', '', img_filename)

        # Prefix with the page number so names from different pages cannot collide
        new_name = f"p{page_num}_{clean_name}"

        src = os.path.join(input_dir, old_path_decoded)
        dst = os.path.join(unified_images_path, new_name)
        if not os.path.isfile(src):
            print(f" WARNING: image not found: {src}")
            # Keep the original reference untouched rather than dropping it
            return match.group(0)

        shutil.copy2(src, dst)

        # Markdown link destinations cannot contain raw spaces; re-encode them
        new_ref = f"{images_subdir}/{new_name}".replace(" ", "%20")
        return f"![{alt}]({new_ref})"

    return _IMAGE_REF_RE.sub(replace_image, content)


def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge per-page Markdown files into one file with a shared image folder.

    Every ``*.md`` file directly inside *input_dir* whose name ends in
    ``-<digits>.md`` is treated as a page; other Markdown files (such as a
    previously merged output) are ignored. Pages are merged in sorted filename
    order, separated by a horizontal rule.

    Args:
        input_dir: Directory containing the page files and their image folders.
        output_file: Name of the merged file, written inside *input_dir*.
        images_subdir: Name of the unified image folder, created inside
            *input_dir*; image links in the merged file point into it.
        file_range: Optional inclusive ``(start, end)`` page-number range used
            to merge only a subset of the pages.

    Returns:
        None. Progress and a summary are printed to stdout; if no page file
        matches, a message is printed and nothing is written.
    """
    # Collect (path, page number) pairs for files matching the page pattern
    pages = []
    for candidate in sorted(glob.glob(os.path.join(input_dir, "*.md"))):
        m = _PAGE_FILE_RE.search(candidate)
        if m:
            pages.append((candidate, m.group(1)))

    # Filter by page number range if given
    if file_range:
        start, end = file_range
        pages = [(path, num) for path, num in pages if start <= int(num) <= end]

    if not pages:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(pages)}")
    for path, _ in pages:
        print(f" {Path(path).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    merged_content = []
    for path, page_num in pages:
        print(f"Processing [{page_num}] {Path(path).name} ...")

        with open(path, "r", encoding="utf-8") as f:
            content = f.read()

        merged_content.append(
            _relocate_page_images(
                content, page_num, input_dir, images_subdir, unified_images_path
            )
        )

    final_content = "\n\n---\n\n".join(merged_content)

    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f" Files merged : {len(merged_content)}")
    print(f" Total chars : {len(final_content):,}")
|
|
|
|
|
|
# Script entry point: merge every page under output/ into output/merged_all.md,
# collecting the referenced images under output/images/.
if __name__ == "__main__":
    merge_options = {
        "input_dir": "output",
        "output_file": "merged_all.md",
        "images_subdir": "images",
    }
    merge_markdown_files(**merge_options)
|