Files
documan/merge_markdown.py
minsung 8d4339302e feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00

120 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Merge multiple Markdown files into a single file,
consolidating all images into a single images/ folder with unique names.
Image rename rule (page digits are taken from the Markdown file name, e.g. Manual-06.md):
{stem}_images/_page_0_Figure_3.jpeg -> images/p06_Figure_3.jpeg
{stem}_images/_page_0_Picture_12.jpeg -> images/p06_Picture_12.jpeg
"""
import glob
import os
import re
import shutil
from pathlib import Path
from urllib.parse import quote, unquote
# Trailing "-<digits>.md" identifies a per-page file and captures its page digits.
_PAGE_SUFFIX_RE = re.compile(r'-(\d+)\.md$')
# Markdown image reference: ![alt](target)
_IMAGE_REF_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
# Leading "_page_N_" prefix on an extracted image file name.
_PAGE_PREFIX_RE = re.compile(r'^_page_\d+_')
# Targets that carry a URL scheme (two or more characters, so Windows drive
# letters are not mistaken for one) or are protocol-relative ("//host/...").
_REMOTE_REF_RE = re.compile(r'^(?:[A-Za-z][A-Za-z0-9+.-]+:|//)')


def _make_image_replacer(page_num, input_dir, unified_images_path, images_subdir):
    """Build the ``re.sub`` callback that relocates one page's image references."""

    def replace_image(match):
        alt = match.group(1)
        old_path = match.group(2)
        # Remote images (http:, https:, data:, //host/...) are not on disk;
        # leave the reference exactly as written.
        if _REMOTE_REF_RE.match(old_path):
            return match.group(0)
        # Decode every percent-escape (not only %20) for filesystem access.
        old_path_decoded = unquote(old_path)
        # _page_0_Figure_3.jpeg -> Figure_3.jpeg -> p<page>_Figure_3.jpeg
        clean_name = _PAGE_PREFIX_RE.sub('', Path(old_path_decoded).name)
        new_name = f"p{page_num}_{clean_name}"
        src = os.path.join(input_dir, old_path_decoded)
        dst = os.path.join(unified_images_path, new_name)
        # isfile (not exists): a directory at this path must not reach copy2.
        if os.path.isfile(src):
            shutil.copy2(src, dst)
        else:
            print(f" WARNING: image not found: {src}")
        # Re-encode the name so spaces/parentheses cannot break the link syntax;
        # plain ASCII names such as "p06_Figure_3.jpeg" pass through unchanged.
        return f"![{alt}]({images_subdir}/{quote(new_name)})"

    return replace_image


def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge per-page Markdown files into one file with a unified image folder.

    Only files directly inside ``input_dir`` whose name ends in ``-<digits>.md``
    are merged, so the merged output itself and other notes are skipped. Pages
    are ordered by the text before the page suffix and then by numeric page
    value, so ``-9.md`` precedes ``-10.md`` even without zero padding.

    Every local image reference is copied into ``input_dir/images_subdir`` as
    ``p<page digits>_<name without the _page_N_ prefix>`` and the reference is
    rewritten to point there. Remote references are left untouched. A missing
    image only produces a warning; its reference is still rewritten.

    NOTE: the prefix uses the page digits only, so two files with different
    stems but identical digits map their images onto the same target names.

    Args:
        input_dir: Directory holding the page files; the merged file and the
            image folder are created inside it.
        output_file: File name of the merged Markdown, relative to ``input_dir``.
        images_subdir: Name of the unified image folder, relative to ``input_dir``.
        file_range: Optional inclusive ``(start, end)`` page-number range; a
            falsy value merges every page.

    Returns:
        None. Progress and the final summary are printed to stdout.
    """
    # glob.escape keeps "[", "*" and "?" in the directory name literal.
    md_pattern = os.path.join(glob.escape(input_dir), "*.md")
    pages = []
    for path in glob.glob(md_pattern):
        m = _PAGE_SUFFIX_RE.search(path)
        if m is None:
            continue
        # Sort key: (text before "-NN.md", numeric page, path); digits kept for naming.
        pages.append((path[:m.start()], int(m.group(1)), path, m.group(1)))
    pages.sort()

    # Filter by page number range if given
    if file_range:
        start, end = file_range
        pages = [page for page in pages if start <= page[1] <= end]

    if not pages:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(pages)}")
    for _, _, path, _ in pages:
        print(f" {Path(path).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    merged_content = []
    for _, _, path, page_num in pages:
        print(f"Processing [{page_num}] {Path(path).name} ...")
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        replacer = _make_image_replacer(
            page_num, input_dir, unified_images_path, images_subdir
        )
        merged_content.append(_IMAGE_REF_RE.sub(replacer, content))

    final_content = "\n\n---\n\n".join(merged_content)
    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f" Files merged : {len(merged_content)}")
    print(f" Total chars : {len(final_content):,}")
if __name__ == "__main__":
    # Script entry point: merge the page files under ./output with the stock settings.
    cli_settings = {
        "input_dir": "output",
        "output_file": "merged_all.md",
        "images_subdir": "images",
    }
    merge_markdown_files(**cli_settings)