#!/usr/bin/env python3
"""
Merge multiple Markdown files into a single file,
consolidating all images into a single images/ folder with unique names.

Image rename rule (the page number is taken verbatim from the Markdown
file name, e.g. "06" for Manual-06.md):
  {stem}_images/_page_0_Figure_3.jpeg   -> images/p06_Figure_3.jpeg
  {stem}_images/_page_0_Picture_12.jpeg -> images/p06_Picture_12.jpeg

When two different source images would map to the same new name (for
example _page_0_Figure_3.jpeg and _page_1_Figure_3.jpeg of the same file),
later ones receive a numeric suffix: p06_Figure_3_2.jpeg, ...
"""

import os
import re
import glob
import shutil
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote, unquote

# "<anything>-<digits>.md": the per-page files produced upstream.
_PAGE_FILE_RE = re.compile(r'-(\d+)\.md$')
# Inline Markdown image: ![alt](target)
_IMAGE_REF_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
# Leading "_page_N_" of an extracted image file name.
_PAGE_PREFIX_RE = re.compile(r'^_page_\d+_')
# Targets that are not local files: "scheme:..." (http:, https:, data:, ...)
# or protocol-relative "//host/...". The scheme needs at least two characters
# so that Windows drive letters ("C:\...") are still treated as local paths.
_EXTERNAL_REF_RE = re.compile(r'^(?:[A-Za-z][A-Za-z0-9+.\-]+:|//)')


def _select_markdown_files(
    input_dir: str,
    file_range: Optional[Tuple[int, int]],
) -> List[Tuple[str, str]]:
    """Return ``(path, page_number_string)`` pairs for the files to merge, in page order."""
    # glob.escape keeps characters such as "[" in input_dir from acting as
    # wildcard syntax.
    pattern = os.path.join(glob.escape(input_dir), "*.md")

    selected = []
    for path in glob.glob(pattern):
        m = _PAGE_FILE_RE.search(path)
        if not m:
            # Not a "-NN.md" page file (e.g. a previously merged output).
            continue
        if file_range:
            start, end = file_range
            if not start <= int(m.group(1)) <= end:
                continue
        selected.append((path, m))

    if not selected:
        return []

    # Pad every page number to a common width before comparing so that
    # "-9.md" sorts before "-10.md". When all numbers already share one
    # width the key equals the path, i.e. plain lexicographic order.
    width = max(len(m.group(1)) for _, m in selected)

    def sort_key(item):
        path, m = item
        padded = path[:m.start(1)] + m.group(1).zfill(width) + path[m.end(1):]
        return (padded, path)

    selected.sort(key=sort_key)
    return [(path, m.group(1)) for path, m in selected]


def _claim_name(candidate: str, src: str, claimed: Dict[str, str]) -> str:
    """Reserve a destination file name for *src*, adding ``_2``, ``_3``, ... on conflict."""
    stem, ext = os.path.splitext(candidate)
    name = candidate
    counter = 1
    # A name is usable if nobody owns it yet or it is already owned by the
    # very same source file (same image referenced more than once).
    while claimed.get(name, src) != src:
        counter += 1
        name = f"{stem}_{counter}{ext}"
    claimed[name] = src
    return name


def _rewrite_images(
    content: str,
    page_num: str,
    input_dir: str,
    unified_images_path: str,
    images_subdir: str,
    claimed: Dict[str, str],
) -> str:
    """Copy every local image referenced in *content* and return the rewritten text."""

    def replace_image(match):
        alt = match.group(1)
        old_path = match.group(2)

        # Remote / embedded images cannot be copied; keep the reference as is.
        if _EXTERNAL_REF_RE.match(old_path.strip()):
            return match.group(0)

        # Decode percent-escapes (%20 -> space, ...) for filesystem access.
        old_path_decoded = unquote(old_path)
        # Filename only: _page_0_Figure_3.jpeg
        img_filename = Path(old_path_decoded).name
        # Strip leading _page_N_ to get: Figure_3.jpeg or Picture_12.jpeg
        clean_name = _PAGE_PREFIX_RE.sub('', img_filename)

        src = os.path.join(input_dir, old_path_decoded)
        # New unique name: p06_Figure_3.jpeg (suffixed if already taken by
        # a different source image).
        new_name = _claim_name(
            f"p{page_num}_{clean_name}", os.path.normpath(src), claimed
        )
        dst = os.path.join(unified_images_path, new_name)

        if os.path.isfile(src):
            shutil.copy2(src, dst)
        else:
            print(f" WARNING: image not found: {src}")

        # Re-encode the file name so spaces etc. cannot break the link.
        return f"![{alt}]({images_subdir}/{quote(new_name)})"

    return _IMAGE_REF_RE.sub(replace_image, content)


def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: Optional[Tuple[int, int]] = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge the ``*-NN.md`` files of *input_dir* into one Markdown file.

    Files are concatenated in page order, separated by a horizontal rule.
    Every locally referenced image is copied into
    ``input_dir/images_subdir`` under a unique ``p{NN}_{name}`` file name
    and the reference is rewritten accordingly. References to remote
    resources (``http:``, ``https:``, ``data:``, ...) are left untouched.
    A missing local image produces a warning; its reference is still
    rewritten to the new location.

    Args:
        input_dir: Directory holding the per-page Markdown files and their
            image folders. The merged file is written here as well.
        output_file: File name of the merged Markdown file.
        images_subdir: Name of the unified image folder inside *input_dir*.
        file_range: Optional inclusive ``(start, end)`` page-number range;
            a falsy value selects every page file.

    Returns:
        None. Progress and a summary are printed to stdout.
    """
    md_files = _select_markdown_files(input_dir, file_range)

    if not md_files:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(md_files)}")
    for md_file, _ in md_files:
        print(f" {Path(md_file).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    # new image file name -> normalized source path that owns it
    claimed: Dict[str, str] = {}
    merged_content = []

    for md_file, page_num in md_files:
        print(f"Processing [{page_num}] {Path(md_file).name} ...")

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        merged_content.append(
            _rewrite_images(
                content,
                page_num,
                input_dir,
                unified_images_path,
                images_subdir,
                claimed,
            )
        )

    final_content = "\n\n---\n\n".join(merged_content)

    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f" Files merged : {len(merged_content)}")
    print(f" Total chars : {len(final_content):,}")


if __name__ == "__main__":
    merge_markdown_files(
        input_dir="output",
        output_file="merged_all.md",
        images_subdir="images",
    )