documan/merge_markdown.py

#!/usr/bin/env python3
"""
Merge multiple Markdown files into a single file,
consolidating all images into a single images/ folder with unique names.

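Image rename rule (the "p..." prefix is the number taken verbatim from the
Markdown file name, e.g. "006" from "Manual-006.md"):
  {stem}_images/_page_0_Figure_3.jpeg  ->  images/p006_Figure_3.jpeg
  {stem}_images/_page_0_Picture_12.jpeg -> images/p006_Picture_12.jpeg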
"""

import os
import re
import glob
import shutil
from pathlib import Path


# Matches the "-NN.md" tail of a per-page file name and captures NN.
_PAGE_FILE_RE = re.compile(r'-(\d+)\.md$')
# Matches a Markdown image reference: ![alt](target)
_IMAGE_REF_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
# Matches targets that are not local relative files: "scheme:..." or "//host".
# The scheme needs 2+ characters so a Windows drive letter ("C:") is not a URL.
_REMOTE_REF_RE = re.compile(r'(?:[a-zA-Z][a-zA-Z0-9+.-]+:|//)')


def _page_sort_key(path: str):
    """Return a key ordering page files by prefix, then numeric page number."""
    name = os.path.basename(path)
    m = _PAGE_FILE_RE.search(name)
    # int() makes "-2.md" sort before "-10.md"; the name breaks ties ("01" vs "1").
    return (name[:m.start()], int(m.group(1)), name)


def _encode_link_target(name: str) -> str:
    """Escape the characters that would break a Markdown ``(target)`` link."""
    # "%" must be escaped first so the escapes added below are not re-escaped.
    for char, escaped in (("%", "%25"), (" ", "%20"), ("(", "%28"), (")", "%29")):
        name = name.replace(char, escaped)
    return name


def _rewrite_image_refs(
    content: str,
    page_num: str,
    input_dir: str,
    images_dir: str,
    images_subdir: str,
) -> str:
    """Copy the local images referenced in *content* and return rewritten text."""
    from urllib.parse import unquote

    def replace_image(match):
        alt = match.group(1)
        old_path = match.group(2)

        # Remote / data-URI images cannot be copied; keep the reference as is.
        if _REMOTE_REF_RE.match(old_path):
            return match.group(0)

        # Decode every percent-escape (not just %20) for filesystem access.
        old_path_decoded = unquote(old_path)

        # Filename only, e.g. _page_0_Figure_3.jpeg
        img_filename = Path(old_path_decoded).name

        # Strip leading _page_N_ to get: Figure_3.jpeg  or  Picture_12.jpeg
        clean_name = re.sub(r'^_page_\d+_', '', img_filename)

        # New unique name: p<page>_Figure_3.jpeg
        new_name = f"p{page_num}_{clean_name}"

        src = os.path.join(input_dir, old_path_decoded)
        dst = os.path.join(images_dir, new_name)
        # isfile (not exists): a directory here would make copy2 raise.
        if os.path.isfile(src):
            shutil.copy2(src, dst)
        else:
            print(f"  WARNING: image not found: {src}")

        return f"![{alt}]({images_subdir}/{_encode_link_target(new_name)})"

    return _IMAGE_REF_RE.sub(replace_image, content)


def merge_markdown_files(
    input_dir: str = "output",
    output_file: str = "merged_all.md",
    images_subdir: str = "images",
    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
):
    """Merge the per-page Markdown files of *input_dir* into one document.

    Only files whose name ends in ``-<digits>.md`` are merged. They are
    processed in ascending numeric page order and joined with a ``---``
    separator. Every local image reference is copied into
    ``<input_dir>/<images_subdir>/`` under the name ``p<page>_<name>``, where
    ``<page>`` is the digit string exactly as it appears in the Markdown file
    name and ``<name>`` is the image file name without its ``_page_N_``
    prefix; the reference in the merged text is rewritten to match. References
    to remote images (``http:``, ``https:``, ``data:``, ``//host`` ...) are
    left untouched.

    Args:
        input_dir: Directory holding the page files; image paths are resolved
            relative to it and the merged file is written into it.
        output_file: Name of the merged file, relative to *input_dir*.
        images_subdir: Name of the unified image folder, relative to
            *input_dir*.
        file_range: Optional inclusive ``(start, end)`` page-number range.
            A falsy value (``None``) merges every page file.

    Returns:
        None. Progress and warnings are printed to stdout.
    """
    # Escape input_dir so glob metacharacters in the directory name are literal.
    md_pattern = os.path.join(glob.escape(input_dir), "*.md")

    # Only include files matching the Manual-NN pattern (skips merged outputs),
    # ordered numerically so that page 10 follows page 9 rather than page 1.
    all_md_files = sorted(
        (f for f in glob.glob(md_pattern) if _PAGE_FILE_RE.search(f)),
        key=_page_sort_key,
    )

    # Filter by page number range if given
    if file_range:
        start, end = file_range
        md_files = [
            f for f in all_md_files
            if start <= int(_PAGE_FILE_RE.search(f).group(1)) <= end
        ]
    else:
        md_files = all_md_files

    if not md_files:
        print(f"No markdown files found in {input_dir}")
        return

    print(f"Files to merge: {len(md_files)}")
    for f in md_files:
        print(f"  {Path(f).name}")
    print("=" * 60)

    # Create unified images directory
    unified_images_path = os.path.join(input_dir, images_subdir)
    os.makedirs(unified_images_path, exist_ok=True)

    merged_content = []

    for md_file in md_files:
        # Page number exactly as written in the file name (e.g. "06").
        page_num = _PAGE_FILE_RE.search(md_file).group(1)

        print(f"Processing [{page_num}] {Path(md_file).name} ...")

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        merged_content.append(
            _rewrite_image_refs(
                content, page_num, input_dir, unified_images_path, images_subdir
            )
        )

    final_content = "\n\n---\n\n".join(merged_content)

    output_path = os.path.join(input_dir, output_file)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_content)

    print("\n" + "=" * 60)
    print(f"SUCCESS: {output_path}")
    print(f"  Files merged : {len(merged_content)}")
    print(f"  Total chars  : {len(final_content):,}")


if __name__ == "__main__":
    # Script entry point: merge everything under ./output into
    # ./output/merged_all.md, collecting images in ./output/images.
    cli_settings = {
        "input_dir": "output",
        "output_file": "merged_all.md",
        "images_subdir": "images",
    }
    merge_markdown_files(**cli_settings)