feat: add MD-file merge and image-path consolidation script (#1)

- merge_markdown.py: merges the 96 per-page MD files into a single file
  - consolidates images into the output/images/ folder; a p{NN}_ filename prefix prevents collisions
  - a file_range parameter enables partial test runs
- docs/tutorial.md: documents the merge command and its usage
- docs/history: adds work-history files

Time spent: 10 min | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Author: minsung
Date: 2026-04-01 11:00:28 +09:00
Parent: 892e4ecafb
Commit: 8d4339302e
24 changed files with 2335 additions and 0 deletions
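merge_markdown.py itself is not shown in this diff view; based only on the commit description above, its core merge-and-relink step might look roughly like the following sketch. The function name `merge_pages`, the output filename `merged.md`, and all internals are assumptions — only the p{NN}_ prefix, the output/images/ folder, and the file_range parameter come from the commit message.

```python
import re
import shutil
from pathlib import Path

def merge_pages(pages_dir, out_dir="output", file_range=None):
    """Merge per-page MD files into one document, copying their images into
    out_dir/images/ with a p{NN}_ prefix so names from different pages
    cannot collide. file_range (an iterable of 1-based page indices)
    restricts the run for partial testing."""
    out = Path(out_dir)
    images_out = out / "images"
    images_out.mkdir(parents=True, exist_ok=True)
    merged = []
    for idx, page in enumerate(sorted(Path(pages_dir).glob("*.md")), 1):
        if file_range is not None and idx not in file_range:
            continue  # partial test run
        prefix = f"p{idx:02d}_"
        def relink(m, page=page, prefix=prefix):
            # Copy the referenced image into the shared folder under a
            # page-prefixed name, and rewrite the link accordingly.
            src = Path(m.group(2))
            new_name = prefix + src.name
            src_path = page.parent / src
            if src_path.exists():
                shutil.copy(src_path, images_out / new_name)
            return f"![{m.group(1)}](images/{new_name})"
        text = page.read_text(encoding="utf-8")
        merged.append(re.sub(r"!\[([^\]]*)\]\(([^)]+)\)", relink, text))
    out_path = out / "merged.md"
    out_path.write_text("\n\n---\n\n".join(merged), encoding="utf-8")
    return out_path
```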

.gitignore

@@ -0,0 +1,89 @@
# Python virtual environment
.venv/
venv/
env/
ENV/
# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
*.manifest
*.spec
# Unit test / coverage
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.pytest_cache/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
Desktop.ini
# Environment variables
.env
.env.*
!.env.example
# Project specific
backup_epub/
input/
output/
back/
# Large binary files
*.epub
*.pdf
*.png
*.jpg
*.jpeg
*.gif
*.bmp
*.tiff
*.zip
*.tar
*.tar.gz
*.rar
!README.md

CLAUDE.md

@@ -0,0 +1,47 @@
# CLAUDE.md - documan project rules
## Work-history logging rules
A history file must be written at the end of every work session.
### Location
```
docs/history/YYYY-MM-DD_{task-name}.md
```
### Required fields (saving is blocked if any are missing)
```markdown
**Time spent**: X min
**Context usage**: input Xk / output Xk tokens
```
### Optional fields
```markdown
**Issue**: #N
```
- Record the issue number when the work is tied to a specific Gitea issue
- Used later to aggregate token usage per issue
### When and how to write the history file
- Write it immediately after the user's request is completed, as the last step of the response
- Name the task concisely in Korean, capturing the gist (e.g. `2026-03-31_PDF2단변환기능추가.md`)
- Include the list of changed files, key decisions, and any problems encountered along with their solutions
### History file template
```markdown
**Issue**: #N
**Time spent**: X min
**Context usage**: input Xk / output Xk tokens
## Work summary
{summary of the work}
## Changed files
- `file/path` : what changed
## Key decisions
{design judgments and why they were chosen}
## Problems and solutions
{problems encountered and how they were resolved}
```
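A hypothetical filled-in history file following this template might look like the example below. All values are invented for illustration, and the field labels are shown in English.

```markdown
**Issue**: #1
**Time spent**: 15 min
**Context usage**: input 12k / output 3k tokens
## Work summary
Added a --languages option to the fast PDF converter.
## Changed files
- `convert_pdfs_fast.py` : new CLI flag for OCR language codes
## Key decisions
Kept sequential processing to stay memory-safe on the GPU.
## Problems and solutions
None.
```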

convert_epub.py

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""
EPUB to Markdown converter using ebooklib and html2text
"""
import os
import json
import re
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
def html_to_markdown(soup):
"""Convert BeautifulSoup HTML to Markdown format"""
def process_element(element):
if isinstance(element, str):
text = element.strip()
if text:
return text
return ""
tag = element.name
# Headers
if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
level_num = int(tag[1])
text = element.get_text().strip()
return '\n' + '#' * level_num + ' ' + text + '\n'
# Paragraphs
elif tag == 'p':
text = ''.join(process_element(child) for child in element.children)
return '\n' + text.strip() + '\n'
# Line breaks
elif tag == 'br':
return '\n'
# Bold
elif tag in ['strong', 'b']:
text = ''.join(process_element(child) for child in element.children)
return '**' + text.strip() + '**'
# Italic
elif tag in ['em', 'i']:
text = ''.join(process_element(child) for child in element.children)
return '*' + text.strip() + '*'
# Links
elif tag == 'a':
text = ''.join(process_element(child) for child in element.children)
href = element.get('href', '')
if href:
return f'[{text.strip()}]({href})'
return text.strip()
# Images
elif tag == 'img':
src = element.get('src', '')
alt = element.get('alt', '')
return f'![{alt}]({src})'
# Lists
elif tag == 'ul':
items = []
for li in element.find_all('li', recursive=False):
text = ''.join(process_element(child) for child in li.children)
items.append('- ' + text.strip())
return '\n' + '\n'.join(items) + '\n'
elif tag == 'ol':
items = []
for i, li in enumerate(element.find_all('li', recursive=False), 1):
text = ''.join(process_element(child) for child in li.children)
items.append(f'{i}. ' + text.strip())
return '\n' + '\n'.join(items) + '\n'
# Blockquote
elif tag == 'blockquote':
text = ''.join(process_element(child) for child in element.children)
lines = text.strip().split('\n')
return '\n' + '\n'.join('> ' + line for line in lines) + '\n'
# Code
elif tag == 'code':
text = element.get_text()
return '`' + text + '`'
elif tag == 'pre':
text = element.get_text()
return '\n```\n' + text + '\n```\n'
# Div and span - just process children
elif tag in ['div', 'span', 'section', 'article']:
return ''.join(process_element(child) for child in element.children)
# Default - process children
else:
return ''.join(process_element(child) for child in element.children)
# Process body or entire soup
body = soup.find('body') if soup.find('body') else soup
markdown = process_element(body)
# Clean up multiple newlines
markdown = re.sub(r'\n{3,}', '\n\n', markdown)
return markdown.strip()
def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
"""
Convert EPUB file to Markdown
Args:
epub_path: Path to the EPUB file
output_dir: Directory to save the output (default: "output")
"""
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get the base filename without extension
epub_file = Path(epub_path)
base_name = epub_file.stem
print(f"Converting {epub_path} to Markdown...")
# Read the EPUB file
book = epub.read_epub(epub_path)
# Extract all text content
chapters = []
images = {}
image_counter = 0
for item in book.get_items():
if item.get_type() == ebooklib.ITEM_DOCUMENT:
# Get HTML content
html_content = item.get_content().decode('utf-8')
# Parse with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
# Convert to markdown-like format
markdown_content = html_to_markdown(soup)
# Clean up the markdown
markdown_content = markdown_content.strip()
if markdown_content:
chapters.append(markdown_content)
elif item.get_type() == ebooklib.ITEM_IMAGE:
# Save image
image_counter += 1
img_name = item.get_name().split('/')[-1]
if not img_name:
img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"
images[img_name] = item.get_content()
# Combine all chapters
full_markdown = "\n\n---\n\n".join(chapters)
# Save as markdown
output_path = os.path.join(output_dir, f"{base_name}.md")
with open(output_path, "w", encoding="utf-8") as f:
f.write(full_markdown)
print(f"OK Conversion complete!")
print(f"OK Output saved to: {output_path}")
print(f"OK Total chapters: {len(chapters)}")
# Save images if any
if images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
for img_name, img_data in images.items():
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f"OK {len(images)} images saved to: {images_dir}")
# Save metadata if available
metadata = {
'title': book.get_metadata('DC', 'title'),
'creator': book.get_metadata('DC', 'creator'),
'language': book.get_metadata('DC', 'language'),
'publisher': book.get_metadata('DC', 'publisher'),
'description': book.get_metadata('DC', 'description'),
}
if any(metadata.values()):
metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
with open(metadata_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f"OK Metadata saved to: {metadata_path}")
if __name__ == "__main__":
# Convert the EPUB file in the input directory
epub_path = "input/the-art-of-spending-money.epub"
convert_epub_to_markdown(epub_path)

convert_pdfs.py

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""
Batch PDF to Markdown converter using marker-pdf library
"""
import os
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
"""
Convert PDF file to Markdown
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save the output (default: "output")
"""
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get the base filename without extension
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
print(f"\nConverting {pdf_file.name} to Markdown...")
try:
# Initialize the converter with model dictionary
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
# Convert the PDF file
rendered = converter(pdf_path)
# Extract text and images from rendered output
text, metadata, images = text_from_rendered(rendered)
# Save as markdown
output_path = os.path.join(output_dir, f"{base_name}.md")
with open(output_path, "w", encoding="utf-8") as f:
f.write(text)
print(f" OK Output saved to: {output_path}")
# Save images if any
if images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
for img_name, img_data in images.items():
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f" OK {len(images)} images saved to: {images_dir}")
# Save metadata if available
if metadata:
metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
import json
with open(metadata_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f" OK Metadata saved to: {metadata_path}")
return True
except Exception as e:
print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
return False
def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
"""
Convert all PDF files in the input directory to Markdown
Args:
input_dir: Directory containing PDF files
output_dir: Directory to save the output
"""
# Find all PDF files
pdf_pattern = os.path.join(input_dir, "*.pdf")
pdf_files = sorted(glob.glob(pdf_pattern))
if not pdf_files:
print(f"No PDF files found in {input_dir}")
return
print(f"Found {len(pdf_files)} PDF files to convert")
print("=" * 60)
successful = 0
failed = 0
for pdf_file in pdf_files:
if convert_pdf_to_markdown(pdf_file, output_dir):
successful += 1
else:
failed += 1
print("\n" + "=" * 60)
print(f"Conversion complete!")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total: {len(pdf_files)}")
if __name__ == "__main__":
convert_all_pdfs()

convert_pdfs_fast.py

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Fast PDF to Markdown converter - optimized for text-heavy documents
"""
import argparse
import os
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from marker.config.parser import ConfigParser
def convert_pdf_to_markdown_fast(pdf_path: str, output_dir: str = "output", languages: str = None):
"""
Convert PDF file to Markdown with speed optimizations for text-heavy documents
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save the output (default: "output")
languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
"""
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get the base filename without extension
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
print(f"\nConverting {pdf_file.name} to Markdown...")
if languages:
print(f" Languages: {languages}")
try:
# Configure for speed - text-focused processing
config = {
"output_format": "markdown",
# Disable image extraction for speed (images won't be saved separately)
# "disable_image_extraction": True, # Uncomment if you want to skip all images
}
if languages:
config["languages"] = languages.split(",")
config_parser = ConfigParser(config)
# Initialize the converter with optimized settings
converter = PdfConverter(
config=config_parser.generate_config_dict(),
artifact_dict=create_model_dict(),
processor_list=config_parser.get_processors(),
renderer=config_parser.get_renderer(),
)
# Convert the PDF file
rendered = converter(pdf_path)
# Extract text and images from rendered output
text, metadata, images = text_from_rendered(rendered)
# Save as markdown
output_path = os.path.join(output_dir, f"{base_name}.md")
with open(output_path, "w", encoding="utf-8") as f:
f.write(text)
print(f" OK Output saved to: {output_path}")
# Save images
if images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
for img_name, img_data in images.items():
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f" OK {len(images)} images saved to: {images_dir}")
# Skip metadata saving for speed
# if metadata:
# metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
# import json
# with open(metadata_path, "w", encoding="utf-8") as f:
# json.dump(metadata, f, indent=2, ensure_ascii=False)
# print(f" OK Metadata saved to: {metadata_path}")
return (True, pdf_file.name)
except Exception as e:
print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
return (False, pdf_file.name)
def convert_all_pdfs_fast(input_dir: str = "input", output_dir: str = "output", languages: str = None):
"""
Convert all PDF files in the input directory to Markdown (sequential, memory-safe)
Args:
input_dir: Directory containing PDF files
output_dir: Directory to save the output
languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
"""
# Find all PDF files
pdf_pattern = os.path.join(input_dir, "*.pdf")
pdf_files = sorted(glob.glob(pdf_pattern))
if not pdf_files:
print(f"No PDF files found in {input_dir}")
return
print(f"Found {len(pdf_files)} PDF files to convert")
print("Mode: FAST (text-focused, sequential processing)")
if languages:
print(f"Languages: {languages}")
print("=" * 60)
successful = 0
failed = 0
failed_files = []
for i, pdf_file in enumerate(pdf_files, 1):
print(f"\n[{i}/{len(pdf_files)}]", end=" ")
success, filename = convert_pdf_to_markdown_fast(pdf_file, output_dir, languages)
if success:
successful += 1
else:
failed += 1
failed_files.append(filename)
print("\n" + "=" * 60)
print(f"Conversion complete!")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total: {len(pdf_files)}")
if failed_files:
print(f"\nFailed files:")
for filename in failed_files:
print(f" - {filename}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Fast PDF to Markdown converter")
parser.add_argument("--input_dir", default="input", help="Input directory containing PDF files")
parser.add_argument("--output_dir", default="output", help="Output directory for markdown files")
parser.add_argument("--languages", default=None, help='Comma-separated language codes for OCR (e.g. "ko" or "ko,en")')
args = parser.parse_args()
convert_all_pdfs_fast(args.input_dir, args.output_dir, args.languages)

convert_pdfs_parallel.py

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
"""
Batch PDF to Markdown converter with parallel processing using marker-pdf library
"""
import os
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
"""
Convert PDF file to Markdown
Args:
pdf_path: Path to the PDF file
output_dir: Directory to save the output (default: "output")
"""
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Get the base filename without extension
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
print(f"\nConverting {pdf_file.name} to Markdown...")
try:
# Initialize the converter with model dictionary
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
# Convert the PDF file
rendered = converter(pdf_path)
# Extract text and images from rendered output
text, metadata, images = text_from_rendered(rendered)
# Save as markdown
output_path = os.path.join(output_dir, f"{base_name}.md")
with open(output_path, "w", encoding="utf-8") as f:
f.write(text)
print(f" OK Output saved to: {output_path}")
# Save images if any
if images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
for img_name, img_data in images.items():
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f" OK {len(images)} images saved to: {images_dir}")
# Save metadata if available
if metadata:
metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
import json
with open(metadata_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
print(f" OK Metadata saved to: {metadata_path}")
return (True, pdf_file.name)
except Exception as e:
print(f" ERROR: Failed to convert {pdf_file.name}: {e}")
return (False, pdf_file.name)
def convert_all_pdfs_parallel(input_dir: str = "input", output_dir: str = "output", max_workers: int = None):
"""
Convert all PDF files in the input directory to Markdown using parallel processing
Args:
input_dir: Directory containing PDF files
output_dir: Directory to save the output
max_workers: Maximum number of parallel workers (default: CPU count - 1)
"""
# Find all PDF files
pdf_pattern = os.path.join(input_dir, "*.pdf")
pdf_files = sorted(glob.glob(pdf_pattern))
if not pdf_files:
print(f"No PDF files found in {input_dir}")
return
# Determine number of workers
if max_workers is None:
max_workers = max(1, multiprocessing.cpu_count() - 1)
print(f"Found {len(pdf_files)} PDF files to convert")
print(f"Using {max_workers} parallel workers")
print("=" * 60)
successful = 0
failed = 0
failed_files = []
# Process PDFs in parallel
with ProcessPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks
future_to_pdf = {
executor.submit(convert_pdf_to_markdown, pdf_file, output_dir): pdf_file
for pdf_file in pdf_files
}
# Process completed tasks as they finish
for future in as_completed(future_to_pdf):
pdf_file = future_to_pdf[future]
try:
success, filename = future.result()
if success:
successful += 1
else:
failed += 1
failed_files.append(filename)
except Exception as e:
print(f" ERROR: Exception occurred for {pdf_file}: {e}")
failed += 1
failed_files.append(Path(pdf_file).name)
print("\n" + "=" * 60)
print(f"Conversion complete!")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total: {len(pdf_files)}")
if failed_files:
print(f"\nFailed files:")
for filename in failed_files:
print(f" - {filename}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Parallel PDF to Markdown converter")
parser.add_argument("--input_dir", default="input", help="Input directory containing PDF files")
parser.add_argument("--output_dir", default="output", help="Output directory for markdown files")
parser.add_argument("--workers", type=int, default=None, help="Number of parallel workers (default: CPU count - 1)")
args = parser.parse_args()
convert_all_pdfs_parallel(args.input_dir, args.output_dir, args.workers)


@@ -0,0 +1,310 @@
#!/usr/bin/env python3
"""
PDF to Markdown converter with cropped figure extraction
Uses marker-pdf to detect figures, then crops them from page images.
Supports 2-column (multi-column) → single-column reordering.
"""
import os
import re
import glob
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from PIL import Image
import fitz # PyMuPDF
def is_scanned_pdf(pdf_path: str, sample_pages: int = 3) -> bool:
"""Treat a PDF as scanned when its sampled pages contain no selectable text"""
doc = fitz.open(pdf_path)
total = min(sample_pages, len(doc))
text_chars = 0
for i in range(total):
text_chars += len(doc[i].get_text().strip())
doc.close()
return text_chars < 50 # very few extractable characters -> assume a scanned copy
def reorder_text_by_columns(pdf_path: str) -> str:
"""
Text-based PDFs only: reorder 2-column pages into a single column using
PyMuPDF block coordinates. Each page is read left column first, then right column.
"""
doc = fitz.open(pdf_path)
pages_text = []
for page in doc:
blocks = page.get_text("blocks", sort=False)
text_blocks = [b for b in blocks if b[6] == 0 and b[4].strip()]
if not text_blocks:
continue
page_width = page.rect.width
mid_x = page_width / 2
left = [b for b in text_blocks if b[2] <= mid_x + 30]
right = [b for b in text_blocks if b[0] >= mid_x - 30]
span = [b for b in text_blocks if b[0] < mid_x - 30 and b[2] > mid_x + 30]
is_two_col = len(left) >= 2 and len(right) >= 2 and not span
if is_two_col:
left.sort(key=lambda b: b[1])
right.sort(key=lambda b: b[1])
ordered = left + right
else:
ordered = sorted(text_blocks, key=lambda b: (b[1], b[0]))
pages_text.append("\n\n".join(b[4].strip() for b in ordered))
doc.close()
return "\n\n---\n\n".join(pages_text)
def extract_figure_images(pdf_path: str, rendered, output_dir: str, base_name: str):
"""
Extract figure images by cropping from page images based on marker's detection
Args:
pdf_path: Path to PDF file
rendered: Marker's rendered output with figure positions
output_dir: Output directory
base_name: Base filename
Returns:
dict: Mapping of image names to image data
"""
images_dict = {}
# Check if rendered has pages with image information
if not hasattr(rendered, 'pages') or not rendered.pages:
print(" No page information in rendered output")
return images_dict
# Open PDF with PyMuPDF to render pages as images
doc = fitz.open(pdf_path)
print(f" Processing {len(rendered.pages)} pages for figure extraction...")
for page_idx, page_data in enumerate(rendered.pages):
page_num = page_idx + 1
# Check if page has images/figures
if not hasattr(page_data, 'images') or not page_data.images:
continue
print(f" Page {page_num}: Found {len(page_data.images)} figure(s)")
# Render page as image
pdf_page = doc[page_idx]
# Render at 2x resolution for better quality
mat = fitz.Matrix(2, 2)
pix = pdf_page.get_pixmap(matrix=mat)
# Convert to PIL Image
import io
img_data = pix.tobytes("png")
page_img = Image.open(io.BytesIO(img_data))
# Extract each figure from this page
for fig_idx, fig_info in enumerate(page_data.images):
try:
# Get bounding box (marker stores positions)
if hasattr(fig_info, 'bbox'):
bbox = fig_info.bbox
# Scale bbox coordinates (marker uses PDF coordinates)
# Adjust for 2x rendering
x0, y0, x1, y1 = bbox
x0, y0, x1, y1 = int(x0 * 2), int(y0 * 2), int(x1 * 2), int(y1 * 2)
# Crop the figure
cropped = page_img.crop((x0, y0, x1, y1))
# Save to bytes
from io import BytesIO
img_bytes = BytesIO()
cropped.save(img_bytes, format='PNG')
# Generate image name
img_name = f"_page_{page_num}_Figure_{fig_idx + 1}.png"
images_dict[img_name] = img_bytes.getvalue()
print(f" Cropped figure {fig_idx + 1}: {x1-x0}x{y1-y0}px")
except Exception as e:
print(f" Warning: Could not crop figure {fig_idx + 1}: {e}")
doc.close()
return images_dict
def convert_pdf_with_cropped_images(pdf_path: str, output_dir: str = "output"):
"""
Convert PDF to Markdown with cropped figure images.
- Scanned PDFs: marker-pdf OCR + layout detection (handles 2 columns automatically)
- Text-based PDFs: 2-column to 1-column reordering via PyMuPDF block coordinates
"""
import io
os.makedirs(output_dir, exist_ok=True)
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
print(f"\nConverting {pdf_file.name}...")
scanned = is_scanned_pdf(pdf_path)
print(f" PDF type: {'scanned (OCR)' if scanned else 'text-based (PyMuPDF column reorder)'}")
try:
if not scanned:
# Text-based PDF: extract with PyMuPDF, reordering 2-column pages
print(" Extracting text with column reordering...")
text = reorder_text_by_columns(pdf_path)
metadata = None
marker_images = {}
rendered = None # marker is not run on this path
else:
# Scanned PDF: marker-pdf handles OCR and (2-column) layout detection
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
print(" Running marker-pdf OCR and layout detection...")
rendered = converter(pdf_path)
text, metadata, marker_images = text_from_rendered(rendered)
# Fix image paths: prepend the {base_name}_images/ folder to image references
# Encode spaces as %20 to avoid path-parsing errors in Obsidian (CommonMark)
safe_base_name = base_name.replace(' ', '%20')
text = re.sub(
r'!\[([^\]]*)\]\(([^/)][^)]*)\)',
rf'![\1]({safe_base_name}_images/\2)',
text
)
# Save markdown
output_path = os.path.join(output_dir, f"{base_name}.md")
with open(output_path, "w", encoding="utf-8") as f:
f.write(text)
print(f" OK Markdown saved: {output_path}")
# Extract cropped figure images (marker layout output exists only for scanned PDFs)
if scanned:
print(" Extracting figures from pages...")
cropped_images = extract_figure_images(pdf_path, rendered, output_dir, base_name)
else:
cropped_images = {}
if cropped_images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
for img_name, img_data in cropped_images.items():
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f" OK {len(cropped_images)} figures saved to: {images_dir}")
else:
print(" ! No figures extracted (trying alternative method...)")
# Fallback: use marker's images if available
if marker_images:
images_dir = os.path.join(output_dir, f"{base_name}_images")
os.makedirs(images_dir, exist_ok=True)
saved_count = 0
for img_name, img_data in marker_images.items():
try:
from io import BytesIO
if isinstance(img_data, Image.Image):
img_bytes = BytesIO()
img_data.save(img_bytes, format='PNG')
img_bytes = img_bytes.getvalue()
else:
img_bytes = img_data
if img_bytes and len(img_bytes) > 0:
img_path = os.path.join(images_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_bytes)
saved_count += 1
except Exception as e:
print(f" Warning: Could not save {img_name}: {e}")
if saved_count > 0:
print(f" OK {saved_count} images from marker saved")
else:
print(" ! No valid images to save")
# Save metadata
if metadata:
import json
metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
with open(metadata_path, "w", encoding="utf-8") as f:
json.dump(metadata, f, indent=2, ensure_ascii=False)
return True
except Exception as e:
print(f" ERROR: {e}")
import traceback
traceback.print_exc()
return False
def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
"""
Convert all PDFs with cropped figure extraction
Each PDF is converted in a separate process to avoid multiprocessing issues
"""
pdf_pattern = os.path.join(input_dir, "*.pdf")
pdf_files = sorted(glob.glob(pdf_pattern))
if not pdf_files:
print(f"No PDF files found in {input_dir}")
return
print(f"Found {len(pdf_files)} PDF file(s)")
print("=" * 60)
successful = 0
failed = 0
import subprocess
import sys
for pdf_file in pdf_files:
print(f"\nStarting conversion of: {os.path.basename(pdf_file)}")
result = subprocess.run(
[sys.executable, __file__, "--single", pdf_file, output_dir],
capture_output=False
)
if result.returncode == 0:
successful += 1
else:
failed += 1
print(f" FAILED: {os.path.basename(pdf_file)}")
print("\n" + "=" * 60)
print(f"Conversion complete!")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total: {len(pdf_files)}")
if __name__ == "__main__":
import sys
# Check if running in single-file mode (called by subprocess)
if len(sys.argv) >= 4 and sys.argv[1] == "--single":
pdf_file = sys.argv[2]
output_dir = sys.argv[3]
success = convert_pdf_with_cropped_images(pdf_file, output_dir)
sys.exit(0 if success else 1)
else:
# Normal batch mode
convert_all_pdfs()
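The image-path rewrite in the file above can be exercised in isolation. This sketch reuses the same regex and %20 encoding; the sample link text and file names are invented for demonstration.

```python
import re

def fix_image_paths(text: str, base_name: str) -> str:
    """Prepend {base_name}_images/ to relative image links, encoding
    spaces in the base name as %20 (same pattern as the converter)."""
    safe_base_name = base_name.replace(' ', '%20')
    return re.sub(
        r'!\[([^\]]*)\]\(([^/)][^)]*)\)',
        rf'![\1]({safe_base_name}_images/\2)',
        text,
    )
```

Note that `[^/)]` only excludes links whose target starts with `/`, so absolute paths pass through untouched while every relative reference is redirected into the images folder.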

debug_marker_images.py

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
Debug marker-pdf image extraction
"""
import os
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
def debug_image_extraction(pdf_path: str):
"""
Debug why images are not being extracted properly
"""
pdf_file = Path(pdf_path)
print(f"Debugging image extraction for: {pdf_file.name}")
print("=" * 60)
try:
# Initialize converter
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
# Convert
print("\nConverting PDF...")
rendered = converter(pdf_path)
print(f" Rendered type: {type(rendered)}")
print(f" Rendered attributes: {dir(rendered)}")
# Check what's in rendered
if hasattr(rendered, 'images'):
print(f"\n rendered.images exists: {len(rendered.images) if rendered.images else 0} images")
if rendered.images:
for idx, (key, val) in enumerate(list(rendered.images.items())[:3]):
print(f" Image {idx}: {key}, data size: {len(val) if val else 0}")
# Extract text and images
print("\nExtracting text and images...")
text, metadata, images = text_from_rendered(rendered)
print(f"\n Text length: {len(text)} characters")
print(f" Metadata: {type(metadata)}")
print(f" Images dict: {len(images) if images else 0} items")
if images:
print("\n Detailed image info:")
for idx, (img_name, img_data) in enumerate(images.items()):
print(f" {idx + 1}. Name: {img_name}")
print(f" Data type: {type(img_data)}")
print(f" Data size: {len(img_data) if img_data else 0} bytes")
if img_data:
print(f" First 20 bytes: {img_data[:20]}")
else:
print(f" WARNING: Empty data!")
else:
print("\n WARNING: No images returned!")
# Check rendered object for image data
print("\n Checking rendered object structure:")
if hasattr(rendered, '__dict__'):
for key, val in rendered.__dict__.items():
if 'image' in key.lower():
print(f" {key}: {type(val)}, length: {len(val) if hasattr(val, '__len__') else 'N/A'}")
# Try to access images directly from rendered
if hasattr(rendered, 'images') and rendered.images:
print("\n Attempting direct image access:")
print(f" Total images in rendered: {len(rendered.images)}")
for idx, (img_name, img_obj) in enumerate(list(rendered.images.items())[:3]):
print(f"\n Image {idx + 1}: {img_name}")
print(f" Type: {type(img_obj)}")
print(f" Attributes: {dir(img_obj) if hasattr(img_obj, '__dir__') else 'None'}")
if hasattr(img_obj, 'tobytes'):
img_bytes = img_obj.tobytes()
print(f" Bytes: {len(img_bytes)}")
elif hasattr(img_obj, 'save'):
print(f" Has save method (PIL Image?)")
except Exception as e:
print(f"\n ERROR: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
# Debug the first PDF in input folder
import glob
pdf_files = glob.glob("input/*.pdf")
if pdf_files:
debug_image_extraction(pdf_files[0])
else:
print("No PDF files found in input folder")

debug_single_page.py

@@ -0,0 +1,99 @@
#!/usr/bin/env python3
"""
Debug image extraction for a single page
"""
import os
from pathlib import Path
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
import pypdfium2 as pdfium
def debug_single_page(pdf_path: str, page_num: int = 1):
"""
Debug image extraction for a specific page (page_num is 1-indexed)
"""
pdf_file = Path(pdf_path)
print(f"Debugging page {page_num} of: {pdf_file.name}")
print("=" * 60)
# First check what PyPDFium2 sees
print("\n1. Checking with PyPDFium2:")
try:
pdf = pdfium.PdfDocument(pdf_path)
page = pdf[page_num - 1] # 0-indexed
print(f" Page {page_num} objects:")
obj_count = 0
for obj in page.get_objects():
obj_count += 1
if hasattr(pdfium, 'FPDF_PAGEOBJ_IMAGE'):
if obj.type == pdfium.FPDF_PAGEOBJ_IMAGE:
print(f" - Image object found (old API)")
else:
print(f" - Object type: {obj.type}")
print(f" Total objects on page: {obj_count}")
pdf.close()
except Exception as e:
print(f" PyPDFium2 error: {e}")
# Now check marker-pdf
print("\n2. Checking with marker-pdf:")
try:
converter = PdfConverter(
artifact_dict=create_model_dict(),
)
print(" Converting...")
rendered = converter(pdf_path)
# Check rendered object
print(f"\n Rendered type: {type(rendered)}")
if hasattr(rendered, 'images'):
print(f" rendered.images: {len(rendered.images) if rendered.images else 0} images")
if rendered.images:
for img_name, img_data in list(rendered.images.items())[:5]:
print(f" - {img_name}: {len(img_data) if img_data else 0} bytes")
# Extract using text_from_rendered
print("\n3. Extracting with text_from_rendered:")
text, metadata, images = text_from_rendered(rendered)
print(f" Extracted images: {len(images) if images else 0}")
if images:
for img_name, img_data in images.items():
print(f" - {img_name}: {len(img_data) if img_data else 0} bytes")
if not img_data or len(img_data) == 0:
print(f" ⚠️ WARNING: Empty image data!")
# Save a test image if available
if images:
output_dir = "output/debug_test"
os.makedirs(output_dir, exist_ok=True)
for img_name, img_data in images.items():
if img_data and len(img_data) > 0:
img_path = os.path.join(output_dir, img_name)
with open(img_path, "wb") as f:
f.write(img_data)
print(f"\n ✓ Saved test image: {img_path}")
break
except Exception as e:
print(f" marker-pdf error: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
import glob
pdf_files = glob.glob("input/*.pdf")
if pdf_files:
# Test page 2 (should have Figure 1.2, 1.3 according to the markdown)
debug_single_page(pdf_files[0], page_num=2)
else:
print("No PDF files found in input folder")

docs/history/.gitkeep Normal file

@@ -0,0 +1,33 @@
**Issue**: #1
**Time spent**: 90 min
**Context usage**: input 80k / output 10k tokens
## Work Summary
- Replaced the CPU build of torch (2.9.1) with the GPU build (2.7.0+cu126) to enable RTX 3060 GPU OCR
- Converted pages 01–12 of the 96-page MSEW3.0 manual to MD
- Inserted parameter descriptions beneath the image references in the converted 01–12 MD files
- The remaining pages 13–96 will be converted overnight via a PowerShell script
## Changed Files
- `.venv` : replaced torch 2.9.1+cpu → 2.7.0+cu126
- `output/MSWE3.0 Manual-01~12.md` : generated
- `output/MSWE3.0 Manual-03,04,09,10,11,12.md` : parameter descriptions inserted under images
## Key Decisions
- When installing torch with pip, use `.venv/Scripts/python.exe -m pip` so it lands in the correct venv (a bare pip command installed into a different venv)
- Parallel conversion (2 at a time) risks GPU overload → switched to sequential, one file at a time
- Overnight conversion: use a PowerShell script that automatically skips already-completed files
## Extracted Parameters (added for 03–12)
- Available Connection Strength: Confining Stress Sigma [kPa], CRu-1, CRs-1, GEOGRID types (1.4T–5.15T)
- Project Identification: Project Title (required), Designer (required), Date/Time, Company/Firm, Project No.
- Program Manager main screen: AASHTO (ASD/LRFD) / NCMA design-mode selection
- Reinforcement Layout: LAYER#, Geogrid Height [m], Geogrid Type#, Vertical distance [m]
- FACIA (Blocks): Depth Wu [m]=0.3, Height Hu [m]=0.2, Unit weight γ [kN/m³]=24, Gu [m]=0.15
- Reduction factors at connection: RF d (durability), RF c (creep), BREAK Overall Fs=1.5, PULLOUT Overall Fs=1.5, seismic Tc-pullout reduction=80%
- Results screen: Final Values for Design, per-layer strength/connection check results
## Problems & Fixes
- `pip install` went into the docuConverter01 venv → fixed by using `python -m pip install`
- torch 2.6.0 is incompatible with marker-pdf (requires >=2.7.0) → used 2.7.0+cu126
- Parallel runs overloaded the GPU → switched to sequential processing
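The venv mix-up described above is easy to detect from inside Python before installing anything — a minimal stdlib check (the `.venv` folder name is an assumption taken from this project's layout):

```python
import os
import sys

# When launched as ".venv/Scripts/python.exe -m pip ...", sys.executable
# points into .venv — which is exactly why that invocation installs into
# the right environment, while a bare "pip" on PATH may not.
print(sys.executable)

def running_in_project_venv() -> bool:
    # Hypothetical guard for a setup script: true when the interpreter
    # path contains a folder literally named ".venv".
    return ".venv" in os.path.abspath(sys.executable).split(os.sep)

print(running_in_project_venv())
```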


@@ -0,0 +1,25 @@
**Issue**: #1
**Time spent**: ~30 min (estimated)
**Context usage**: input ~180k / output ~3k tokens (estimated — exact figures could not be recorded due to context overflow)
## Work Summary
Attempted parameter analysis of MSEW3.0 Manual images 31–47.
While reading some of the images for pages 31–33 with the Read tool, the session was force-terminated after exceeding the context limit.
No actual insertions (Edit) were made to the MD files — everything was reprocessed in the next session.
## Changed Files
- None (terminated before reaching any Edit due to context overflow)
## Images Read (insertion incomplete)
- `output/MSWE3.0 Manual-31_images/` : Figure_1, Figure_3, Picture_5, Picture_13 (4)
- `output/MSWE3.0 Manual-32_images/` : Figure_1, Figure_8, Picture_22 (3)
- `output/MSWE3.0 Manual-33_images/` : Figure_2 (1)
- Total of 8 images read; the remaining 55 were left unprocessed
## Key Decisions
- Each image Read consumes roughly 15–20k input tokens → even 8 images make the context balloon
- Switched to batch-reprocessing all 31–47 images in the next session
## Problems & Fixes
- Multimodal image analysis consumes context very quickly
- The next session (`2026-03-31_MSEW매뉴얼31-47파라미터삽입.md`) completed all of 31–47


@@ -0,0 +1,34 @@
**Issue**: #1
**Time spent**: 40 min
**Context usage**: input 120k / output 8k tokens
## Work Summary
Inserted image parameter descriptions into MSEW3.0 Manual MD files 31–47 (resuming the work aborted by context overflow in the previous session)
## Changed Files
- `output/MSWE3.0 Manual-31.md` : Figure_1 (internal-K selection), Figure_3 (Coulomb δ), Picture_5 (external-K δ note), Picture_13 (wrap-around vertical spacing)
- `output/MSWE3.0 Manual-32.md` : Figure_1 (Metal Strip Program Manager), Figure_8 (Simple Geometry), Picture_22 (embedment depth)
- `output/MSWE3.0 Manual-33.md` : Figure_2 (Complex Structures), Figure_9 (Foundation Soil Properties), Figure_12 (Metal Strip Design main screen)
- `output/MSWE3.0 Manual-34.md` : Figure_6 (uniform spacing), Figure_7 (calculation progress), Figure_9 (corrosion-thickness NOTE), Figure_11 (horizontal-spacing range)
- `output/MSWE3.0 Manual-35.md` : Figure_2 (number of reinforcement types), Figure_4 (data for two reinforcement types), Figure_6 (layout table), Figure_9 (Metal Strip Data), Figure_10 (interaction parameters)
- `output/MSWE3.0 Manual-36.md` : Figure_9 (earth-pressure coefficient variation), Picture_11 (Fw placement), Picture_13 (external-K selection)
- `output/MSWE3.0 Manual-37.md` : Figure_0 (panel properties), Figure_5 (connection-strength relation), Figure_7 (connection reduction factors)
- `output/MSWE3.0 Manual-38.md` : Figure_1 (seismic-load design), Figure_6 (stratum defaults)
- `output/MSWE3.0 Manual-39.md` : Figure_0 (first stratum), Figure_2 (second stratum), Picture_8 (results screen), Picture_12 (compound-stability icon)
- `output/MSWE3.0 Manual-40.md` : Figure_0 (compound-stability defaults), Figure_7 (analysis-criterion selection), Figure_13 (SearchGrid), Figure_18 (Bishop progress)
- `output/MSWE3.0 Manual-41.md` : Figure_0 (deep-seated failure toggle), Figure_2 (deep-seated failure search), Picture_13 (intermediate-results icon), Figure_18 (external/internal intermediate results)
- `output/MSWE3.0 Manual-42.md` : Figure_6 (bearing capacity, static), Figure_14 (bearing capacity, seismic), Figure_17 (sliding, static)
- `output/MSWE3.0 Manual-43.md` : Figure_1 (sliding, seismic 1), Figure_2 (sliding, seismic 2), Figure_4 (sliding detail diagram), Figure_6 (sliding minimum length)
- `output/MSWE3.0 Manual-44.md` : Figure_0 (eccentricity, static), Figure_2 (eccentricity, seismic), Figure_4 (eccentricity detail diagram), Figure_6 (eccentricity minimum length)
- `output/MSWE3.0 Manual-45.md` : Picture_2 (Geotextile intermediate-results icon), Figure_3 (internal-strength results table), Figure_15 (seismic strength results), Figure_17 (Tmax distribution)
- `output/MSWE3.0 Manual-46.md` : Figure_1 (Tmax horizontal-stress distribution), Figure_4 (connection, static), Picture_9 (connection safety-factor detail), Figure_11 (connection, seismic)
- `output/MSWE3.0 Manual-47.md` : Figure_0 (pullout, static), Figure_7 (pullout, seismic 1), Figure_8 (pullout, seismic 2), Figure_11 (final design results)
## Key Decisions
- Analyzed all 63 images multimodally with the Read tool, then inserted the descriptions into each MD file
- Pages 31–47 cover Metal Strip design (31–38), global stability analysis (38–41), and results review (41–47)
- Result screens (42–47) are described by output column names rather than input parameters
## Problems & Fixes
- The previous session had already read the four Manual-31 images and Manual-33 Figure_2, but stopped before insertion due to context overflow
- This session analyzed the remaining images (33 Figure_9 through all of 47) with parallel Reads, then inserted them sequentially


@@ -0,0 +1,42 @@
**Issue**: #1
**Time spent**: ~90 min
**Context usage**: input ~220k / output ~15k tokens
## Work Summary
Inserted image parameter descriptions into MSEW3.0 Manual MD files 60–83.
Resumed from the previous session (terminated on context overflow); at the user's request, started from page 60.
Also retroactively wrote the previous session's missing history (`2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md`).
## Changed Files
- `output/MSWE3.0 Manual-60.md` : Figure_0 (external stability analysis), Figure_6 (internal stability table), Figure_15 (Tmax distribution)
- `output/MSWE3.0 Manual-61.md` : Figure_0 (Tmax detail), Figure_2 (ideal-values button), Figure_3 (target Fs input), Figure_4 (ideal-values table), Figure_7 (connection analysis)
- `output/MSWE3.0 Manual-62.md` : Figure_5 (pullout-resistance table), Figure_8 (pullout detail), Picture_14 (Global Stability button)
- `output/MSWE3.0 Manual-63.md` : Figure_0 (compound-stability defaults), Picture_7 (analysis-method selection), Figure_11 (search grid), Picture_16 (Bishop start confirmation), Picture_18 (deep-seated failure toggle)
- `output/MSWE3.0 Manual-64.md` : Figure_1 (deep-seated search grid), Figure_7 (compound results table), Picture_12 (contour distribution), Figure_14 (3D distribution)
- `output/MSWE3.0 Manual-65.md` : Figure_1 (failure-circle diagram), Figure_3 (reinforcement contribution), Figure_5 (tensile-force distribution), Picture_9 (deep-seated contour), Figure_11 (deep-seated 3D)
- `output/MSWE3.0 Manual-66.md` : Figure_0 (deep-seated failure circle), Figure_2 (deep-seated reinforcement contribution), Figure_4 (tensile-force distribution), Figure_7 (seismic results), Picture_9 (critical circle, seismic)
- `output/MSWE3.0 Manual-67.md` : Figure_20 (facing-block data)
- `output/MSWE3.0 Manual-68.md` : Figure_3 (connection reduction factors), Figure_8 (connection-strength input), Figure_12 (shear-resistance input)
- `output/MSWE3.0 Manual-69.md` : Figure_0 (geogrid analysis menu), Picture_4 (number of reinforcement types), Picture_6 (reinforcement data), Figure_8 (per-layer layout input)
- `output/MSWE3.0 Manual-70.md` : Picture_6 (internal earth-pressure coefficient note), Figure_8 (external earth-pressure coefficient note), Figure_11 (results main screen), Figure_19 (bearing-capacity results)
- `output/MSWE3.0 Manual-71.md` : Figure_8 (bearing capacity, seismic), Figure_11 (sliding results table), Figure_20 (sliding static detail), Figure_22 (sliding seismic detail)
- `output/MSWE3.0 Manual-72.md` : Figure_2 (eccentricity results table), Figure_5 (eccentricity static detail), Figure_7 (eccentricity seismic detail), Figure_13 (internal stability results)
- `output/MSWE3.0 Manual-73.md` : Figure_0 (Tmax distribution 1), Figure_2 (Tmax distribution 2), Figure_5 (ideal-values button), Figure_6 (target Fs input), Figure_7 (ideal-values table)
- `output/MSWE3.0 Manual-74.md` : Figure_0 (connection results table), Picture_7 (connection Fs summary), Figure_9 (bulging table), Figure_11 (hinge height)
- `output/MSWE3.0 Manual-75.md` : Figure_1 (maximum unreinforced height), Figure_5 (pullout results table), Figure_8 (pullout detail table)
- `output/MSWE3.0 Manual-76.md` : Picture_0 (results main), Figure_1 (save-as-text button), Picture_5 (Print Preview 1), Picture_6 (Print Preview 2), Figure_11 (save-as-bitmap)
- `output/MSWE3.0 Manual-80.md` : Figure_5 (connection-force ratio table), Figure_6 (connection-force graph)
- `output/MSWE3.0 Manual-81.md` : Picture_8 (reinforcement layout drawing)
- `output/MSWE3.0 Manual-83.md` : Picture_5 (sloped-backfill layout drawing)
- `docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md` : reconstructed the missing history (estimated figures)
## Key Decisions
- Skipped pages 77–79, 82, and 84–86 (no _images folder)
- Result screens (analysis tables, diagrams) described by output column names and their meaning
- Input dialogs described by parameter name, unit, and sample value
- Pages 60–76 are mostly NCMA-method Geogrid/Geotextile results screens
- Pages 80–83 cover layout drawings and summary graphs for the text output files
## Problems & Fixes
- The first insertion attempt for Manual-65 Picture_9/Figure_11 failed on a blank-line mismatch → re-read the file and retried with the exact string, which succeeded
- Files only exist up to 86 (the user said 98, but the last converted file is 86)


@@ -0,0 +1,31 @@
**Issue**: #1
**Time spent**: 30 min
**Context usage**: input 45k / output 6k tokens
## Work Summary
Analyzed the images extracted from three sample pages of the MSEW3.0 manual (06, 07, 08) directly with Claude Code's Read tool (multimodal), and inserted parameter names and sample values right below each image reference in the MD files.
Verified the workflow of doing this with a Claude Code subscription alone, without an API key.
## Changed Files
- `output/MSWE3.0 Manual-06.md` : parameter descriptions inserted under 3 images
- `output/MSWE3.0 Manual-07.md` : parameter descriptions inserted under 1 image
- `output/MSWE3.0 Manual-08.md` : parameter descriptions inserted under 3 images
## Key Decisions
- Instead of a Python script, Claude Code performs Read (image) → Edit (MD) directly
- No API key needed — image analysis works with a Claude Code subscription
- Insertion format: `> **[Screen name]** \n> - \`parameter name\`: sample value`
- Roughly 15–20 pages can be processed per session (context limit)
## Extracted Parameters
- Main menu: General Information, Geometry and Surcharge, Soil Data, Reinforcement (Geogrid), FACIA (Blocks), Seismic Parameters, Strata for Global Stability Analysis, Target Performance Criteria
- Wall Embedment: Type in front of wall, Embedded depth E [m]
- Geometry/Surcharge: Height H [m], BackSlope [deg], Batter, BackSlope ris [m]
- Geogrid Design: Le [m], L/Hd, L [m], reinforcement-length options (Uniform/Minimum), strength/spacing options, Internal/External Stability K
- Reinforcement Types: Number of reinforcement types
- Geogrid DB: Product Name, Ultimate Tensile Strength [kN/m], Strength Reduction factors
- Reinforcement Layout: From/To [m], Geogrid Type #, T-allowable [kN/m]
## Problems & Fixes
- pdftoppm not installed, so the Read tool could not open PDFs directly → extracted text with PyMuPDF, supplemented with OCR (marker-pdf)
- No official manual available online (MSEW 3.0 support ended in 2020) → used the local sample PDF
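The blockquote insertion format used in these sessions (`> **[Screen name]**` followed by `> - \`parameter\`: sample value` lines) can be generated with a tiny helper — a sketch; the function name and signature are hypothetical:

```python
def format_param_block(screen_name, params):
    """Render a screen's parameters in the blockquote format inserted
    under each image reference:
        > **[Screen name]**
        > - `param`: sample value
    params: list of (name, value) pairs."""
    lines = [f"> **[{screen_name}]**"]
    for name, value in params:
        lines.append(f"> - `{name}`: {value}")
    return "\n".join(lines)

# Sample values below are illustrative, not taken from the manual.
block = format_param_block("Wall Embedment", [
    ("Type in front of wall", "Horizontal"),
    ("Embedded depth E [m]", "0.5"),
])
print(block)
```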


@@ -0,0 +1,18 @@
**Issue**: #1
**Time spent**: 15 min
**Context usage**: input 28k / output 4k tokens
## Work Summary
Added image-analysis features (analyze_image_with_claude, insert_image_descriptions) to convert_with_cropped_images.py, then removed them again on request, so the script now handles only PDF→MD conversion plus image extraction.
Image analysis will be developed separately in a dedicated file.
## Changed Files
- `convert_with_cropped_images.py` : removed the two image-analysis functions (analyze_image_with_claude, insert_image_descriptions) and their call sites; removed the base64/dotenv imports.
## Key Decisions
- convert_with_cropped_images.py is responsible only for PDF→MD conversion and image-file extraction
- Image analysis (multimodal AI) will be implemented in a specialized script cloned from this file
- Rationale for the split: manual images need specialized prompts/logic, not generic analysis
## Problems & Fixes
None


@@ -0,0 +1,28 @@
**Issue**: #1
**Time spent**: 40 min
**Context usage**: input 35k / output 8k tokens
## Work Summary
1. Applied the common/.claude/hooks hooks to the project
2. Added two-column (multi-column) → one-column conversion to convert_with_cropped_images.py
3. Ran a conversion test on the sample PDF (MSWE3.0 Manual-06.pdf)
4. Diagnosed and fixed why the history hook was not firing
## Changed Files
- `.claude/settings.json` : new — registers the UserPromptSubmit/PostToolUse/Stop hooks
- `.claude/hooks/` : copied 4 hook files from common (session-context.sh, guard-history-fields.sh/.py, guard-history-reminder.sh)
- `.claude/hooks/session-context.sh` : added the history-writing instruction (delivered to Claude via stdout)
- `convert_with_cropped_images.py` : added `is_scanned_pdf()` and `reorder_text_by_columns()` — auto-detects scanned vs. text PDFs, then converts two columns to one
- `CLAUDE.md` : new — defines the history-writing rules and template
- `docs/history/.gitkeep` : new
## Key Decisions
- Scanned PDFs → marker-pdf's surya layout model detects and reorders two-column layouts automatically
- Text PDFs → PyMuPDF block coordinates: split into left/right columns at half the page width ±30 px, then concatenate the left column before the right
- Scanned-PDF heuristic: treat as scanned if 3 sampled pages contain fewer than 50 characters of text
- Hook interpreter: use `.venv/Scripts/python.exe` (the python/python3 commands point at a different Python environment)
## Problems & Fixes
- **Hook not firing**: no CLAUDE.md + no writing instruction in session-context.sh + guard-history-reminder.sh wrote to stderr, so Claude never saw it → fixed by adding the instruction to session-context.sh stdout and creating CLAUDE.md
- **ModuleNotFoundError(marker)**: python/python3 pointed at a Python without marker installed → fixed by invoking .venv/Scripts/python.exe directly
- **Sample PDF is one image-based page**: PyMuPDF found 0 text blocks → routed through the marker-pdf OCR path; converted successfully
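The text-PDF column rule described above — split blocks at half the page width with a ±30 px tolerance, then read the left column before the right — can be sketched on plain block tuples. This is a sketch under those assumptions (the real `reorder_text_by_columns()` in convert_with_cropped_images.py works on PyMuPDF blocks, and the exact threshold handling here is an assumption):

```python
def reorder_two_columns(blocks, page_width, tolerance=30):
    """blocks: (x0, y0, x1, y1, text) tuples, the shape returned by
    PyMuPDF's page.get_text("blocks"). A block whose left edge lies
    left of page_width/2 - tolerance is assigned to the left column;
    everything else goes right. Each column is sorted top-to-bottom
    (by y0), and the left column is emitted before the right."""
    mid = page_width / 2
    left = [b for b in blocks if b[0] < mid - tolerance]
    right = [b for b in blocks if b[0] >= mid - tolerance]
    left.sort(key=lambda b: b[1])
    right.sort(key=lambda b: b[1])
    return "\n".join(b[4] for b in left + right)

# Two-column page (Letter width 612 pt): A/B on the left, C/D on the right.
blocks = [
    (320, 10, 600, 30, "C"),
    (10, 10, 290, 30, "A"),
    (320, 40, 600, 60, "D"),
    (10, 40, 290, 60, "B"),
]
print(reorder_two_columns(blocks, page_width=612))  # A, B, C, D — one per line
```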


@@ -0,0 +1,19 @@
**Time spent**: 10 min
**Context usage**: input 18k / output 2k tokens
## Work Summary
Designed how to merge the 96 MD files into one and tested it on three pages (06–08).
To resolve image filename collisions, adopted a scheme that consolidates images into a single folder with a page-number prefix.
## Changed Files
- `merge_markdown.py` : full rewrite — creates the unified image folder, renames the files, rewrites the paths inside the MD, and supports a file_range parameter
## Key Decisions
- Image rename rule: `{stem}_images/_page_0_Figure_3.jpeg` → `images/p06_Figure_3.jpeg`
- Drop the `_page_0_` prefix and prepend the zero-padded page number
- The merged file is written inside `output/` → the relative `images/` paths stay valid
- A `file_range` parameter allows limiting the run for testing
## Problems & Fixes
- Problem: every MD is a single page, so the `_page_0_Figure_X` names collide across all 96 files
- Fix: when copying images into the single `images/` folder, add a `p{pagenum}_` prefix to guarantee unique names

docs/tutorial.md Normal file

@@ -0,0 +1,39 @@
# documan Usage
## Merging MD Files (merge_markdown.py)
Combines the per-page MD files converted from the PDF into a single file.
Images are also consolidated into the `output/images/` folder, and the paths inside the MD are updated automatically.
### Full merge
```bash
python merge_markdown.py
```
- Input: `output/MSWE3.0 Manual-01.md` ~ `output/MSWE3.0 Manual-96.md`
- Output: `output/merged_all.md`
- Images: consolidated under names like `output/images/p01_Figure_0.jpeg`
### Testing a subset of pages
Set `file_range` in the `__main__` block at the bottom of `merge_markdown.py`:
```python
merge_markdown_files(
    input_dir="output",
    output_file="merged_test.md",
    images_subdir="images",
    file_range=(6, 8),  # pages 06~08 only
)
```
### Image naming rule
| Original | Merged |
|------|---------|
| `MSWE3.0 Manual-06_images/_page_0_Figure_0.jpeg` | `images/p06_Figure_0.jpeg` |
| `MSWE3.0 Manual-15_images/_page_0_Picture_12.jpeg` | `images/p15_Picture_12.jpeg` |
- The `_page_0_` prefix is dropped
- The page number is prepended as `p{NN}_` to prevent filename collisions
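The rename rule in the table above boils down to two regex operations — a minimal sketch (the helper name is hypothetical; merge_markdown.py applies the same logic inline):

```python
import re

def unified_image_name(md_filename, image_filename):
    """'MSWE3.0 Manual-06.md' + '_page_0_Figure_0.jpeg' -> 'p06_Figure_0.jpeg'"""
    # Zero-padded page number captured from the trailing "-NN.md".
    m = re.search(r'-(\d+)\.md$', md_filename)
    page = m.group(1) if m else "000"
    # Strip the "_page_N_" prefix, then prepend "p{NN}_".
    clean = re.sub(r'^_page_\d+_', '', image_filename)
    return f"p{page}_{clean}"

print(unified_image_name("MSWE3.0 Manual-06.md", "_page_0_Figure_0.jpeg"))
# p06_Figure_0.jpeg
```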

extract_images.py Normal file

@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Extract embedded images from PDF files
"""
import os
import glob
from pathlib import Path
def extract_images_pypdfium2(pdf_path: str, output_dir: str = "output"):
"""
Extract images using pypdfium2
"""
try:
import pypdfium2 as pdfium
from PIL import Image
import io
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
images_dir = os.path.join(output_dir, f"{base_name}_extracted_images")
os.makedirs(images_dir, exist_ok=True)
print(f"\nExtracting images from {pdf_file.name}...")
pdf = pdfium.PdfDocument(pdf_path)
image_count = 0
for page_num in range(len(pdf)):
page = pdf[page_num]
# Get images from page
for obj_index, obj in enumerate(page.get_objects()):
                # Note: in newer pypdfium2 releases this constant may live in
                # pypdfium2.raw; if it is missing here, the AttributeError is
                # caught by the outer except and we fall back to PyMuPDF.
                if obj.type == pdfium.FPDF_PAGEOBJ_IMAGE:
try:
# Extract image
bitmap = obj.get_bitmap()
pil_image = bitmap.to_pil()
# Skip very small images (likely noise or artifacts)
if pil_image.width < 50 or pil_image.height < 50:
continue
image_count += 1
img_filename = f"page_{page_num + 1}_img_{obj_index + 1}.png"
img_path = os.path.join(images_dir, img_filename)
pil_image.save(img_path)
print(f" Saved: {img_filename} ({pil_image.width}x{pil_image.height})")
except Exception as e:
print(f" Warning: Could not extract image {obj_index} from page {page_num + 1}: {e}")
pdf.close()
if image_count > 0:
print(f" OK Total {image_count} images extracted to: {images_dir}")
return True
else:
print(f" INFO: No images found in {pdf_file.name}")
return True
except Exception as e:
print(f" ERROR: Failed with pypdfium2: {e}")
return False
def extract_images_pymupdf(pdf_path: str, output_dir: str = "output"):
"""
Extract images using PyMuPDF (fitz) - fallback method
"""
try:
import fitz # PyMuPDF
pdf_file = Path(pdf_path)
base_name = pdf_file.stem
images_dir = os.path.join(output_dir, f"{base_name}_extracted_images")
os.makedirs(images_dir, exist_ok=True)
print(f"\nExtracting images from {pdf_file.name} using PyMuPDF...")
doc = fitz.open(pdf_path)
image_count = 0
for page_num in range(len(doc)):
page = doc[page_num]
image_list = page.get_images(full=True)
for img_index, img_info in enumerate(image_list):
xref = img_info[0]
try:
# Extract image
base_image = doc.extract_image(xref)
image_bytes = base_image["image"]
image_ext = base_image["ext"]
# Skip very small images
if len(image_bytes) < 1000: # Less than 1KB
continue
image_count += 1
img_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
img_path = os.path.join(images_dir, img_filename)
with open(img_path, "wb") as f:
f.write(image_bytes)
print(f" Saved: {img_filename} ({len(image_bytes)} bytes)")
except Exception as e:
print(f" Warning: Could not extract image {img_index} from page {page_num + 1}: {e}")
doc.close()
if image_count > 0:
print(f" OK Total {image_count} images extracted to: {images_dir}")
return True
else:
print(f" INFO: No images found in {pdf_file.name}")
return True
except ImportError:
print(" ERROR: PyMuPDF not installed. Install with: pip install PyMuPDF")
return False
except Exception as e:
print(f" ERROR: Failed with PyMuPDF: {e}")
return False
def extract_images_from_pdf(pdf_path: str, output_dir: str = "output"):
"""
Try to extract images using available methods
"""
# Try pypdfium2 first (already installed)
success = extract_images_pypdfium2(pdf_path, output_dir)
if not success:
print("\nTrying PyMuPDF as fallback...")
success = extract_images_pymupdf(pdf_path, output_dir)
return success
def extract_all_images(input_dir: str = "input", output_dir: str = "output"):
"""
Extract images from all PDF files in the input directory
"""
pdf_pattern = os.path.join(input_dir, "*.pdf")
pdf_files = sorted(glob.glob(pdf_pattern))
if not pdf_files:
print(f"No PDF files found in {input_dir}")
return
print(f"Found {len(pdf_files)} PDF files")
print("=" * 60)
successful = 0
failed = 0
for pdf_file in pdf_files:
if extract_images_from_pdf(pdf_file, output_dir):
successful += 1
else:
failed += 1
print("\n" + "=" * 60)
print(f"Image extraction complete!")
print(f" Successful: {successful}")
print(f" Failed: {failed}")
print(f" Total: {len(pdf_files)}")
if __name__ == "__main__":
extract_all_images()

main.py Normal file

@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
docuConverter — 문서 → Markdown 변환 도구 모음
지원 포맷:
PDF → Markdown (marker-pdf 기반, 이미지 유/무 선택)
EPUB → Markdown (ebooklib + BeautifulSoup 기반)
시나리오:
1. PDF 단일 변환 (이미지 포함, 고품질)
2. PDF 단일 변환 (텍스트 전용, 빠름)
3. PDF 배치 변환 (이미지 포함, 순차)
4. PDF 배치 변환 (텍스트 전용, 순차, 빠름)
5. PDF 배치 변환 (병렬 처리, 멀티코어)
6. EPUB 단일 변환
7. EPUB 배치 변환
8. 이미지만 추출 (PDF → 이미지 파일)
9. Markdown 병합 (output/ 폴더의 .md 파일들을 하나로)
10. 이미지 경로 업데이트 (Markdown 내 이미지 링크 재연결)
"""
import os
import sys
import glob
from pathlib import Path
# ─── Scenario functions ──────────────────────────────────────────────────────
def scenario_pdf_single_with_images():
    """Single PDF conversion — with images (high quality, slow)"""
    from convert_with_cropped_images import convert_pdf_with_cropped_images
    pdf_path = input("Path of the PDF to convert: ").strip()
    if not pdf_path:
        pdf_files = sorted(glob.glob("input/*.pdf"))
        if not pdf_files:
            print("ERROR: no PDF files in the input/ folder.")
            return
        pdf_path = pdf_files[0]
        print(f" → auto-selected: {pdf_path}")
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    convert_pdf_with_cropped_images(pdf_path, output_dir)
def scenario_pdf_single_fast():
    """Single PDF conversion — text only (fast)"""
    from convert_pdfs_fast import convert_pdf_to_markdown_fast
    pdf_path = input("Path of the PDF to convert: ").strip()
    if not pdf_path:
        pdf_files = sorted(glob.glob("input/*.pdf"))
        if not pdf_files:
            print("ERROR: no PDF files in the input/ folder.")
            return
        pdf_path = pdf_files[0]
        print(f" → auto-selected: {pdf_path}")
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    convert_pdf_to_markdown_fast(pdf_path, output_dir)
def scenario_pdf_batch_with_images():
    """Batch PDF conversion — with images (sequential, input/ → output/)"""
    from convert_with_cropped_images import convert_all_pdfs
    input_dir = input("Input folder [default: input]: ").strip() or "input"
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    convert_all_pdfs(input_dir, output_dir)
def scenario_pdf_batch_fast():
    """Batch PDF conversion — text only (sequential, fast)"""
    from convert_pdfs_fast import convert_all_pdfs_fast
    input_dir = input("Input folder [default: input]: ").strip() or "input"
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    convert_all_pdfs_fast(input_dir, output_dir)
def scenario_pdf_batch_parallel():
    """Batch PDF conversion — parallel (multi-core)"""
    from convert_pdfs_parallel import convert_all_pdfs_parallel
    import multiprocessing
    input_dir = input("Input folder [default: input]: ").strip() or "input"
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    cpu_count = multiprocessing.cpu_count()
    workers_input = input(f"Number of parallel workers [default: 2, CPU: {cpu_count}]: ").strip()
    max_workers = int(workers_input) if workers_input.isdigit() else 2
    convert_all_pdfs_parallel(input_dir, output_dir, max_workers)
def scenario_epub_single():
    """Single EPUB conversion → Markdown"""
    from convert_epub import convert_epub_to_markdown
    epub_path = input("Path of the EPUB to convert: ").strip()
    if not epub_path:
        epub_files = sorted(glob.glob("input/*.epub"))
        if not epub_files:
            print("ERROR: no EPUB files in the input/ folder.")
            return
        epub_path = epub_files[0]
        print(f" → auto-selected: {epub_path}")
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    convert_epub_to_markdown(epub_path, output_dir)
def scenario_epub_batch():
    """Batch EPUB conversion — every .epub file in input/"""
    from convert_epub import convert_epub_to_markdown
    input_dir = input("Input folder [default: input]: ").strip() or "input"
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    epub_files = sorted(glob.glob(os.path.join(input_dir, "*.epub")))
    if not epub_files:
        print(f"ERROR: no EPUB files in the {input_dir}/ folder.")
        return
    print(f"Found {len(epub_files)} EPUB file(s)")
    print("=" * 60)
    successful = 0
    failed = 0
    for i, epub_file in enumerate(epub_files, 1):
        print(f"\n[{i}/{len(epub_files)}] {Path(epub_file).name}")
        try:
            convert_epub_to_markdown(epub_file, output_dir)
            successful += 1
        except Exception as e:
            print(f" ERROR: {e}")
            failed += 1
    print("\n" + "=" * 60)
    print(f"Conversion complete! Successful: {successful}, Failed: {failed}")
def scenario_extract_images():
    """Extract images from PDFs only (no Markdown conversion)"""
    from extract_images import extract_all_images, extract_images_from_pdf
    mode = input("Mode — [1] single file [2] batch (input/ folder): ").strip()
    output_dir = input("Output folder [default: output]: ").strip() or "output"
    if mode == "1":
        pdf_path = input("PDF path: ").strip()
        if not pdf_path:
            print("ERROR: empty path.")
            return
        extract_images_from_pdf(pdf_path, output_dir)
    else:
        input_dir = input("Input folder [default: input]: ").strip() or "input"
        extract_all_images(input_dir, output_dir)
def scenario_merge_markdown():
    """Merge the .md files in output/ into a single file"""
    from merge_markdown import merge_markdown_files
    input_dir = input("Markdown folder to merge [default: output]: ").strip() or "output"
    output_file = input("Merged output filename [default: merged_all.md]: ").strip() or "merged_all.md"
    # merge_markdown_files joins files with a horizontal-rule separator itself;
    # its third parameter is images_subdir, so no separator argument is passed.
    merge_markdown_files(input_dir, output_file)
def scenario_update_image_paths():
    """Update image paths in Markdown to the actual extracted image paths"""
    from update_image_paths import update_all_markdown_files
    output_dir = input("Markdown folder [default: output]: ").strip() or "output"
    update_all_markdown_files(output_dir)
# ─── Menu ────────────────────────────────────────────────────────────────────
SCENARIOS = [
    ("Single PDF conversion (with images, high quality)", scenario_pdf_single_with_images),
    ("Single PDF conversion (text only, fast)", scenario_pdf_single_fast),
    ("Batch PDF conversion (with images, sequential)", scenario_pdf_batch_with_images),
    ("Batch PDF conversion (text only, sequential, fast)", scenario_pdf_batch_fast),
    ("Batch PDF conversion (parallel, multi-core)", scenario_pdf_batch_parallel),
    ("Single EPUB conversion → Markdown", scenario_epub_single),
    ("Batch EPUB conversion (entire input/ folder)", scenario_epub_batch),
    ("Extract images only (PDF → image files)", scenario_extract_images),
    ("Merge Markdown files (several .md → one)", scenario_merge_markdown),
    ("Update image paths (fix Markdown links)", scenario_update_image_paths),
]
def print_menu():
    print("\n" + "=" * 60)
    print(" docuConverter — document → Markdown conversion tool")
    print("=" * 60)
    for i, (label, _) in enumerate(SCENARIOS, 1):
        print(f" {i:2}. {label}")
    print("  0. Exit")
    print("=" * 60)
def run_interactive():
    """Run the interactive menu"""
    while True:
        print_menu()
        choice = input("Select a scenario number: ").strip()
        if choice == "0":
            print("Exiting.")
            break
        if not choice.isdigit() or not (1 <= int(choice) <= len(SCENARIOS)):
            print("Invalid input. Please choose again.")
            continue
        idx = int(choice) - 1
        label, fn = SCENARIOS[idx]
        print(f"\n{label}")
        print("-" * 60)
        try:
            fn()
        except KeyboardInterrupt:
            print("\nInterrupted.")
        except Exception as e:
            print(f"\nERROR: {e}")
            import traceback
            traceback.print_exc()
        input("\nPress [Enter] to return to the menu...")
def run_cli(args):
    """Direct CLI mode (non-interactive)
    Usage:
        python main.py 1 path/to/file.pdf output/
        python main.py 4 input/ output/
        python main.py 6 path/to/book.epub output/
    """
    if not args:
        run_interactive()
        return
    scenario_num = args[0]
    if not scenario_num.isdigit() or not (1 <= int(scenario_num) <= len(SCENARIOS)):
        print(f"ERROR: the scenario number must be between 1 and {len(SCENARIOS)}.")
        sys.exit(1)
    idx = int(scenario_num) - 1
    label, fn = SCENARIOS[idx]
    print(f"{label}")
    # Rather than emulating stdin to drive the input() prompts,
    # dispatch directly to the underlying function for each scenario.
    extra = args[1:]
    if idx == 0:  # single PDF, with images
        from convert_with_cropped_images import convert_pdf_with_cropped_images
        pdf_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.pdf"))[0]
        out = extra[1] if len(extra) > 1 else "output"
        convert_pdf_with_cropped_images(pdf_path, out)
    elif idx == 1:  # single PDF, fast
        from convert_pdfs_fast import convert_pdf_to_markdown_fast
        pdf_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.pdf"))[0]
        out = extra[1] if len(extra) > 1 else "output"
        convert_pdf_to_markdown_fast(pdf_path, out)
    elif idx == 2:  # batch PDF, with images
        from convert_with_cropped_images import convert_all_pdfs
        inp = extra[0] if len(extra) > 0 else "input"
        out = extra[1] if len(extra) > 1 else "output"
        convert_all_pdfs(inp, out)
    elif idx == 3:  # batch PDF, fast
        from convert_pdfs_fast import convert_all_pdfs_fast
        inp = extra[0] if len(extra) > 0 else "input"
        out = extra[1] if len(extra) > 1 else "output"
        convert_all_pdfs_fast(inp, out)
    elif idx == 4:  # batch PDF, parallel
        from convert_pdfs_parallel import convert_all_pdfs_parallel
        inp = extra[0] if len(extra) > 0 else "input"
        out = extra[1] if len(extra) > 1 else "output"
        workers = int(extra[2]) if len(extra) > 2 else 2
        convert_all_pdfs_parallel(inp, out, workers)
    elif idx == 5:  # single EPUB
        from convert_epub import convert_epub_to_markdown
        epub_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.epub"))[0]
        out = extra[1] if len(extra) > 1 else "output"
        convert_epub_to_markdown(epub_path, out)
    elif idx == 6:  # batch EPUB
        from convert_epub import convert_epub_to_markdown
        inp = extra[0] if len(extra) > 0 else "input"
        out = extra[1] if len(extra) > 1 else "output"
        for ep in sorted(glob.glob(os.path.join(inp, "*.epub"))):
            print(f"\n{Path(ep).name}")
            convert_epub_to_markdown(ep, out)
    elif idx == 7:  # extract images
        from extract_images import extract_all_images
        inp = extra[0] if len(extra) > 0 else "input"
        out = extra[1] if len(extra) > 1 else "output"
        extract_all_images(inp, out)
    elif idx == 8:  # merge Markdown
        from merge_markdown import merge_markdown_files
        inp = extra[0] if len(extra) > 0 else "output"
        out_file = extra[1] if len(extra) > 1 else "merged_all.md"
        merge_markdown_files(inp, out_file)
    elif idx == 9:  # update image paths
        from update_image_paths import update_all_markdown_files
        out = extra[0] if len(extra) > 0 else "output"
        update_all_markdown_files(out)
if __name__ == "__main__":
    # Use the docuConverter folder as cwd so the input/output paths are
    # consistent no matter where the script is launched from.
    script_dir = Path(__file__).parent
    os.chdir(script_dir)
    run_cli(sys.argv[1:])

merge_markdown.py Normal file

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
"""
Merge multiple Markdown files into a single file,
consolidating all images into a single images/ folder with unique names.
Image rename rule:
{stem}_images/_page_0_Figure_3.jpeg -> images/p06_Figure_3.jpeg
{stem}_images/_page_0_Picture_12.jpeg -> images/p06_Picture_12.jpeg
(page number taken from the trailing "-NN" in the MD filename)
"""
import os
import re
import glob
import shutil
from pathlib import Path
def merge_markdown_files(
input_dir: str = "output",
output_file: str = "merged_all.md",
images_subdir: str = "images",
file_range: tuple = None, # e.g. (6, 8) to process only pages 06~08
):
md_pattern = os.path.join(input_dir, "*.md")
all_md_files = sorted(glob.glob(md_pattern))
# Only include files matching Manual-NN pattern (skip merged outputs)
all_md_files = [f for f in all_md_files if re.search(r'-\d+\.md$', f)]
# Filter by page number range if given
if file_range:
start, end = file_range
md_files = []
for f in all_md_files:
m = re.search(r'-(\d+)\.md$', f)
if m and start <= int(m.group(1)) <= end:
md_files.append(f)
else:
md_files = all_md_files
if not md_files:
print(f"No markdown files found in {input_dir}")
return
print(f"Files to merge: {len(md_files)}")
for f in md_files:
print(f" {Path(f).name}")
print("=" * 60)
# Create unified images directory
unified_images_path = os.path.join(input_dir, images_subdir)
os.makedirs(unified_images_path, exist_ok=True)
merged_content = []
for md_file in md_files:
file_path = Path(md_file)
# Extract zero-padded page number from filename (e.g. "06" from "Manual-06.md")
m = re.search(r'-(\d+)\.md$', str(file_path))
page_num = m.group(1) if m else "000"
print(f"Processing [{page_num}] {file_path.name} ...")
with open(md_file, "r", encoding="utf-8") as f:
content = f.read()
# Replace each image reference
def replace_image(match):
alt = match.group(1)
old_path = match.group(2)
# Decode %20 → space for filesystem access
old_path_decoded = old_path.replace("%20", " ")
# Filename only: _page_0_Figure_3.jpeg
img_filename = Path(old_path_decoded).name
# Strip leading _page_N_ to get: Figure_3.jpeg or Picture_12.jpeg
clean_name = re.sub(r'^_page_\d+_', '', img_filename)
            # New unique name, e.g. p06_Figure_3.jpeg (page number as captured)
new_name = f"p{page_num}_{clean_name}"
# Copy image to unified folder
src = os.path.join(input_dir, old_path_decoded)
dst = os.path.join(unified_images_path, new_name)
if os.path.exists(src):
shutil.copy2(src, dst)
else:
print(f" WARNING: image not found: {src}")
return f"![{alt}]({images_subdir}/{new_name})"
new_content = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', replace_image, content)
merged_content.append(new_content)
if not merged_content:
print("No content to merge")
return
final_content = "\n\n---\n\n".join(merged_content)
output_path = os.path.join(input_dir, output_file)
with open(output_path, "w", encoding="utf-8") as f:
f.write(final_content)
print("\n" + "=" * 60)
print(f"SUCCESS: {output_path}")
print(f" Files merged : {len(merged_content)}")
print(f" Total chars : {len(final_content):,}")
if __name__ == "__main__":
merge_markdown_files(
input_dir="output",
output_file="merged_all.md",
images_subdir="images",
)

requirements.txt Normal file

@@ -0,0 +1,96 @@
annotated-types==0.7.0
anthropic==0.46.0
anyio==4.12.1
beautifulsoup4==4.14.3
brotli==1.2.0
certifi==2026.1.4
cffi==2.0.0
cfgv==3.5.0
charset-normalizer==3.4.4
click==8.3.1
cobble==0.1.4
colorama==0.4.6
cssselect2==0.8.0
distlib==0.4.0
distro==1.9.0
EbookLib==0.18
einops==0.8.1
et_xmlfile==2.0.0
filelock==3.20.3
filetype==1.2.0
fonttools==4.61.1
fsspec==2026.1.0
ftfy==6.3.1
google-auth==2.47.0
google-genai==1.59.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.36.0
identify==2.6.16
idna==3.11
Jinja2==3.1.6
jiter==0.12.0
joblib==1.5.3
lxml==6.0.2
mammoth==1.11.0
markdown2==2.5.4
markdownify==1.2.2
marker-pdf==1.10.1
MarkupSafe==3.0.3
mpmath==1.3.0
networkx==3.6.1
nodeenv==1.10.0
numpy==2.4.1
openai==1.109.1
opencv-python-headless==4.11.0.86
openpyxl==3.1.5
packaging==25.0
pdftext==0.6.3
pillow==10.4.0
platformdirs==4.5.1
pre_commit==4.5.1
psutil==7.2.1
pyasn1==0.6.2
pyasn1_modules==0.4.2
pycparser==2.23
pydantic==2.12.5
pydantic-settings==2.12.0
pydantic_core==2.41.5
pydyf==0.12.1
PyMuPDF==1.26.7
pypdfium2==4.30.0
pyphen==0.17.2
python-dotenv==1.2.1
python-pptx==1.0.2
PyYAML==6.0.3
RapidFuzz==3.14.3
regex==2024.11.6
requests==2.32.5
rsa==4.9.1
safetensors==0.7.0
scikit-learn==1.8.0
scipy==1.17.0
six==1.17.0
sniffio==1.3.1
soupsieve==2.8.1
surya-ocr==0.17.0
sympy==1.14.0
tenacity==9.1.2
threadpoolctl==3.6.0
tinycss2==1.5.1
tinyhtml5==2.0.0
tokenizers==0.22.2
torch==2.9.1
tqdm==4.67.1
transformers==4.57.6
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
virtualenv==20.36.1
wcwidth==0.2.14
weasyprint==63.1
webencodings==0.5.1
websockets==15.0.1
xlsxwriter==3.2.9
zopfli==0.4.0

update_image_paths.py Normal file

@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""
Update image paths in markdown files to point to extracted images
"""
import os
import re
import glob
from pathlib import Path
def update_markdown_image_paths(md_path: str, output_dir: str = "output"):
    """
    Update image paths in a markdown file to point to extracted images
    """
    md_file = Path(md_path)
    base_name = md_file.stem

    # Path to extracted images folder
    extracted_images_dir = f"{base_name}_extracted_images"

    # Check if extracted images folder exists
    extracted_images_path = os.path.join(output_dir, extracted_images_dir)
    if not os.path.exists(extracted_images_path):
        print(f"No extracted images folder found: {extracted_images_path}")
        return False

    # Read markdown content
    with open(md_path, 'r', encoding='utf-8') as f:
        content = f.read()

    original_content = content

    # Pattern to match image references like ![](_page_1_Figure_1.jpeg)
    # and replace them with the corresponding extracted images
    def replace_image_path(match):
        old_path = match.group(1)
        # Extract page number from old path (e.g., _page_1_Figure_1.jpeg -> page 1)
        page_match = re.search(r'_page_(\d+)_', old_path)
        if page_match:
            page_num = page_match.group(1)
            # Map to extracted image: page_1_img_1.png
            new_path = f"{extracted_images_dir}/page_{page_num}_img_1.png"
            return f'![]({new_path})'
        return match.group(0)  # Return original if no match

    # Replace all image paths
    content = re.sub(r'\!\[\]\(([^)]+\.jpeg)\)', replace_image_path, content)

    if content == original_content:
        print(f"No changes needed for {md_file.name}")
        return True

    # Save updated markdown
    output_path = md_path.replace('.md', '_updated.md')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Updated markdown saved to: {output_path}")

    # Count replacements
    old_count = len(re.findall(r'\!\[\]\([^)]+\.jpeg\)', original_content))
    new_count = len(re.findall(r'\!\[\]\([^)]+\.png\)', content))
    print(f"  Replaced {new_count} image paths (out of {old_count} references)")
    return True


def update_all_markdown_files(output_dir: str = "output"):
    """
    Update image paths in all markdown files
    """
    md_pattern = os.path.join(output_dir, "*.md")
    md_files = [f for f in glob.glob(md_pattern) if not f.endswith('_updated.md')]
    if not md_files:
        print(f"No markdown files found in {output_dir}")
        return

    print(f"Found {len(md_files)} markdown files")
    print("=" * 60)
    for md_file in md_files:
        update_markdown_image_paths(md_file, output_dir)
        print()
    print("=" * 60)
    print("Done!")


if __name__ == "__main__":
    update_all_markdown_files()
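The heart of this script is the regex-driven path rewrite inside `update_markdown_image_paths`. As a standalone sketch, the mapping can be exercised on a string without touching the filesystem — `rewrite_image_refs` below is a hypothetical helper mirroring that logic, assuming the same `_page_N_*.jpeg` → `page_N_img_1.png` naming convention:

```python
import re


def rewrite_image_refs(content: str, images_dir: str) -> str:
    """Rewrite ![](..._page_N_....jpeg) references to point at
    extracted PNGs under images_dir. Hypothetical helper mirroring
    update_image_paths.py; like the script, it assumes one image
    per page (always img_1)."""
    def repl(match):
        old_path = match.group(1)
        page_match = re.search(r'_page_(\d+)_', old_path)
        if page_match:
            return f'![]({images_dir}/page_{page_match.group(1)}_img_1.png)'
        return match.group(0)  # leave unrecognized references untouched

    return re.sub(r'!\[\]\(([^)]+\.jpeg)\)', repl, content)


print(rewrite_image_refs('![](_page_3_Figure_1.jpeg)', 'doc_extracted_images'))
# → ![](doc_extracted_images/page_3_img_1.png)
```

Note that both the script and this sketch hardcode `img_1`, so a page with several figures would have every reference collapsed onto its first extracted image.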