feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)

- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00
parent 892e4ecafb
commit 8d4339302e
24 changed files with 2335 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,89 @@
+# Python virtual environment
+.venv/
+venv/
+env/
+ENV/
+
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.pytest_cache/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+Desktop.ini
+
+# Environment variables
+.env
+.env.*
+!.env.example
+
+# Project specific
+backup_epub/
+input/
+output/
+back/
+
+# Large binary files
+*.epub
+*.pdf
+*.png
+*.jpg
+*.jpeg
+*.gif
+*.bmp
+*.tiff
+*.zip
+*.tar
+*.tar.gz
+*.rar
+
+!README.md
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1,47 @@
+# CLAUDE.md — documan 프로젝트 규칙
+
+## 작업 히스토리 기록 규칙
+
+모든 작업 세션이 끝나면 반드시 히스토리 파일을 작성해야 한다.
+
+### 기록 위치
+```
+docs/history/YYYY-MM-DD_{작업명}.md
+```
+
+### 필수 포함 항목 (누락 시 저장 차단됨)
+```markdown
+**소요 시간**: X분
+**Context 사용량**: input Xk / output Xk tokens
+```
+
+### 선택 포함 항목
+```markdown
+**이슈**: #N
+```
+- 작업이 특정 Gitea 이슈와 연관된 경우 이슈 번호를 기재
+- 나중에 이슈별 토큰 사용량 집계에 활용됨
+
+### 히스토리 파일 작성 기준
+- 사용자 요청이 완료된 직후, 응답 마지막 단계에서 작성
+- 작업명은 핵심 내용을 한국어로 간결하게 (예: `2026-03-31_PDF2단변환기능추가.md`)
+- 변경한 파일 목록, 주요 결정사항, 발생한 문제와 해결 방법 포함
+
+### 히스토리 파일 템플릿
+```markdown
+**이슈**: #N
+**소요 시간**: X분
+**Context 사용량**: input Xk / output Xk tokens
+
+## 작업 내용
+{작업 요약}
+
+## 변경 파일
+- `파일경로` : 변경 내용
+
+## 주요 결정사항
+{설계 판단, 선택한 이유}
+
+## 문제 및 해결
+{발생한 문제와 해결 방법}
+```
--- a/convert_epub.py
+++ b/convert_epub.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""
+EPUB to Markdown converter using ebooklib and BeautifulSoup
+"""
+
+import os
+import json
+import re
+from pathlib import Path
+import ebooklib
+from ebooklib import epub
+from bs4 import BeautifulSoup
+
+
+def html_to_markdown(soup):
+    """Convert BeautifulSoup HTML to Markdown format"""
+
+    def process_element(element):
+        if isinstance(element, str):
+            text = element.strip()
+            if text:
+                return text
+            return ""
+
+        tag = element.name
+
+        # Headers
+        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            level_num = int(tag[1])
+            text = element.get_text().strip()
+            return '\n' + '#' * level_num + ' ' + text + '\n'
+
+        # Paragraphs
+        elif tag == 'p':
+            text = ''.join(process_element(child) for child in element.children)
+            return '\n' + text.strip() + '\n'
+
+        # Line breaks
+        elif tag == 'br':
+            return '\n'
+
+        # Bold
+        elif tag in ['strong', 'b']:
+            text = ''.join(process_element(child) for child in element.children)
+            return '**' + text.strip() + '**'
+
+        # Italic
+        elif tag in ['em', 'i']:
+            text = ''.join(process_element(child) for child in element.children)
+            return '*' + text.strip() + '*'
+
+        # Links
+        elif tag == 'a':
+            text = ''.join(process_element(child) for child in element.children)
+            href = element.get('href', '')
+            if href:
+                return f'[{text.strip()}]({href})'
+            return text.strip()
+
+        # Images
+        elif tag == 'img':
+            src = element.get('src', '')
+            alt = element.get('alt', '')
+            return f'![{alt}]({src})'
+
+        # Lists
+        elif tag == 'ul':
+            items = []
+            for li in element.find_all('li', recursive=False):
+                text = ''.join(process_element(child) for child in li.children)
+                items.append('- ' + text.strip())
+            return '\n' + '\n'.join(items) + '\n'
+
+        elif tag == 'ol':
+            items = []
+            for i, li in enumerate(element.find_all('li', recursive=False), 1):
+                text = ''.join(process_element(child) for child in li.children)
+                items.append(f'{i}. ' + text.strip())
+            return '\n' + '\n'.join(items) + '\n'
+
+        # Blockquote
+        elif tag == 'blockquote':
+            text = ''.join(process_element(child) for child in element.children)
+            lines = text.strip().split('\n')
+            return '\n' + '\n'.join('> ' + line for line in lines) + '\n'
+
+        # Code
+        elif tag == 'code':
+            text = element.get_text()
+            return '`' + text + '`'
+
+        elif tag == 'pre':
+            text = element.get_text()
+            return '\n```\n' + text + '\n```\n'
+
+        # Div and span - just process children
+        elif tag in ['div', 'span', 'section', 'article']:
+            return ''.join(process_element(child) for child in element.children)
+
+        # Default - process children
+        else:
+            return ''.join(process_element(child) for child in element.children)
+
+    # Process body or entire soup
+    body = soup.find('body') if soup.find('body') else soup
+    markdown = process_element(body)
+
+    # Clean up multiple newlines
+    markdown = re.sub(r'\n{3,}', '\n\n', markdown)
+
+    return markdown.strip()
+
+
+def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
+    """
+    Convert EPUB file to Markdown
+
+    Args:
+        epub_path: Path to the EPUB file
+        output_dir: Directory to save the output (default: "output")
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    epub_file = Path(epub_path)
+    base_name = epub_file.stem
+
+    print(f"Converting {epub_path} to Markdown...")
+
+    # Read the EPUB file
+    book = epub.read_epub(epub_path)
+
+    # Extract all text content
+    chapters = []
+    images = {}
+    image_counter = 0
+
+    for item in book.get_items():
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            # Get HTML content
+            html_content = item.get_content().decode('utf-8')
+
+            # Parse with BeautifulSoup
+            soup = BeautifulSoup(html_content, 'html.parser')
+
+            # Convert to markdown-like format
+            markdown_content = html_to_markdown(soup)
+
+            # Clean up the markdown
+            markdown_content = markdown_content.strip()
+
+            if markdown_content:
+                chapters.append(markdown_content)
+
+        elif item.get_type() == ebooklib.ITEM_IMAGE:
+            # Save image
+            image_counter += 1
+            img_name = item.get_name().split('/')[-1]
+            if not img_name:
+                img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"
+            images[img_name] = item.get_content()
+
+    # Combine all chapters
+    full_markdown = "\n\n---\n\n".join(chapters)
+
+    # Save as markdown
+    output_path = os.path.join(output_dir, f"{base_name}.md")
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(full_markdown)
+
+    print(f"OK Conversion complete!")
+    print(f"OK Output saved to: {output_path}")
+    print(f"OK Total chapters: {len(chapters)}")
+
+    # Save images if any
+    if images:
+        images_dir = os.path.join(output_dir, f"{base_name}_images")
+        os.makedirs(images_dir, exist_ok=True)
+        for img_name, img_data in images.items():
+            img_path = os.path.join(images_dir, img_name)
+            with open(img_path, "wb") as f:
+                f.write(img_data)
+        print(f"OK {len(images)} images saved to: {images_dir}")
+
+    # Save metadata if available
+    metadata = {
+        'title': book.get_metadata('DC', 'title'),
+        'creator': book.get_metadata('DC', 'creator'),
+        'language': book.get_metadata('DC', 'language'),
+        'publisher': book.get_metadata('DC', 'publisher'),
+        'description': book.get_metadata('DC', 'description'),
+    }
+
+    if any(metadata.values()):
+        metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+        with open(metadata_path, "w", encoding="utf-8") as f:
+            json.dump(metadata, f, indent=2, ensure_ascii=False)
+        print(f"OK Metadata saved to: {metadata_path}")
+
+
+if __name__ == "__main__":
+    # Convert the EPUB file in the input directory
+    epub_path = "input/the-art-of-spending-money.epub"
+    convert_epub_to_markdown(epub_path)
--- a/convert_pdfs.py
+++ b/convert_pdfs.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""
+Batch PDF to Markdown converter using marker-pdf library
+"""
+
+import os
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+
+
+def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
+    """
+    Convert PDF file to Markdown
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save the output (default: "output")
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name} to Markdown...")
+
+    try:
+        # Initialize the converter with model dictionary
+        converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+        )
+
+        # Convert the PDF file
+        rendered = converter(pdf_path)
+
+        # Extract text and images from rendered output
+        text, metadata, images = text_from_rendered(rendered)
+
+        # Save as markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        print(f"  OK Output saved to: {output_path}")
+
+        # Save images if any
+        if images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            os.makedirs(images_dir, exist_ok=True)
+            for img_name, img_data in images.items():
+                img_path = os.path.join(images_dir, img_name)
+                with open(img_path, "wb") as f:
+                    f.write(img_data)
+            print(f"  OK {len(images)} images saved to: {images_dir}")
+
+        # Save metadata if available
+        if metadata:
+            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+            import json
+            with open(metadata_path, "w", encoding="utf-8") as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+            print(f"  OK Metadata saved to: {metadata_path}")
+
+        return True
+
+    except Exception as e:
+        print(f"  ERROR: Failed to convert {pdf_file.name}: {e}")
+        return False
+
+
+def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
+    """
+    Convert all PDF files in the input directory to Markdown
+
+    Args:
+        input_dir: Directory containing PDF files
+        output_dir: Directory to save the output
+    """
+    # Find all PDF files
+    pdf_pattern = os.path.join(input_dir, "*.pdf")
+    pdf_files = sorted(glob.glob(pdf_pattern))
+
+    if not pdf_files:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files to convert")
+    print("=" * 60)
+
+    successful = 0
+    failed = 0
+
+    for pdf_file in pdf_files:
+        if convert_pdf_to_markdown(pdf_file, output_dir):
+            successful += 1
+        else:
+            failed += 1
+
+    print("\n" + "=" * 60)
+    print(f"Conversion complete!")
+    print(f"  Successful: {successful}")
+    print(f"  Failed: {failed}")
+    print(f"  Total: {len(pdf_files)}")
+
+
+if __name__ == "__main__":
+    # Script entry point: batch-convert with the default "input" -> "output" directories
+    convert_all_pdfs()
--- a/convert_pdfs_fast.py
+++ b/convert_pdfs_fast.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""
+Fast PDF to Markdown converter - optimized for text-heavy documents
+"""
+
+import argparse
+import os
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from marker.config.parser import ConfigParser
+
+
+def convert_pdf_to_markdown_fast(pdf_path: str, output_dir: str = "output", languages: str = None):
+    """
+    Convert PDF file to Markdown with speed optimizations for text-heavy documents
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save the output (default: "output")
+        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name} to Markdown...")
+    if languages:
+        print(f"  Languages: {languages}")
+
+    try:
+        # Configure for speed - text-focused processing
+        config = {
+            "output_format": "markdown",
+            # Disable image extraction for speed (images won't be saved separately)
+            # "disable_image_extraction": True,  # Uncomment if you want to skip all images
+        }
+
+        if languages:
+            config["languages"] = languages.split(",")
+
+        config_parser = ConfigParser(config)
+
+        # Initialize the converter with optimized settings
+        converter = PdfConverter(
+            config=config_parser.generate_config_dict(),
+            artifact_dict=create_model_dict(),
+            processor_list=config_parser.get_processors(),
+            renderer=config_parser.get_renderer(),
+        )
+
+        # Convert the PDF file
+        rendered = converter(pdf_path)
+
+        # Extract text and images from rendered output
+        text, metadata, images = text_from_rendered(rendered)
+
+        # Save as markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        print(f"  OK Output saved to: {output_path}")
+
+        # Save images
+        if images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            os.makedirs(images_dir, exist_ok=True)
+            for img_name, img_data in images.items():
+                img_path = os.path.join(images_dir, img_name)
+                with open(img_path, "wb") as f:
+                    f.write(img_data)
+            print(f"  OK {len(images)} images saved to: {images_dir}")
+
+        # Skip metadata saving for speed
+        # if metadata:
+        #     metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+        #     import json
+        #     with open(metadata_path, "w", encoding="utf-8") as f:
+        #         json.dump(metadata, f, indent=2, ensure_ascii=False)
+        #     print(f"  OK Metadata saved to: {metadata_path}")
+
+        return (True, pdf_file.name)
+
+    except Exception as e:
+        print(f"  ERROR: Failed to convert {pdf_file.name}: {e}")
+        return (False, pdf_file.name)
+
+
+def convert_all_pdfs_fast(input_dir: str = "input", output_dir: str = "output", languages: str = None):
+    """
+    Convert all PDF files in the input directory to Markdown (sequential, memory-safe)
+
+    Args:
+        input_dir: Directory containing PDF files
+        output_dir: Directory to save the output
+        languages: Comma-separated language codes for OCR (e.g. "ko", "ko,en")
+    """
+    # Find all PDF files
+    pdf_pattern = os.path.join(input_dir, "*.pdf")
+    pdf_files = sorted(glob.glob(pdf_pattern))
+
+    if not pdf_files:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    print(f"Found {len(pdf_files)} PDF files to convert")
+    print("Mode: FAST (text-focused, sequential processing)")
+    if languages:
+        print(f"Languages: {languages}")
+    print("=" * 60)
+
+    successful = 0
+    failed = 0
+    failed_files = []
+
+    for i, pdf_file in enumerate(pdf_files, 1):
+        print(f"\n[{i}/{len(pdf_files)}]", end=" ")
+        success, filename = convert_pdf_to_markdown_fast(pdf_file, output_dir, languages)
+        if success:
+            successful += 1
+        else:
+            failed += 1
+            failed_files.append(filename)
+
+    print("\n" + "=" * 60)
+    print(f"Conversion complete!")
+    print(f"  Successful: {successful}")
+    print(f"  Failed: {failed}")
+    print(f"  Total: {len(pdf_files)}")
+
+    if failed_files:
+        print(f"\nFailed files:")
+        for filename in failed_files:
+            print(f"  - {filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Fast PDF to Markdown converter")
+    parser.add_argument("--input_dir", default="input", help="Input directory containing PDF files")
+    parser.add_argument("--output_dir", default="output", help="Output directory for markdown files")
+    parser.add_argument("--languages", default=None, help="Comma-separated language codes for OCR (e.g. ko, ko,en)")
+    args = parser.parse_args()
+
+    convert_all_pdfs_fast(args.input_dir, args.output_dir, args.languages)
--- a/convert_pdfs_parallel.py
+++ b/convert_pdfs_parallel.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Batch PDF to Markdown converter with parallel processing using marker-pdf library
+"""
+
+import os
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import multiprocessing
+
+
+def convert_pdf_to_markdown(pdf_path: str, output_dir: str = "output"):
+    """
+    Convert PDF file to Markdown
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save the output (default: "output")
+    """
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Get the base filename without extension
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name} to Markdown...")
+
+    try:
+        # Initialize the converter with model dictionary
+        converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+        )
+
+        # Convert the PDF file
+        rendered = converter(pdf_path)
+
+        # Extract text and images from rendered output
+        text, metadata, images = text_from_rendered(rendered)
+
+        # Save as markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+
+        print(f"  OK Output saved to: {output_path}")
+
+        # Save images if any
+        if images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            os.makedirs(images_dir, exist_ok=True)
+            for img_name, img_data in images.items():
+                img_path = os.path.join(images_dir, img_name)
+                with open(img_path, "wb") as f:
+                    f.write(img_data)
+            print(f"  OK {len(images)} images saved to: {images_dir}")
+
+        # Save metadata if available
+        if metadata:
+            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+            import json
+            with open(metadata_path, "w", encoding="utf-8") as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+            print(f"  OK Metadata saved to: {metadata_path}")
+
+        return (True, pdf_file.name)
+
+    except Exception as e:
+        print(f"  ERROR: Failed to convert {pdf_file.name}: {e}")
+        return (False, pdf_file.name)
+
+
+def convert_all_pdfs_parallel(input_dir: str = "input", output_dir: str = "output", max_workers: int = None):
+    """
+    Convert all PDF files in the input directory to Markdown using parallel processing
+
+    Args:
+        input_dir: Directory containing PDF files
+        output_dir: Directory to save the output
+        max_workers: Maximum number of parallel workers (default: CPU count - 1)
+    """
+    # Find all PDF files
+    pdf_pattern = os.path.join(input_dir, "*.pdf")
+    pdf_files = sorted(glob.glob(pdf_pattern))
+
+    if not pdf_files:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    # Determine number of workers
+    if max_workers is None:
+        max_workers = max(1, multiprocessing.cpu_count() - 1)
+
+    print(f"Found {len(pdf_files)} PDF files to convert")
+    print(f"Using {max_workers} parallel workers")
+    print("=" * 60)
+
+    successful = 0
+    failed = 0
+    failed_files = []
+
+    # Process PDFs in parallel
+    with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all tasks
+        future_to_pdf = {
+            executor.submit(convert_pdf_to_markdown, pdf_file, output_dir): pdf_file
+            for pdf_file in pdf_files
+        }
+
+        # Process completed tasks as they finish
+        for future in as_completed(future_to_pdf):
+            pdf_file = future_to_pdf[future]
+            try:
+                success, filename = future.result()
+                if success:
+                    successful += 1
+                else:
+                    failed += 1
+                    failed_files.append(filename)
+            except Exception as e:
+                print(f"  ERROR: Exception occurred for {pdf_file}: {e}")
+                failed += 1
+                failed_files.append(Path(pdf_file).name)
+
+    print("\n" + "=" * 60)
+    print(f"Conversion complete!")
+    print(f"  Successful: {successful}")
+    print(f"  Failed: {failed}")
+    print(f"  Total: {len(pdf_files)}")
+
+    if failed_files:
+        print(f"\nFailed files:")
+        for filename in failed_files:
+            print(f"  - {filename}")
+
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Parallel PDF to Markdown converter")
+    parser.add_argument("--input_dir", default="input", help="Input directory containing PDF files")
+    parser.add_argument("--output_dir", default="output", help="Output directory for markdown files")
+    parser.add_argument("--workers", type=int, default=2, help="Number of parallel workers (default: 2)")
+    args = parser.parse_args()
+
+    convert_all_pdfs_parallel(args.input_dir, args.output_dir, args.workers)
--- a/convert_with_cropped_images.py
+++ b/convert_with_cropped_images.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""
+PDF to Markdown converter with cropped figure extraction
+Uses marker-pdf to detect figures, then crops them from page images.
+Supports 2-column (multi-column) → single-column reordering.
+"""
+
+import os
+import re
+import glob
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from PIL import Image
+import fitz  # PyMuPDF
+
+
+def is_scanned_pdf(pdf_path: str, sample_pages: int = 3) -> bool:
+    """페이지에 선택 가능한 텍스트가 없으면 스캔 PDF로 판단"""
+    doc = fitz.open(pdf_path)
+    total = min(sample_pages, len(doc))
+    text_chars = 0
+    for i in range(total):
+        text_chars += len(doc[i].get_text().strip())
+    doc.close()
+    return text_chars < 50  # 글자 수가 매우 적으면 스캔본
+
+
+def reorder_text_by_columns(pdf_path: str) -> str:
+    """
+    텍스트 기반 PDF 전용: PyMuPDF 블록 좌표로 2단 → 1단 순서 재정렬.
+    각 페이지에서 좌측 컬럼 전체 → 우측 컬럼 전체 순으로 읽음.
+    """
+    doc = fitz.open(pdf_path)
+    pages_text = []
+
+    for page in doc:
+        blocks = page.get_text("blocks", sort=False)
+        text_blocks = [b for b in blocks if b[6] == 0 and b[4].strip()]
+        if not text_blocks:
+            continue
+
+        page_width = page.rect.width
+        mid_x = page_width / 2
+
+        left = [b for b in text_blocks if b[2] <= mid_x + 30]
+        right = [b for b in text_blocks if b[0] >= mid_x - 30]
+        span = [b for b in text_blocks if b[0] < mid_x - 30 and b[2] > mid_x + 30]
+
+        is_two_col = len(left) >= 2 and len(right) >= 2 and not span
+
+        if is_two_col:
+            left.sort(key=lambda b: b[1])
+            right.sort(key=lambda b: b[1])
+            ordered = left + right
+        else:
+            ordered = sorted(text_blocks, key=lambda b: (b[1], b[0]))
+
+        pages_text.append("\n\n".join(b[4].strip() for b in ordered))
+
+    doc.close()
+    return "\n\n---\n\n".join(pages_text)
+
+
+def extract_figure_images(pdf_path: str, rendered, output_dir: str, base_name: str):
+    """
+    Extract figure images by cropping from page images based on marker's detection
+
+    Args:
+        pdf_path: Path to PDF file
+        rendered: Marker's rendered output with figure positions
+        output_dir: Output directory
+        base_name: Base filename
+
+    Returns:
+        dict: Mapping of image names to image data
+    """
+    images_dict = {}
+
+    # Check if rendered has pages with image information
+    if not hasattr(rendered, 'pages') or not rendered.pages:
+        print("  No page information in rendered output")
+        return images_dict
+
+    # Open PDF with PyMuPDF to render pages as images
+    doc = fitz.open(pdf_path)
+
+    print(f"  Processing {len(rendered.pages)} pages for figure extraction...")
+
+    for page_idx, page_data in enumerate(rendered.pages):
+        page_num = page_idx + 1
+
+        # Check if page has images/figures
+        if not hasattr(page_data, 'images') or not page_data.images:
+            continue
+
+        print(f"    Page {page_num}: Found {len(page_data.images)} figure(s)")
+
+        # Render page as image
+        pdf_page = doc[page_idx]
+
+        # Render at 2x resolution for better quality
+        mat = fitz.Matrix(2, 2)
+        pix = pdf_page.get_pixmap(matrix=mat)
+
+        # Convert to PIL Image
+        import io
+        img_data = pix.tobytes("png")
+        page_img = Image.open(io.BytesIO(img_data))
+
+        # Extract each figure from this page
+        for fig_idx, fig_info in enumerate(page_data.images):
+            try:
+                # Get bounding box (marker stores positions)
+                if hasattr(fig_info, 'bbox'):
+                    bbox = fig_info.bbox
+
+                    # Scale bbox coordinates (marker uses PDF coordinates)
+                    # Adjust for 2x rendering
+                    x0, y0, x1, y1 = bbox
+                    x0, y0, x1, y1 = int(x0 * 2), int(y0 * 2), int(x1 * 2), int(y1 * 2)
+
+                    # Crop the figure
+                    cropped = page_img.crop((x0, y0, x1, y1))
+
+                    # Save to bytes
+                    from io import BytesIO
+                    img_bytes = BytesIO()
+                    cropped.save(img_bytes, format='PNG')
+
+                    # Generate image name
+                    img_name = f"_page_{page_num}_Figure_{fig_idx + 1}.png"
+                    images_dict[img_name] = img_bytes.getvalue()
+
+                    print(f"      Cropped figure {fig_idx + 1}: {x1-x0}x{y1-y0}px")
+
+            except Exception as e:
+                print(f"      Warning: Could not crop figure {fig_idx + 1}: {e}")
+
+    doc.close()
+    return images_dict
+
+
+def convert_pdf_with_cropped_images(pdf_path: str, output_dir: str = "output"):
+    """
+    Convert PDF to Markdown with cropped figure images.
+    - 스캔 PDF: marker-pdf OCR + 레이아웃 검출 (2단 자동 처리)
+    - 텍스트 PDF: PyMuPDF 블록 좌표 기반 2단→1단 재정렬
+    """
+    import io
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    pdf_file = Path(pdf_path)
+    base_name = pdf_file.stem
+
+    print(f"\nConverting {pdf_file.name}...")
+
+    scanned = is_scanned_pdf(pdf_path)
+    print(f"  PDF type: {'scanned (OCR)' if scanned else 'text-based (PyMuPDF column reorder)'}")
+
+    try:
+        if not scanned:
+            # 텍스트 기반 PDF: PyMuPDF로 2단 재정렬 추출
+            print("  Extracting text with column reordering...")
+            text = reorder_text_by_columns(pdf_path)
+            metadata = None
+            marker_images = {}
+        else:
+            # 스캔 PDF: marker-pdf가 OCR + 레이아웃(2단) 처리
+            converter = PdfConverter(
+                artifact_dict=create_model_dict(),
+            )
+
+            print("  Running marker-pdf OCR and layout detection...")
+            rendered = converter(pdf_path)
+
+            text, metadata, marker_images = text_from_rendered(rendered)
+
+        # Fix image paths: prepend {base_name}_images/ folder to image references
+        # 공백을 %20으로 인코딩 — Obsidian(CommonMark) 경로 파싱 오류 방지
+        safe_base_name = base_name.replace(' ', '%20')
+        text = re.sub(
+            r'!\[([^\]]*)\]\(([^/)][^)]*)\)',
+            rf'![\1]({safe_base_name}_images/\2)',
+            text
+        )
+
+        # Save markdown
+        output_path = os.path.join(output_dir, f"{base_name}.md")
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(text)
+        print(f"  OK Markdown saved: {output_path}")
+
+        # Extract cropped figure images
+        print("  Extracting figures from pages...")
+        cropped_images = extract_figure_images(pdf_path, rendered, output_dir, base_name)
+
+        if cropped_images:
+            images_dir = os.path.join(output_dir, f"{base_name}_images")
+            os.makedirs(images_dir, exist_ok=True)
+
+            for img_name, img_data in cropped_images.items():
+                img_path = os.path.join(images_dir, img_name)
+                with open(img_path, "wb") as f:
+                    f.write(img_data)
+
+            print(f"  OK {len(cropped_images)} figures saved to: {images_dir}")
+        else:
+            print("  ! No figures extracted (trying alternative method...)")
+            # Fallback: use marker's images if available
+            if marker_images:
+                images_dir = os.path.join(output_dir, f"{base_name}_images")
+                os.makedirs(images_dir, exist_ok=True)
+
+                saved_count = 0
+                for img_name, img_data in marker_images.items():
+                    try:
+                        from io import BytesIO
+                        if isinstance(img_data, Image.Image):
+                            img_bytes = BytesIO()
+                            img_data.save(img_bytes, format='PNG')
+                            img_bytes = img_bytes.getvalue()
+                        else:
+                            img_bytes = img_data
+
+                        if img_bytes and len(img_bytes) > 0:
+                            img_path = os.path.join(images_dir, img_name)
+                            with open(img_path, "wb") as f:
+                                f.write(img_bytes)
+                            saved_count += 1
+                    except Exception as e:
+                        print(f"    Warning: Could not save {img_name}: {e}")
+
+                if saved_count > 0:
+                    print(f"  OK {saved_count} images from marker saved")
+                else:
+                    print("  ! No valid images to save")
+
+        # Save metadata
+        if metadata:
+            import json
+            metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
+            with open(metadata_path, "w", encoding="utf-8") as f:
+                json.dump(metadata, f, indent=2, ensure_ascii=False)
+
+        return True
+
+    except Exception as e:
+        print(f"  ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def convert_all_pdfs(input_dir: str = "input", output_dir: str = "output"):
+    """
+    Convert all PDFs with cropped figure extraction
+    Each PDF is converted in a separate process to avoid multiprocessing issues
+    """
+    pdf_pattern = os.path.join(input_dir, "*.pdf")
+    pdf_files = sorted(glob.glob(pdf_pattern))
+
+    if not pdf_files:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    print(f"Found {len(pdf_files)} PDF file(s)")
+    print("=" * 60)
+
+    successful = 0
+    failed = 0
+
+    import subprocess
+    import sys
+
+    for pdf_file in pdf_files:
+        print(f"\nStarting conversion of: {os.path.basename(pdf_file)}")
+
+        result = subprocess.run(
+            [sys.executable, __file__, "--single", pdf_file, output_dir],
+            capture_output=False
+        )
+
+        if result.returncode == 0:
+            successful += 1
+        else:
+            failed += 1
+            print(f"  FAILED: {os.path.basename(pdf_file)}")
+
+    print("\n" + "=" * 60)
+    print(f"Conversion complete!")
+    print(f"  Successful: {successful}")
+    print(f"  Failed: {failed}")
+    print(f"  Total: {len(pdf_files)}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Check if running in single-file mode (called by subprocess)
+    if len(sys.argv) >= 4 and sys.argv[1] == "--single":
+        pdf_file = sys.argv[2]
+        output_dir = sys.argv[3]
+        success = convert_pdf_with_cropped_images(pdf_file, output_dir)
+        sys.exit(0 if success else 1)
+    else:
+        # Normal batch mode
+        convert_all_pdfs()
--- a/debug_marker_images.py
+++ b/debug_marker_images.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Debug marker-pdf image extraction
+"""
+
+import os
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+
+
+def debug_image_extraction(pdf_path: str):
+    """
+    Print diagnostics about the images marker-pdf returns for one PDF.
+
+    The PDF is converted once, then the report covers ``rendered.images``,
+    the tuple returned by ``text_from_rendered`` and every attribute of the
+    rendered object whose name contains "image". Nothing is written to disk
+    and nothing is returned; any exception is caught and printed together
+    with its traceback.
+
+    Args:
+        pdf_path: Path of the PDF file to inspect.
+    """
+    def _size_of(value):
+        # Image payloads are not guaranteed to be sized: raw bytes support
+        # len(), but an image object (presumably a PIL image -- TODO confirm
+        # against the installed marker version) does not, and an unguarded
+        # len() would abort the whole report with a TypeError.
+        if not hasattr(value, '__len__'):
+            return 0 if value is None else 'N/A'
+        return len(value)
+
+    pdf_file = Path(pdf_path)
+    print(f"Debugging image extraction for: {pdf_file.name}")
+    print("=" * 60)
+
+    try:
+        # Initialize converter
+        converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+        )
+
+        # Convert
+        print("\nConverting PDF...")
+        rendered = converter(pdf_path)
+        print(f"  Rendered type: {type(rendered)}")
+        print(f"  Rendered attributes: {dir(rendered)}")
+
+        # Check what's in rendered (only the first three images are listed)
+        rendered_images = getattr(rendered, 'images', None)
+        if hasattr(rendered, 'images'):
+            print(f"\n  rendered.images exists: {len(rendered_images) if rendered_images else 0} images")
+            if rendered_images:
+                for idx, (key, val) in enumerate(list(rendered_images.items())[:3]):
+                    print(f"    Image {idx}: {key}, data size: {_size_of(val)}")
+
+        # Extract text and images
+        print("\nExtracting text and images...")
+        text, metadata, images = text_from_rendered(rendered)
+
+        print(f"\n  Text length: {len(text)} characters")
+        print(f"  Metadata: {type(metadata)}")
+        print(f"  Images dict: {len(images) if images else 0} items")
+
+        if images:
+            print("\n  Detailed image info:")
+            for idx, (img_name, img_data) in enumerate(images.items()):
+                print(f"    {idx + 1}. Name: {img_name}")
+                print(f"       Data type: {type(img_data)}")
+                print(f"       Data size: {_size_of(img_data)} bytes")
+                if not img_data:
+                    print(f"       WARNING: Empty data!")
+                else:
+                    try:
+                        print(f"       First 20 bytes: {img_data[:20]}")
+                    except TypeError:
+                        # Not sliceable, so this is not a raw byte payload.
+                        print(f"       First 20 bytes: N/A (payload is not raw bytes)")
+        else:
+            print("\n  WARNING: No images returned!")
+
+        # Check rendered object for image data
+        print("\n  Checking rendered object structure:")
+        if hasattr(rendered, '__dict__'):
+            for key, val in rendered.__dict__.items():
+                if 'image' in key.lower():
+                    print(f"    {key}: {type(val)}, length: {len(val) if hasattr(val, '__len__') else 'N/A'}")
+
+        # Try to access images directly from rendered
+        if rendered_images:
+            print("\n  Attempting direct image access:")
+            print(f"    Total images in rendered: {len(rendered_images)}")
+            for idx, (img_name, img_obj) in enumerate(list(rendered_images.items())[:3]):
+                print(f"\n    Image {idx + 1}: {img_name}")
+                print(f"      Type: {type(img_obj)}")
+                print(f"      Attributes: {dir(img_obj) if hasattr(img_obj, '__dir__') else 'None'}")
+                if hasattr(img_obj, 'tobytes'):
+                    img_bytes = img_obj.tobytes()
+                    print(f"      Bytes: {len(img_bytes)}")
+                elif hasattr(img_obj, 'save'):
+                    print(f"      Has save method (PIL Image?)")
+
+    except Exception as e:
+        # Debug helper: report the failure instead of propagating it.
+        print(f"\n  ERROR: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    import glob
+
+    # Inspect whichever PDF glob lists first in the input folder.
+    candidates = glob.glob("input/*.pdf")
+    if not candidates:
+        print("No PDF files found in input folder")
+    else:
+        debug_image_extraction(candidates[0])
--- a/debug_single_page.py
+++ b/debug_single_page.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""
+Debug image extraction for a single page
+"""
+
+import os
+from pathlib import Path
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+import pypdfium2 as pdfium
+
+
+def debug_single_page(pdf_path: str, page_num: int = 1):
+    """
+    Print image-extraction diagnostics for one PDF, focused on one page.
+
+    Step 1 lists the page objects pypdfium2 reports for ``page_num``.
+    Steps 2 and 3 run marker-pdf on ``pdf_path`` and report the images it
+    returns; ``page_num`` is not passed to marker. The first non-empty image
+    is saved under ``output/debug_test`` as a sample. Errors in either part
+    are caught and printed; nothing is returned.
+
+    Args:
+        pdf_path: Path of the PDF file to inspect.
+        page_num: 1-indexed page number used for the pypdfium2 check.
+    """
+    def _size_of(value):
+        # Payloads may be raw bytes (sized) or image objects without
+        # __len__ (presumably PIL images -- TODO confirm); never call len()
+        # blindly, or the report dies with a TypeError.
+        if not hasattr(value, '__len__'):
+            return 0 if value is None else 'N/A'
+        return len(value)
+
+    pdf_file = Path(pdf_path)
+    print(f"Debugging page {page_num} of: {pdf_file.name}")
+    print("=" * 60)
+
+    # First check what PyPDFium2 sees
+    print("\n1. Checking with PyPDFium2:")
+    try:
+        pdf = pdfium.PdfDocument(pdf_path)
+        try:
+            page = pdf[page_num - 1]  # 0-indexed
+
+            print(f"   Page {page_num} objects:")
+            # Whether the image-object constant is exposed at top level does
+            # not change per object, so look it up once.
+            has_top_level_const = hasattr(pdfium, 'FPDF_PAGEOBJ_IMAGE')
+            obj_count = 0
+            for obj in page.get_objects():
+                obj_count += 1
+                if has_top_level_const:
+                    if obj.type == pdfium.FPDF_PAGEOBJ_IMAGE:
+                        print(f"     - Image object found (old API)")
+                else:
+                    print(f"     - Object type: {obj.type}")
+
+            print(f"   Total objects on page: {obj_count}")
+        finally:
+            # Release the document even when the page lookup or listing fails.
+            pdf.close()
+    except Exception as e:
+        print(f"   PyPDFium2 error: {e}")
+
+    # Now check marker-pdf
+    print("\n2. Checking with marker-pdf:")
+    try:
+        converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+        )
+
+        print("   Converting...")
+        rendered = converter(pdf_path)
+
+        # Check rendered object (only the first five images are listed)
+        print(f"\n   Rendered type: {type(rendered)}")
+
+        if hasattr(rendered, 'images'):
+            print(f"   rendered.images: {len(rendered.images) if rendered.images else 0} images")
+            if rendered.images:
+                for img_name, img_data in list(rendered.images.items())[:5]:
+                    print(f"     - {img_name}: {_size_of(img_data)} bytes")
+
+        # Extract using text_from_rendered
+        print("\n3. Extracting with text_from_rendered:")
+        text, metadata, images = text_from_rendered(rendered)
+
+        print(f"   Extracted images: {len(images) if images else 0}")
+        if images:
+            for img_name, img_data in images.items():
+                print(f"     - {img_name}: {_size_of(img_data)} bytes")
+                if not img_data or _size_of(img_data) == 0:
+                    print(f"       ⚠️ WARNING: Empty image data!")
+
+        # Save a test image if available
+        if images:
+            output_dir = "output/debug_test"
+            os.makedirs(output_dir, exist_ok=True)
+
+            for img_name, img_data in images.items():
+                if not img_data:
+                    continue
+                img_path = os.path.join(output_dir, img_name)
+                if isinstance(img_data, (bytes, bytearray)):
+                    with open(img_path, "wb") as f:
+                        f.write(img_data)
+                elif hasattr(img_data, 'save'):
+                    # Image object rather than bytes: let it serialize itself.
+                    img_data.save(img_path)
+                else:
+                    # Unknown payload type; try the next image instead.
+                    continue
+                print(f"\n   ✓ Saved test image: {img_path}")
+                break
+
+    except Exception as e:
+        print(f"   marker-pdf error: {e}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    import glob
+
+    candidates = glob.glob("input/*.pdf")
+    if not candidates:
+        print("No PDF files found in input folder")
+    else:
+        # Page 2 is the test target: per the generated markdown it should
+        # contain Figure 1.2 and Figure 1.3.
+        debug_single_page(candidates[0], page_num=2)
--- a/docs/history/.gitkeep
+++ b/docs/history/.gitkeep
--- a/docs/history/2026-03-31_GPU설정및전체MD변환시작.md
+++ b/docs/history/2026-03-31_GPU설정및전체MD변환시작.md
@@ -0,0 +1,33 @@
+**이슈**: #1
+**소요 시간**: 90분
+**Context 사용량**: input 80k / output 10k tokens
+
+## 작업 내용
+- torch CPU 버전(2.9.1)을 GPU 버전(2.7.0+cu126)으로 교체하여 RTX 3060 GPU OCR 활성화
+- MSEW3.0 매뉴얼 96페이지 중 01~12 MD 변환 완료
+- 01~12 변환된 MD 파일의 이미지 참조 아래에 파라미터 설명 삽입 완료
+- 나머지 13~96은 PowerShell 명령어로 야간 자동 변환 예정
+
+## 변경 파일
+- `.venv` : torch 2.9.1+cpu → 2.7.0+cu126 교체
+- `output/MSWE3.0 Manual-01~12.md` : 생성 완료
+- `output/MSWE3.0 Manual-03,04,09,10,11,12.md` : 이미지 파라미터 설명 삽입
+
+## 주요 결정사항
+- torch pip 설치 시 `.venv/Scripts/python.exe -m pip` 사용해야 올바른 venv에 설치됨 (단순 pip 명령은 다른 venv에 설치됨)
+- 병렬 변환(2개 동시)은 GPU 과부하 위험 → 1개씩 순차 처리로 변경
+- 야간 변환: 완료 파일 자동 건너뜀 로직 포함 PowerShell 스크립트 사용
+
+## 추출된 파라미터 (03~12 추가분)
+- Available Connection Strength: Confining Stress Sigma [kPa], CRu-1, CRs-1, GEOGRID 타입(1.4T~5.15T)
+- Project Identification: Project Title(필수), Designer(필수), Date/Time, Company/Firm, Project No.
+- Program Manager 메인화면: AASHTO(ASD/LRFD)/NCMA 설계모드 선택
+- Reinforcement Layout: LAYER#, Geogrid Height [m], Geogrid Type#, Vertical distance [m]
+- FACIA(Blocks): Depth Wu [m]=0.3, Height Hu [m]=0.2, Unit weight γ [kN/m³]=24, Gu [m]=0.15
+- Reduction factors at connection: RF d(내구성), RF c(크리프), BREAK Overall Fs=1.5, PULLOUT Overall Fs=1.5, 지진시 Tc-pullout 감소율=80%
+- Results 화면: Final Values for Design, 층별 강도/연결부 체크 결과
+
+## 문제 및 해결
+- `pip install` 이 docuConverter01 venv에 설치되는 문제 → `python -m pip install` 로 해결
+- torch 2.6.0: marker-pdf 호환 안됨(>=2.7.0 필요) → 2.7.0+cu126 사용
+- 병렬 실행 과부하 → 순차 처리로 변경
--- a/docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md
+++ b/docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md
@@ -0,0 +1,25 @@
+**이슈**: #1
+**소요 시간**: 약 30분 (추정)
+**Context 사용량**: input 약 180k / output 약 3k tokens (추정 — 컨텍스트 초과로 정확한 수치 기록 불가)
+
+## 작업 내용
+MSEW3.0 Manual 31~47번 이미지 파라미터 분석 시도.
+이미지 Read 툴로 31~33번 일부 이미지를 읽던 중 컨텍스트 한도 초과로 강제 종료.
+MD 파일에 실제 삽입(Edit)은 한 건도 이루어지지 않음 — 다음 세션에서 전체 재처리됨.
+
+## 변경 파일
+- 없음 (컨텍스트 초과로 Edit 도달 전 종료)
+
+## 읽은 이미지 목록 (삽입 미완료)
+- `output/MSWE3.0 Manual-31_images/` : Figure_1, Figure_3, Picture_5, Picture_13 (4개)
+- `output/MSWE3.0 Manual-32_images/` : Figure_1, Figure_8, Picture_22 (3개)
+- `output/MSWE3.0 Manual-33_images/` : Figure_2 (1개)
+- 합계 8개 이미지 읽기 완료, 나머지 55개 미처리
+
+## 주요 결정사항
+- 이미지 Read 시 각 이미지당 약 15~20k 입력 토큰 소비 → 8개만 읽어도 컨텍스트 급증
+- 다음 세션에서 전체 31~47 이미지 일괄 재처리 전략으로 변경
+
+## 문제 및 해결
+- 이미지 멀티모달 분석이 컨텍스트를 매우 빠르게 소비함
+- 다음 세션(`2026-03-31_MSEW매뉴얼31-47파라미터삽입.md`)에서 31~47 전체 완료
--- a/docs/history/2026-03-31_MSEW매뉴얼31-47파라미터삽입.md
+++ b/docs/history/2026-03-31_MSEW매뉴얼31-47파라미터삽입.md
@@ -0,0 +1,34 @@
+**이슈**: #1
+**소요 시간**: 40분
+**Context 사용량**: input 120k / output 8k tokens
+
+## 작업 내용
+MSEW3.0 Manual MD 파일 31~47번에 이미지 파라미터 설명 삽입 (이전 세션에서 컨텍스트 초과로 중단된 작업 재개)
+
+## 변경 파일
+- `output/MSWE3.0 Manual-31.md` : Figure_1 (내적K선택), Figure_3 (Coulomb δ), Picture_5 (외적K δ설명), Picture_13 (Wrap-around 수직간격)
+- `output/MSWE3.0 Manual-32.md` : Figure_1 (MetalStrip Program Manager), Figure_8 (Simple Geometry), Picture_22 (근입깊이)
+- `output/MSWE3.0 Manual-33.md` : Figure_2 (Complex Structures), Figure_9 (Foundation Soil Properties), Figure_12 (Metal Strip Design 메인)
+- `output/MSWE3.0 Manual-34.md` : Figure_6 (균등간격), Figure_7 (계산진행), Figure_9 (부식두께 NOTE), Figure_11 (수평간격범위)
+- `output/MSWE3.0 Manual-35.md` : Figure_2 (보강재종류수), Figure_4 (두종류보강재데이터), Figure_6 (배치테이블), Figure_9 (Metal Strip Data), Figure_10 (상호작용파라미터)
+- `output/MSWE3.0 Manual-36.md` : Figure_9 (토압계수변화), Picture_11 (Fw배치), Picture_13 (외적K선택)
+- `output/MSWE3.0 Manual-37.md` : Figure_0 (패널물성), Figure_5 (연결부강도관계), Figure_7 (연결부감소계수)
+- `output/MSWE3.0 Manual-38.md` : Figure_1 (동적하중설계), Figure_6 (지층기본설정)
+- `output/MSWE3.0 Manual-39.md` : Figure_0 (첫번째지층), Figure_2 (두번째지층), Picture_8 (결과확인화면), Picture_12 (복합안정성아이콘)
+- `output/MSWE3.0 Manual-40.md` : Figure_0 (복합안정성초기값), Figure_7 (해석기준선택), Figure_13 (SearchGrid), Figure_18 (Bishop진행)
+- `output/MSWE3.0 Manual-41.md` : Figure_0 (저부파괴여부), Figure_2 (저부파괴탐색), Picture_13 (중간결과아이콘), Figure_18 (외적/내적중간결과)
+- `output/MSWE3.0 Manual-42.md` : Figure_6 (지지력정적), Figure_14 (지지력동적), Figure_17 (활동정적)
+- `output/MSWE3.0 Manual-43.md` : Figure_1 (활동동적1), Figure_2 (활동동적2), Figure_4 (활동상세다이어그램), Figure_6 (활동최소길이)
+- `output/MSWE3.0 Manual-44.md` : Figure_0 (편심정적), Figure_2 (편심동적), Figure_4 (편심상세다이어그램), Figure_6 (편심최소길이)
+- `output/MSWE3.0 Manual-45.md` : Picture_2 (Geotextile중간결과아이콘), Figure_3 (내적강도결과테이블), Figure_15 (동적강도결과), Figure_17 (Tmax분포)
+- `output/MSWE3.0 Manual-46.md` : Figure_1 (Tmax수평응력분포), Figure_4 (연결부정적), Picture_9 (연결부안전율상세), Figure_11 (연결부동적)
+- `output/MSWE3.0 Manual-47.md` : Figure_0 (인발정적), Figure_7 (인발동적1), Figure_8 (인발동적2), Figure_11 (최종설계결과)
+
+## 주요 결정사항
+- 총 63개 이미지를 Read 툴로 멀티모달 분석 후 각 MD 파일에 삽입
+- 31-47 범위의 내용은 Metal Strip 설계(31-38), 전체안정해석(38-41), 결과확인(41-47)
+- 결과 확인 화면(42-47)은 입력 파라미터보다 출력 결과 컬럼명을 설명하는 방식으로 기술
+
+## 문제 및 해결
+- 이전 세션에서 Manual-31 이미지 4개, Manual-32 이미지 3개, Manual-33 Figure_2 이미지를 이미 읽었으나 컨텍스트 초과로 삽입 전 중단
+- 이번 세션에서 나머지 이미지(33 Figure_9~47 전체)를 병렬 Read로 한꺼번에 분석 후 순차 삽입
--- a/docs/history/2026-03-31_MSEW매뉴얼60-83파라미터삽입.md
+++ b/docs/history/2026-03-31_MSEW매뉴얼60-83파라미터삽입.md
@@ -0,0 +1,42 @@
+**이슈**: #1
+**소요 시간**: 약 90분
+**Context 사용량**: input 약 220k / output 약 15k tokens
+
+## 작업 내용
+MSEW3.0 Manual MD 파일 60~83번에 이미지 파라미터 설명 삽입.
+컨텍스트 초과로 종료된 이전 세션에서 이어받아, 사용자 요청으로 60번부터 진행.
+이전 세션의 누락된 히스토리(`2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md`)도 추정 작성.
+
+## 변경 파일
+- `output/MSWE3.0 Manual-60.md` : Figure_0 (외부안정분석), Figure_6 (내적안정테이블), Figure_15 (Tmax분포)
+- `output/MSWE3.0 Manual-61.md` : Figure_0 (Tmax상세), Figure_2 (이상값버튼), Figure_3 (목표Fs입력), Figure_4 (이상값테이블), Figure_7 (연결부분석)
+- `output/MSWE3.0 Manual-62.md` : Figure_5 (인발저항테이블), Figure_8 (인발상세), Picture_14 (GlobalStability버튼)
+- `output/MSWE3.0 Manual-63.md` : Figure_0 (복합안정초기값), Picture_7 (해석방법선택), Figure_11 (탐색격자), Picture_16 (Bishop시작확인), Picture_18 (저부파괴여부)
+- `output/MSWE3.0 Manual-64.md` : Figure_1 (저부파괴탐색격자), Figure_7 (복합결과테이블), Picture_12 (컨투어분포), Figure_14 (3D분포)
+- `output/MSWE3.0 Manual-65.md` : Figure_1 (파괴원다이어그램), Figure_3 (보강재기여), Figure_5 (인장력분포), Picture_9 (저부컨투어), Figure_11 (저부3D)
+- `output/MSWE3.0 Manual-66.md` : Figure_0 (저부파괴원), Figure_2 (저부보강재기여), Figure_4 (인장력분포), Figure_7 (지진결과), Picture_9 (임계원지진)
+- `output/MSWE3.0 Manual-67.md` : Figure_20 (전면블록데이터)
+- `output/MSWE3.0 Manual-68.md` : Figure_3 (연결부감소계수), Figure_8 (연결부강도입력), Figure_12 (전단저항입력)
+- `output/MSWE3.0 Manual-69.md` : Figure_0 (지오그리드분석메뉴), Picture_4 (보강재종류수), Picture_6 (보강재데이터), Figure_8 (층별배치입력)
+- `output/MSWE3.0 Manual-70.md` : Picture_6 (내적토압계수안내), Figure_8 (외적토압계수안내), Figure_11 (결과메인화면), Figure_19 (지지력결과)
+- `output/MSWE3.0 Manual-71.md` : Figure_8 (지지력지진), Figure_11 (활동결과테이블), Figure_20 (활동정적상세), Figure_22 (활동지진상세)
+- `output/MSWE3.0 Manual-72.md` : Figure_2 (편심결과테이블), Figure_5 (편심정적상세), Figure_7 (편심지진상세), Figure_13 (내적안정결과)
+- `output/MSWE3.0 Manual-73.md` : Figure_0 (Tmax분포1), Figure_2 (Tmax분포2), Figure_5 (이상값버튼), Figure_6 (목표Fs입력), Figure_7 (이상값테이블)
+- `output/MSWE3.0 Manual-74.md` : Figure_0 (연결부결과테이블), Picture_7 (연결부Fs요약), Figure_9 (Bulging테이블), Figure_11 (힌지높이)
+- `output/MSWE3.0 Manual-75.md` : Figure_1 (최대비보강높이), Figure_5 (인발결과테이블), Figure_8 (인발상세테이블)
+- `output/MSWE3.0 Manual-76.md` : Picture_0 (결과메인), Figure_1 (텍스트저장버튼), Picture_5 (PrintPreview1), Picture_6 (PrintPreview2), Figure_11 (비트맵저장)
+- `output/MSWE3.0 Manual-80.md` : Figure_5 (연결력비율테이블), Figure_6 (연결력그래프)
+- `output/MSWE3.0 Manual-81.md` : Picture_8 (보강재배치도)
+- `output/MSWE3.0 Manual-83.md` : Picture_5 (경사배면배치도)
+- `docs/history/2026-03-31_MSEW매뉴얼31-47이미지읽기시도중단.md` : 누락 히스토리 추정 작성
+
+## 주요 결정사항
+- 77~79, 82, 84~86번은 _images 폴더 없어 건너뜀
+- 결과 화면(분석 결과 테이블, 다이어그램)은 출력 컬럼명과 의미 위주로 기술
+- 입력 다이얼로그는 파라미터명·단위·샘플값 위주로 기술
+- 60~76번: NCMA 방식 Geogrid/Geotextile 결과 확인 화면이 주를 이룸
+- 80~83번: 텍스트 출력 결과 파일 관련 배치도 및 집계 그래프
+
+## 문제 및 해결
+- Manual-65 Picture_9/Figure_11 삽입 시 공백 라인 차이로 첫 시도 실패 → 파일 재읽기 후 정확한 문자열로 수정 성공
+- 파일 86번까지만 존재(사용자가 98번까지라고 했으나 실제 변환 파일은 86번이 마지막)
--- a/docs/history/2026-03-31_MSEW매뉴얼이미지파라미터추출.md
+++ b/docs/history/2026-03-31_MSEW매뉴얼이미지파라미터추출.md
@@ -0,0 +1,31 @@
+**이슈**: #1
+**소요 시간**: 30분
+**Context 사용량**: input 45k / output 6k tokens
+
+## 작업 내용
+MSEW3.0 매뉴얼 샘플 PDF 3페이지(06, 07, 08)에서 추출된 이미지를 Claude Code의 Read 툴(멀티모달)로 직접 분석하여, MD 파일의 이미지 참조 바로 아래에 파라미터명과 샘플값을 삽입.
+API 키 없이 Claude Code 구독으로 처리하는 워크플로우 검증 완료.
+
+## 변경 파일
+- `output/MSWE3.0 Manual-06.md` : 이미지 3개 아래 파라미터 설명 삽입
+- `output/MSWE3.0 Manual-07.md` : 이미지 1개 아래 파라미터 설명 삽입
+- `output/MSWE3.0 Manual-08.md` : 이미지 3개 아래 파라미터 설명 삽입
+
+## 주요 결정사항
+- Python 스크립트 대신 Claude Code가 직접 Read(이미지) → Edit(MD) 수행
+- API 키 불필요 — Claude Code 구독으로 이미지 분석 가능
+- 삽입 형식: `` > **[화면명]** \n> - `파라미터명`: 샘플값 ``
+- 세션당 약 15~20페이지 처리 가능 (컨텍스트 한계)
+
+## 추출된 파라미터 목록
+- 메인 메뉴: General Information, Geometry and Surcharge, Soil Data, Reinforcement (Geogrid), FACIA (Blocks), Seismic Parameters, Strata for Global Stability Analysis, Target Performance Criteria
+- Wall Embedment: Type in front of wall, Embedded depth E [m]
+- Geometry/Surcharge: Height H [m], BackSlope [deg], Batter, BackSlope ris [m]
+- Geogrid Design: Le [m], L/Hd, L [m], 보강재 길이 옵션(Uniform/Minimum), 강도·간격 옵션, Internal/External Stability K
+- Reinforcement Types: Number of reinforcement types
+- Geogrid DB: Product Name, Ultimate Tensile Strength [kN/m], Strength Reduction factors
+- Reinforcement Layout: From/To [m], Geogrid Type #, T-allowable [kN/m]
+
+## 문제 및 해결
+- pdftoppm 미설치로 Read 툴로 PDF 직접 읽기 불가 → PyMuPDF로 텍스트 추출 후 OCR(marker-pdf)로 보완
+- 온라인 공식 매뉴얼 없음 (MSEW 3.0은 2020년 지원 종료) → 로컬 샘플 PDF 활용
--- a/docs/history/2026-03-31_이미지분석코드정리및마무리.md
+++ b/docs/history/2026-03-31_이미지분석코드정리및마무리.md
@@ -0,0 +1,18 @@
+**이슈**: #1
+**소요 시간**: 15분
+**Context 사용량**: input 28k / output 4k tokens
+
+## 작업 내용
+convert_with_cropped_images.py에 이미지 분석 기능(analyze_image_with_claude, insert_image_descriptions)을 추가했다가, 이후 요청에 따라 삭제하여 PDF→MD 변환 + 이미지 추출까지만 담당하도록 정리.
+이미지 분석은 별도 파일로 특화 개발 예정.
+
+## 변경 파일
+- `convert_with_cropped_images.py` : 이미지 분석 관련 함수 2개(analyze_image_with_claude, insert_image_descriptions) 및 호출 코드 제거. base64/dotenv import 제거.
+
+## 주요 결정사항
+- convert_with_cropped_images.py는 PDF→MD 변환 + 이미지 파일 추출까지만 담당
+- 이미지 분석(멀티모달 AI)은 이 파일을 복제한 별도 스크립트에서 특화 구현 예정
+- 분리 이유: 매뉴얼 이미지는 범용 분석이 아닌 특화된 프롬프트/로직이 필요
+
+## 문제 및 해결
+없음
--- a/docs/history/2026-03-31_훅설정및PDF2단변환기능추가.md
+++ b/docs/history/2026-03-31_훅설정및PDF2단변환기능추가.md
@@ -0,0 +1,28 @@
+**이슈**: #1
+**소요 시간**: 40분
+**Context 사용량**: input 35k / output 8k tokens
+
+## 작업 내용
+1. common/.claude/hooks 훅을 프로젝트에 적용
+2. convert_with_cropped_images.py에 2단(다단) → 1단 변환 기능 추가
+3. 샘플 PDF(MSWE3.0 Manual-06.pdf) 변환 테스트
+4. 히스토리 훅 미작동 원인 분석 및 수정
+
+## 변경 파일
+- `.claude/settings.json` : 신규 생성 — UserPromptSubmit/PostToolUse/Stop 훅 등록
+- `.claude/hooks/` : common에서 훅 파일 4개 복사 (session-context.sh, guard-history-fields.sh/.py, guard-history-reminder.sh)
+- `.claude/hooks/session-context.sh` : 히스토리 기록 지시 문구 추가 (stdout으로 Claude에게 전달)
+- `convert_with_cropped_images.py` : `is_scanned_pdf()`, `reorder_text_by_columns()` 함수 추가 — 스캔/텍스트 PDF 자동 판별 후 2단→1단 처리
+- `CLAUDE.md` : 신규 생성 — 히스토리 작성 규칙 및 템플릿 정의
+- `docs/history/.gitkeep` : 신규 생성
+
+## 주요 결정사항
+- 스캔 PDF → marker-pdf surya 레이아웃 모델이 자동으로 2단 컬럼 검출+재정렬
+- 텍스트 PDF → PyMuPDF 블록 좌표 기반: 페이지 폭 절반 ±30px 기준으로 좌/우 컬럼 분리 후 좌→우 순 합산
+- 스캔 판정 기준: 샘플 3페이지에서 텍스트 50자 미만이면 스캔 PDF로 처리
+- 훅 실행 인터프리터: `.venv/Scripts/python.exe` 사용 (python/python3 명령은 다른 Python 환경을 가리킴)
+
+## 문제 및 해결
+- **훅 미작동 원인**: CLAUDE.md 없음 + session-context.sh에 작성 지시 없음 + guard-history-reminder.sh가 stderr 출력으로 Claude에게 전달 안 됨 → session-context.sh stdout에 지시 문구 추가 + CLAUDE.md 생성으로 해결
+- **ModuleNotFoundError(marker)**: python/python3 명령이 marker 미설치 Python 가리킴 → .venv/Scripts/python.exe 직접 지정으로 해결
+- **샘플 PDF 1페이지, 이미지 기반**: PyMuPDF 텍스트 블록 0개 확인 → marker-pdf OCR 경로로 처리, 정상 변환 완료
--- a/docs/history/2026-04-01_MD파일병합및이미지경로통합.md
+++ b/docs/history/2026-04-01_MD파일병합및이미지경로통합.md
@@ -0,0 +1,19 @@
+**소요 시간**: 10분
+**Context 사용량**: input 18k / output 2k tokens
+
+## 작업 내용
+96개 MD 파일을 하나로 병합하는 방법 설계 및 테스트 (06~08 페이지 3개).
+이미지 파일명 충돌 문제를 해결하기 위해 페이지 번호 prefix를 붙여 단일 폴더로 통합하는 방식 채택.
+
+## 변경 파일
+- `merge_markdown.py` : 전면 재작성 — 이미지 통합 폴더 생성, 파일명 rename, MD 내 경로 치환, file_range 파라미터 지원
+
+## 주요 결정사항
+- 이미지 rename 규칙: `{stem}_images/_page_0_Figure_3.jpeg` → `images/p06_Figure_3.jpeg`
+  - `_page_0_` 접두사 제거, 페이지 번호(zero-padded)를 prefix로
+- 병합 파일은 `output/` 안에 저장 → 상대경로 `images/` 그대로 유효
+- `file_range` 파라미터로 테스트 범위 지정 가능
+
+## 문제 및 해결
+- 문제: 모든 MD가 단일 페이지이므로 `_page_0_Figure_X` 이름이 96개 파일에서 중복
+- 해결: 이미지를 단일 `images/` 폴더로 복사할 때 `p{pagenum}_` 접두사 추가하여 고유명 보장
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -0,0 +1,39 @@
+# documan 사용법
+
+## MD 파일 병합 (merge_markdown.py)
+
+PDF에서 변환된 페이지별 MD 파일들을 하나의 파일로 합친다.
+이미지도 `output/images/` 폴더로 통합되고, MD 내 경로가 자동으로 업데이트된다.
+
+### 전체 병합
+
+```bash
+python merge_markdown.py
+```
+
+- 입력: `output/MSWE3.0 Manual-01.md` ~ `output/MSWE3.0 Manual-96.md`
+- 출력: `output/merged_all.md`
+- 이미지: `output/images/p01_Figure_0.jpeg` 형식으로 통합
+
+### 일부 페이지만 테스트
+
+`merge_markdown.py` 하단의 `__main__` 블록에서 `file_range` 지정:
+
+```python
+merge_markdown_files(
+    input_dir="output",
+    output_file="merged_test.md",
+    images_subdir="images",
+    file_range=(6, 8),   # 06~08 페이지만
+)
+```
+
+### 이미지 이름 규칙
+
+| 원본 | 변환 후 |
+|------|---------|
+| `MSWE3.0 Manual-06_images/_page_0_Figure_0.jpeg` | `images/p06_Figure_0.jpeg` |
+| `MSWE3.0 Manual-15_images/_page_0_Picture_12.jpeg` | `images/p15_Picture_12.jpeg` |
+
+- `_page_0_` 접두사 제거
+- 페이지 번호를 `p{NN}_` 형식으로 앞에 붙여 파일명 충돌 방지
--- a/extract_images.py
+++ b/extract_images.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Extract embedded images from PDF files
+"""
+
+import os
+import glob
+from pathlib import Path
+
+def extract_images_pypdfium2(pdf_path: str, output_dir: str = "output"):
+    """
+    Extract embedded images from a PDF with pypdfium2 and save them as PNG.
+
+    Images are written to ``<output_dir>/<pdf stem>_extracted_images`` as
+    ``page_<page>_img_<object index>.png`` (both numbers 1-based). Images
+    narrower or shorter than 50 pixels are skipped, and an image that cannot
+    be extracted only produces a warning.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+        output_dir: Folder under which the per-PDF image folder is created.
+
+    Returns:
+        True when the document was processed, even if it held no images;
+        False when anything outside the per-image handling failed (missing
+        dependency, unreadable PDF, ...).
+    """
+    try:
+        import pypdfium2 as pdfium
+        # Not referenced directly: imported so a missing Pillow fails here,
+        # before any work is done, and the caller sees False (presumably
+        # bitmap.to_pil() below depends on Pillow -- TODO confirm).
+        from PIL import Image  # noqa: F401
+
+        # The image-object constant is not exposed at the package top level
+        # in every pypdfium2 release; fall back to the raw bindings module.
+        image_type = getattr(pdfium, "FPDF_PAGEOBJ_IMAGE", None)
+        if image_type is None:
+            from pypdfium2 import raw as pdfium_raw
+            image_type = pdfium_raw.FPDF_PAGEOBJ_IMAGE
+
+        pdf_file = Path(pdf_path)
+        base_name = pdf_file.stem
+        images_dir = os.path.join(output_dir, f"{base_name}_extracted_images")
+        os.makedirs(images_dir, exist_ok=True)
+
+        print(f"\nExtracting images from {pdf_file.name}...")
+
+        pdf = pdfium.PdfDocument(pdf_path)
+        image_count = 0
+
+        try:
+            for page_num in range(len(pdf)):
+                page = pdf[page_num]
+
+                # Get images from page
+                for obj_index, obj in enumerate(page.get_objects()):
+                    if obj.type != image_type:
+                        continue
+                    try:
+                        # Extract image
+                        bitmap = obj.get_bitmap()
+                        pil_image = bitmap.to_pil()
+
+                        # Skip very small images (likely noise or artifacts)
+                        if pil_image.width < 50 or pil_image.height < 50:
+                            continue
+
+                        image_count += 1
+                        img_filename = f"page_{page_num + 1}_img_{obj_index + 1}.png"
+                        img_path = os.path.join(images_dir, img_filename)
+                        pil_image.save(img_path)
+                        print(f"  Saved: {img_filename} ({pil_image.width}x{pil_image.height})")
+
+                    except Exception as e:
+                        # One bad image must not stop the rest of the document.
+                        print(f"  Warning: Could not extract image {obj_index} from page {page_num + 1}: {e}")
+        finally:
+            # Release the document even if iterating the pages fails.
+            pdf.close()
+
+        if image_count > 0:
+            print(f"  OK Total {image_count} images extracted to: {images_dir}")
+        else:
+            print(f"  INFO: No images found in {pdf_file.name}")
+        # "No images" is still a successful run.
+        return True
+
+    except Exception as e:
+        print(f"  ERROR: Failed with pypdfium2: {e}")
+        return False
+
+
+def extract_images_pymupdf(pdf_path: str, output_dir: str = "output"):
+    """
+    Extract embedded images from a PDF with PyMuPDF (fitz) - fallback method.
+
+    Images are written to ``<output_dir>/<pdf stem>_extracted_images`` as
+    ``page_<page>_img_<index>.<ext>`` (both numbers 1-based), using the
+    extension PyMuPDF reports for each image. Payloads smaller than 1000
+    bytes are skipped, and an image that cannot be extracted only produces
+    a warning.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+        output_dir: Folder under which the per-PDF image folder is created.
+
+    Returns:
+        True when the document was processed, even if it held no images;
+        False when PyMuPDF is not installed or the run failed.
+    """
+    try:
+        import fitz  # PyMuPDF
+
+        pdf_file = Path(pdf_path)
+        base_name = pdf_file.stem
+        images_dir = os.path.join(output_dir, f"{base_name}_extracted_images")
+        os.makedirs(images_dir, exist_ok=True)
+
+        print(f"\nExtracting images from {pdf_file.name} using PyMuPDF...")
+
+        doc = fitz.open(pdf_path)
+        image_count = 0
+
+        try:
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                image_list = page.get_images(full=True)
+
+                for img_index, img_info in enumerate(image_list):
+                    # First entry of each image record is its xref number.
+                    xref = img_info[0]
+
+                    try:
+                        # Extract image
+                        base_image = doc.extract_image(xref)
+                        image_bytes = base_image["image"]
+                        image_ext = base_image["ext"]
+
+                        # Skip very small images
+                        if len(image_bytes) < 1000:  # Less than 1KB
+                            continue
+
+                        image_count += 1
+                        img_filename = f"page_{page_num + 1}_img_{img_index + 1}.{image_ext}"
+                        img_path = os.path.join(images_dir, img_filename)
+
+                        with open(img_path, "wb") as f:
+                            f.write(image_bytes)
+
+                        print(f"  Saved: {img_filename} ({len(image_bytes)} bytes)")
+
+                    except Exception as e:
+                        # One bad image must not stop the rest of the document.
+                        print(f"  Warning: Could not extract image {img_index} from page {page_num + 1}: {e}")
+        finally:
+            # Release the document even if iterating the pages fails.
+            doc.close()
+
+        if image_count > 0:
+            print(f"  OK Total {image_count} images extracted to: {images_dir}")
+        else:
+            print(f"  INFO: No images found in {pdf_file.name}")
+        # "No images" is still a successful run.
+        return True
+
+    except ImportError:
+        print("  ERROR: PyMuPDF not installed. Install with: pip install PyMuPDF")
+        return False
+    except Exception as e:
+        print(f"  ERROR: Failed with PyMuPDF: {e}")
+        return False
+
+
+def extract_images_from_pdf(pdf_path: str, output_dir: str = "output"):
+    """
+    Extract images from one PDF, trying pypdfium2 first and PyMuPDF second.
+
+    The PyMuPDF extractor runs only when the pypdfium2 extractor reports a
+    failure. Returns the result of whichever extractor ran last.
+    """
+    primary = extract_images_pypdfium2(pdf_path, output_dir)
+    if primary:
+        return primary
+
+    print("\nTrying PyMuPDF as fallback...")
+    return extract_images_pymupdf(pdf_path, output_dir)
+
+
+def extract_all_images(input_dir: str = "input", output_dir: str = "output"):
+    """
+    Run image extraction over every ``*.pdf`` directly under ``input_dir``.
+
+    Files are processed in sorted path order, and a success/failure summary
+    is printed at the end. Nothing is returned.
+    """
+    sources = sorted(glob.glob(os.path.join(input_dir, "*.pdf")))
+    if not sources:
+        print(f"No PDF files found in {input_dir}")
+        return
+
+    rule = "=" * 60
+    print(f"Found {len(sources)} PDF files")
+    print(rule)
+
+    # Outcome tally: True counts successes, False counts failures.
+    tally = {True: 0, False: 0}
+    for source in sources:
+        tally[bool(extract_images_from_pdf(source, output_dir))] += 1
+
+    print("\n" + rule)
+    print("Image extraction complete!")
+    print(f"  Successful: {tally[True]}")
+    print(f"  Failed: {tally[False]}")
+    print(f"  Total: {len(sources)}")
+
+
+if __name__ == "__main__":
+    # Script entry point: run with the defaults, i.e. read PDFs from "input"
+    # and write the extracted images under "output".
+    extract_all_images()
--- a/main.py
+++ b/main.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+"""
+docuConverter — 문서 → Markdown 변환 도구 모음
+
+지원 포맷:
+  PDF  → Markdown  (marker-pdf 기반, 이미지 유/무 선택)
+  EPUB → Markdown  (ebooklib + BeautifulSoup 기반)
+
+시나리오:
+  1. PDF 단일 변환 (이미지 포함, 고품질)
+  2. PDF 단일 변환 (텍스트 전용, 빠름)
+  3. PDF 배치 변환 (이미지 포함, 순차)
+  4. PDF 배치 변환 (텍스트 전용, 순차, 빠름)
+  5. PDF 배치 변환 (병렬 처리, 멀티코어)
+  6. EPUB 단일 변환
+  7. EPUB 배치 변환
+  8. 이미지만 추출 (PDF → 이미지 파일)
+  9. Markdown 병합 (output/ 폴더의 .md 파일들을 하나로)
+ 10. 이미지 경로 업데이트 (Markdown 내 이미지 링크 재연결)
+"""
+
+import os
+import sys
+import glob
+from pathlib import Path
+
+
+# ─── 시나리오 함수들 ──────────────────────────────────────────────────────────
+
+def scenario_pdf_single_with_images():
+    """PDF 단일 변환 — 이미지 포함 (고품질, 느림)"""
+    from convert_with_cropped_images import convert_pdf_with_cropped_images
+
+    pdf_path = input("변환할 PDF 경로를 입력하세요: ").strip()
+    if not pdf_path:
+        pdf_files = sorted(glob.glob("input/*.pdf"))
+        if not pdf_files:
+            print("ERROR: input/ 폴더에 PDF 파일이 없습니다.")
+            return
+        pdf_path = pdf_files[0]
+        print(f"  → 자동 선택: {pdf_path}")
+
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    convert_pdf_with_cropped_images(pdf_path, output_dir)
+
+
+def scenario_pdf_single_fast():
+    """PDF 단일 변환 — 텍스트 전용 (빠름)"""
+    from convert_pdfs_fast import convert_pdf_to_markdown_fast
+
+    pdf_path = input("변환할 PDF 경로를 입력하세요: ").strip()
+    if not pdf_path:
+        pdf_files = sorted(glob.glob("input/*.pdf"))
+        if not pdf_files:
+            print("ERROR: input/ 폴더에 PDF 파일이 없습니다.")
+            return
+        pdf_path = pdf_files[0]
+        print(f"  → 자동 선택: {pdf_path}")
+
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    convert_pdf_to_markdown_fast(pdf_path, output_dir)
+
+
+def scenario_pdf_batch_with_images():
+    """PDF 배치 변환 — 이미지 포함 (순차, input/ → output/)"""
+    from convert_with_cropped_images import convert_all_pdfs
+
+    input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    convert_all_pdfs(input_dir, output_dir)
+
+
+def scenario_pdf_batch_fast():
+    """PDF 배치 변환 — 텍스트 전용 (순차, 빠름)"""
+    from convert_pdfs_fast import convert_all_pdfs_fast
+
+    input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    convert_all_pdfs_fast(input_dir, output_dir)
+
+
+def scenario_pdf_batch_parallel():
+    """PDF 배치 변환 — 병렬 처리 (멀티코어)"""
+    from convert_pdfs_parallel import convert_all_pdfs_parallel
+    import multiprocessing
+
+    input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    cpu_count = multiprocessing.cpu_count()
+    workers_input = input(f"병렬 워커 수 [기본: 2, CPU: {cpu_count}]: ").strip()
+    max_workers = int(workers_input) if workers_input.isdigit() else 2
+    convert_all_pdfs_parallel(input_dir, output_dir, max_workers)
+
+
+def scenario_epub_single():
+    """EPUB 단일 변환 → Markdown"""
+    from convert_epub import convert_epub_to_markdown
+
+    epub_path = input("변환할 EPUB 경로를 입력하세요: ").strip()
+    if not epub_path:
+        epub_files = sorted(glob.glob("input/*.epub"))
+        if not epub_files:
+            print("ERROR: input/ 폴더에 EPUB 파일이 없습니다.")
+            return
+        epub_path = epub_files[0]
+        print(f"  → 자동 선택: {epub_path}")
+
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+    convert_epub_to_markdown(epub_path, output_dir)
+
+
+def scenario_epub_batch():
+    """EPUB 배치 변환 — input/ 폴더의 모든 .epub 파일"""
+    from convert_epub import convert_epub_to_markdown
+
+    input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+
+    epub_files = sorted(glob.glob(os.path.join(input_dir, "*.epub")))
+    if not epub_files:
+        print(f"ERROR: {input_dir}/ 폴더에 EPUB 파일이 없습니다.")
+        return
+
+    print(f"Found {len(epub_files)} EPUB file(s)")
+    print("=" * 60)
+    successful = 0
+    failed = 0
+    for i, epub_file in enumerate(epub_files, 1):
+        print(f"\n[{i}/{len(epub_files)}] {Path(epub_file).name}")
+        try:
+            convert_epub_to_markdown(epub_file, output_dir)
+            successful += 1
+        except Exception as e:
+            print(f"  ERROR: {e}")
+            failed += 1
+
+    print("\n" + "=" * 60)
+    print(f"Conversion complete! Successful: {successful}, Failed: {failed}")
+
+
+def scenario_extract_images():
+    """PDF에서 이미지만 추출 (Markdown 변환 없음)"""
+    from extract_images import extract_all_images, extract_images_from_pdf
+
+    mode = input("모드 선택 — [1] 단일 파일  [2] 배치 (input/ 폴더): ").strip()
+    output_dir = input("출력 폴더 [기본: output]: ").strip() or "output"
+
+    if mode == "1":
+        pdf_path = input("PDF 경로를 입력하세요: ").strip()
+        if not pdf_path:
+            print("ERROR: 경로가 비어 있습니다.")
+            return
+        extract_images_from_pdf(pdf_path, output_dir)
+    else:
+        input_dir = input("입력 폴더 [기본: input]: ").strip() or "input"
+        extract_all_images(input_dir, output_dir)
+
+
+def scenario_merge_markdown():
+    """output/ 폴더의 .md 파일들을 하나의 파일로 병합"""
+    from merge_markdown import merge_markdown_files
+
+    input_dir = input("병합할 Markdown 폴더 [기본: output]: ").strip() or "output"
+    output_file = input("병합 결과 파일명 [기본: merged_all.md]: ").strip() or "merged_all.md"
+    separator_choice = input("구분자 — [1] 수평선 (---) [2] 빈줄만: ").strip()
+    separator = "\n\n---\n\n" if separator_choice != "2" else "\n\n"
+    merge_markdown_files(input_dir, output_file, separator)
+
+
+def scenario_update_image_paths():
+    """Markdown 내 이미지 경로를 추출된 실제 이미지 경로로 업데이트"""
+    from update_image_paths import update_all_markdown_files
+
+    output_dir = input("Markdown 폴더 [기본: output]: ").strip() or "output"
+    update_all_markdown_files(output_dir)
+
+
+# ─── 메뉴 ────────────────────────────────────────────────────────────────────
+
+# Menu table: (label shown to the user, function to run). The 1-based position
+# in this list is the scenario number used by both the interactive menu and
+# the CLI, so reordering entries changes the numbers users type.
+SCENARIOS = [
+    ("PDF 단일 변환 (이미지 포함, 고품질)",        scenario_pdf_single_with_images),
+    ("PDF 단일 변환 (텍스트 전용, 빠름)",          scenario_pdf_single_fast),
+    ("PDF 배치 변환 (이미지 포함, 순차)",          scenario_pdf_batch_with_images),
+    ("PDF 배치 변환 (텍스트 전용, 순차, 빠름)",    scenario_pdf_batch_fast),
+    ("PDF 배치 변환 (병렬 처리, 멀티코어)",        scenario_pdf_batch_parallel),
+    ("EPUB 단일 변환 → Markdown",                 scenario_epub_single),
+    ("EPUB 배치 변환 (input/ 폴더 전체)",          scenario_epub_batch),
+    ("이미지만 추출 (PDF → 이미지 파일)",          scenario_extract_images),
+    ("Markdown 파일 병합 (여러 .md → 하나로)",     scenario_merge_markdown),
+    ("이미지 경로 업데이트 (Markdown 링크 수정)",  scenario_update_image_paths),
+]
+
+
+def print_menu():
+    print("\n" + "=" * 60)
+    print("  docuConverter — 문서 → Markdown 변환 도구")
+    print("=" * 60)
+    for i, (label, _) in enumerate(SCENARIOS, 1):
+        print(f"  {i:2}. {label}")
+    print("   0. 종료")
+    print("=" * 60)
+
+
+def run_interactive():
+    """대화형 메뉴 실행"""
+    while True:
+        print_menu()
+        choice = input("시나리오 번호를 선택하세요: ").strip()
+
+        if choice == "0":
+            print("종료합니다.")
+            break
+
+        if not choice.isdigit() or not (1 <= int(choice) <= len(SCENARIOS)):
+            print("잘못된 입력입니다. 다시 선택하세요.")
+            continue
+
+        idx = int(choice) - 1
+        label, fn = SCENARIOS[idx]
+        print(f"\n▶ {label}")
+        print("-" * 60)
+        try:
+            fn()
+        except KeyboardInterrupt:
+            print("\n중단되었습니다.")
+        except Exception as e:
+            print(f"\nERROR: {e}")
+            import traceback
+            traceback.print_exc()
+
+        input("\n[Enter] 키를 누르면 메뉴로 돌아갑니다...")
+
+
+def run_cli(args):
+    """CLI 직접 실행 모드 (비대화형)
+
+    사용 예:
+      python main.py 1 path/to/file.pdf output/
+      python main.py 4 input/ output/
+      python main.py 6 path/to/book.epub output/
+    """
+    if not args:
+        run_interactive()
+        return
+
+    scenario_num = args[0]
+    if not scenario_num.isdigit() or not (1 <= int(scenario_num) <= len(SCENARIOS)):
+        print(f"ERROR: 시나리오 번호는 1~{len(SCENARIOS)} 사이여야 합니다.")
+        sys.exit(1)
+
+    idx = int(scenario_num) - 1
+    label, fn = SCENARIOS[idx]
+    print(f"▶ {label}")
+
+    # 인자를 stdin 처럼 흉내 내어 input() 호출을 우회
+    # 직접 함수를 시나리오별로 호출
+    extra = args[1:]
+
+    if idx == 0:  # PDF 단일, 이미지 포함
+        from convert_with_cropped_images import convert_pdf_with_cropped_images
+        pdf_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.pdf"))[0]
+        out = extra[1] if len(extra) > 1 else "output"
+        convert_pdf_with_cropped_images(pdf_path, out)
+
+    elif idx == 1:  # PDF 단일, fast
+        from convert_pdfs_fast import convert_pdf_to_markdown_fast
+        pdf_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.pdf"))[0]
+        out = extra[1] if len(extra) > 1 else "output"
+        convert_pdf_to_markdown_fast(pdf_path, out)
+
+    elif idx == 2:  # PDF 배치, 이미지 포함
+        from convert_with_cropped_images import convert_all_pdfs
+        inp = extra[0] if len(extra) > 0 else "input"
+        out = extra[1] if len(extra) > 1 else "output"
+        convert_all_pdfs(inp, out)
+
+    elif idx == 3:  # PDF 배치, fast
+        from convert_pdfs_fast import convert_all_pdfs_fast
+        inp = extra[0] if len(extra) > 0 else "input"
+        out = extra[1] if len(extra) > 1 else "output"
+        convert_all_pdfs_fast(inp, out)
+
+    elif idx == 4:  # PDF 배치, 병렬
+        from convert_pdfs_parallel import convert_all_pdfs_parallel
+        inp = extra[0] if len(extra) > 0 else "input"
+        out = extra[1] if len(extra) > 1 else "output"
+        workers = int(extra[2]) if len(extra) > 2 else 2
+        convert_all_pdfs_parallel(inp, out, workers)
+
+    elif idx == 5:  # EPUB 단일
+        from convert_epub import convert_epub_to_markdown
+        epub_path = extra[0] if len(extra) > 0 else sorted(glob.glob("input/*.epub"))[0]
+        out = extra[1] if len(extra) > 1 else "output"
+        convert_epub_to_markdown(epub_path, out)
+
+    elif idx == 6:  # EPUB 배치
+        from convert_epub import convert_epub_to_markdown
+        inp = extra[0] if len(extra) > 0 else "input"
+        out = extra[1] if len(extra) > 1 else "output"
+        for ep in sorted(glob.glob(os.path.join(inp, "*.epub"))):
+            print(f"\n→ {Path(ep).name}")
+            convert_epub_to_markdown(ep, out)
+
+    elif idx == 7:  # 이미지 추출
+        from extract_images import extract_all_images
+        inp = extra[0] if len(extra) > 0 else "input"
+        out = extra[1] if len(extra) > 1 else "output"
+        extract_all_images(inp, out)
+
+    elif idx == 8:  # Markdown 병합
+        from merge_markdown import merge_markdown_files
+        inp = extra[0] if len(extra) > 0 else "output"
+        out_file = extra[1] if len(extra) > 1 else "merged_all.md"
+        merge_markdown_files(inp, out_file)
+
+    elif idx == 9:  # 이미지 경로 업데이트
+        from update_image_paths import update_all_markdown_files
+        out = extra[0] if len(extra) > 0 else "output"
+        update_all_markdown_files(out)
+
+
+if __name__ == "__main__":
+    # Use the script's own folder as the working directory so the relative
+    # input/ and output/ paths resolve the same way wherever it is launched from.
+    script_dir = Path(__file__).parent
+    os.chdir(script_dir)
+
+    run_cli(sys.argv[1:])
--- a/merge_markdown.py
+++ b/merge_markdown.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+Merge multiple Markdown files into a single file,
+consolidating all images into a single images/ folder with unique names.
+
+Image rename rule (prefix = "p" + the page digits of the "-NN.md" file name, e.g. Manual-06.md):
+  {stem}_images/_page_0_Figure_3.jpeg  ->  images/p06_Figure_3.jpeg
+  {stem}_images/_page_0_Picture_12.jpeg -> images/p06_Picture_12.jpeg
+"""
+
+import os
+import re
+import glob
+import shutil
+from pathlib import Path
+
+
+def merge_markdown_files(
+    input_dir: str = "output",
+    output_file: str = "merged_all.md",
+    images_subdir: str = "images",
+    file_range: tuple = None,  # e.g. (6, 8) to process only pages 06~08
+):
+    md_pattern = os.path.join(input_dir, "*.md")
+    all_md_files = sorted(glob.glob(md_pattern))
+
+    # Only include files matching Manual-NN pattern (skip merged outputs)
+    all_md_files = [f for f in all_md_files if re.search(r'-\d+\.md$', f)]
+
+    # Filter by page number range if given
+    if file_range:
+        start, end = file_range
+        md_files = []
+        for f in all_md_files:
+            m = re.search(r'-(\d+)\.md$', f)
+            if m and start <= int(m.group(1)) <= end:
+                md_files.append(f)
+    else:
+        md_files = all_md_files
+
+    if not md_files:
+        print(f"No markdown files found in {input_dir}")
+        return
+
+    print(f"Files to merge: {len(md_files)}")
+    for f in md_files:
+        print(f"  {Path(f).name}")
+    print("=" * 60)
+
+    # Create unified images directory
+    unified_images_path = os.path.join(input_dir, images_subdir)
+    os.makedirs(unified_images_path, exist_ok=True)
+
+    merged_content = []
+
+    for md_file in md_files:
+        file_path = Path(md_file)
+
+        # Extract zero-padded page number from filename (e.g. "06" from "Manual-06.md")
+        m = re.search(r'-(\d+)\.md$', str(file_path))
+        page_num = m.group(1) if m else "000"
+
+        print(f"Processing [{page_num}] {file_path.name} ...")
+
+        with open(md_file, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Replace each image reference
+        def replace_image(match):
+            alt = match.group(1)
+            old_path = match.group(2)
+
+            # Decode %20 → space for filesystem access
+            old_path_decoded = old_path.replace("%20", " ")
+
+            # Filename only: _page_0_Figure_3.jpeg
+            img_filename = Path(old_path_decoded).name
+
+            # Strip leading _page_N_ to get: Figure_3.jpeg  or  Picture_12.jpeg
+            clean_name = re.sub(r'^_page_\d+_', '', img_filename)
+
+            # New unique name: p006_Figure_3.jpeg
+            new_name = f"p{page_num}_{clean_name}"
+
+            # Copy image to unified folder
+            src = os.path.join(input_dir, old_path_decoded)
+            dst = os.path.join(unified_images_path, new_name)
+            if os.path.exists(src):
+                shutil.copy2(src, dst)
+            else:
+                print(f"  WARNING: image not found: {src}")
+
+            return f"![{alt}]({images_subdir}/{new_name})"
+
+        new_content = re.sub(r'!\[([^\]]*)\]\(([^)]+)\)', replace_image, content)
+        merged_content.append(new_content)
+
+    if not merged_content:
+        print("No content to merge")
+        return
+
+    final_content = "\n\n---\n\n".join(merged_content)
+
+    output_path = os.path.join(input_dir, output_file)
+    with open(output_path, "w", encoding="utf-8") as f:
+        f.write(final_content)
+
+    print("\n" + "=" * 60)
+    print(f"SUCCESS: {output_path}")
+    print(f"  Files merged : {len(merged_content)}")
+    print(f"  Total chars  : {len(final_content):,}")
+
+
+# Script entry point: merge every page file of "output" into
+# "output/merged_all.md", collecting the images under "output/images".
+if __name__ == "__main__":
+    merge_markdown_files(
+        input_dir="output",
+        output_file="merged_all.md",
+        images_subdir="images",
+    )
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,96 @@
+annotated-types==0.7.0
+anthropic==0.46.0
+anyio==4.12.1
+beautifulsoup4==4.14.3
+brotli==1.2.0
+certifi==2026.1.4
+cffi==2.0.0
+cfgv==3.5.0
+charset-normalizer==3.4.4
+click==8.3.1
+cobble==0.1.4
+colorama==0.4.6
+cssselect2==0.8.0
+distlib==0.4.0
+distro==1.9.0
+EbookLib==0.18
+einops==0.8.1
+et_xmlfile==2.0.0
+filelock==3.20.3
+filetype==1.2.0
+fonttools==4.61.1
+fsspec==2026.1.0
+ftfy==6.3.1
+google-auth==2.47.0
+google-genai==1.59.0
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.36.0
+identify==2.6.16
+idna==3.11
+Jinja2==3.1.6
+jiter==0.12.0
+joblib==1.5.3
+lxml==6.0.2
+mammoth==1.11.0
+markdown2==2.5.4
+markdownify==1.2.2
+marker-pdf==1.10.1
+MarkupSafe==3.0.3
+mpmath==1.3.0
+networkx==3.6.1
+nodeenv==1.10.0
+numpy==2.4.1
+openai==1.109.1
+opencv-python-headless==4.11.0.86
+openpyxl==3.1.5
+packaging==25.0
+pdftext==0.6.3
+pillow==10.4.0
+platformdirs==4.5.1
+pre_commit==4.5.1
+psutil==7.2.1
+pyasn1==0.6.2
+pyasn1_modules==0.4.2
+pycparser==2.23
+pydantic==2.12.5
+pydantic-settings==2.12.0
+pydantic_core==2.41.5
+pydyf==0.12.1
+PyMuPDF==1.26.7
+pypdfium2==4.30.0
+pyphen==0.17.2
+python-dotenv==1.2.1
+python-pptx==1.0.2
+PyYAML==6.0.3
+RapidFuzz==3.14.3
+regex==2024.11.6
+requests==2.32.5
+rsa==4.9.1
+safetensors==0.7.0
+scikit-learn==1.8.0
+scipy==1.17.0
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.8.1
+surya-ocr==0.17.0
+sympy==1.14.0
+tenacity==9.1.2
+threadpoolctl==3.6.0
+tinycss2==1.5.1
+tinyhtml5==2.0.0
+tokenizers==0.22.2
+torch==2.9.1
+tqdm==4.67.1
+transformers==4.57.6
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+virtualenv==20.36.1
+wcwidth==0.2.14
+weasyprint==63.1
+webencodings==0.5.1
+websockets==15.0.1
+xlsxwriter==3.2.9
+zopfli==0.4.0
--- a/update_image_paths.py
+++ b/update_image_paths.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Update image paths in markdown files to point to extracted images
+"""
+
+import os
+import re
+import glob
+from pathlib import Path
+
+
+def update_markdown_image_paths(md_path: str, output_dir: str = "output"):
+    """
+    Update image paths in markdown file to point to extracted images
+    """
+    md_file = Path(md_path)
+    base_name = md_file.stem
+
+    # Path to extracted images folder
+    extracted_images_dir = f"{base_name}_extracted_images"
+
+    # Check if extracted images folder exists
+    extracted_images_path = os.path.join(output_dir, extracted_images_dir)
+    if not os.path.exists(extracted_images_path):
+        print(f"No extracted images folder found: {extracted_images_path}")
+        return False
+
+    # Read markdown content
+    with open(md_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    original_content = content
+
+    # Pattern to match image references like ![](_page_1_Figure_1.jpeg)
+    # Replace with actual extracted images
+    def replace_image_path(match):
+        old_path = match.group(1)
+
+        # Extract page number from old path (e.g., _page_1_Figure_1.jpeg -> page 1)
+        page_match = re.search(r'_page_(\d+)_', old_path)
+        if page_match:
+            page_num = page_match.group(1)
+            # Map to extracted image: page_1_img_1.png
+            new_path = f"{extracted_images_dir}/page_{page_num}_img_1.png"
+            return f'![]({new_path})'
+
+        return match.group(0)  # Return original if no match
+
+    # Replace all image paths
+    content = re.sub(r'\!\[\]\(([^)]+\.jpeg)\)', replace_image_path, content)
+
+    if content == original_content:
+        print(f"No changes needed for {md_file.name}")
+        return True
+
+    # Save updated markdown
+    output_path = md_path.replace('.md', '_updated.md')
+    with open(output_path, 'w', encoding='utf-8') as f:
+        f.write(content)
+
+    print(f"Updated markdown saved to: {output_path}")
+
+    # Count replacements
+    old_count = len(re.findall(r'\!\[\]\([^)]+\.jpeg\)', original_content))
+    new_count = len(re.findall(r'\!\[\]\([^)]+\.png\)', content))
+    print(f"  Replaced {new_count} image paths (out of {old_count} references)")
+
+    return True
+
+
+def update_all_markdown_files(output_dir: str = "output"):
+    """
+    Update image paths in all markdown files
+    """
+    md_pattern = os.path.join(output_dir, "*.md")
+    md_files = [f for f in glob.glob(md_pattern) if not f.endswith('_updated.md')]
+
+    if not md_files:
+        print(f"No markdown files found in {output_dir}")
+        return
+
+    print(f"Found {len(md_files)} markdown files")
+    print("=" * 60)
+
+    for md_file in md_files:
+        update_markdown_image_paths(md_file, output_dir)
+        print()
+
+    print("=" * 60)
+    print("Done!")
+
+
+# Script entry point: process every Markdown file of the default "output" folder.
+if __name__ == "__main__":
+    update_all_markdown_files()