feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
205
convert_epub.py
Normal file
205
convert_epub.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EPUB to Markdown converter using ebooklib and html2text
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def html_to_markdown(soup):
    """Convert a BeautifulSoup HTML tree to Markdown text.

    Args:
        soup: Parsed ``BeautifulSoup`` document (or fragment).

    Returns:
        Markdown string; runs of 3+ newlines are collapsed to 2 and
        leading/trailing whitespace is stripped.
    """
    # Local import: the rest of the module already depends on bs4; importing
    # here keeps the dependency next to the isinstance checks that need it.
    from bs4 import Comment, Doctype

    def process_element(element):
        # --- Text nodes -------------------------------------------------
        # NavigableString subclasses str, so this branch catches text nodes.
        if isinstance(element, str):
            # Comment/Doctype also subclass NavigableString; without this
            # check HTML comments would leak verbatim into the output.
            if isinstance(element, (Comment, Doctype)):
                return ""
            # Collapse internal whitespace but keep a single boundary space
            # so "Hello <b>world</b>" becomes "Hello **world**" rather than
            # "Hello**world**" (a plain .strip() glued the words together).
            collapsed = re.sub(r'\s+', ' ', element)
            if not collapsed.strip():
                return ""
            return collapsed

        tag = element.name

        # Headers: h1..h6 -> '#' * level
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level_num = int(tag[1])
            text = element.get_text().strip()
            return '\n' + '#' * level_num + ' ' + text + '\n'

        # Paragraphs
        elif tag == 'p':
            text = ''.join(process_element(child) for child in element.children)
            return '\n' + text.strip() + '\n'

        # Line breaks
        elif tag == 'br':
            return '\n'

        # Bold
        elif tag in ['strong', 'b']:
            text = ''.join(process_element(child) for child in element.children)
            return '**' + text.strip() + '**'

        # Italic
        elif tag in ['em', 'i']:
            text = ''.join(process_element(child) for child in element.children)
            return '*' + text.strip() + '*'

        # Links
        elif tag == 'a':
            text = ''.join(process_element(child) for child in element.children)
            href = element.get('href', '')
            if href:
                return f'[{text.strip()}]({href})'
            return text.strip()

        # Images — emit markdown image syntax (the src/alt lookups were
        # previously dead code: the function returned an empty f-string).
        # NOTE(review): src is kept as-is and is NOT remapped to the
        # "<book>_images" directory written by convert_epub_to_markdown;
        # confirm whether links should be rewritten to that folder.
        elif tag == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Unordered lists
        elif tag == 'ul':
            items = []
            for li in element.find_all('li', recursive=False):
                text = ''.join(process_element(child) for child in li.children)
                items.append('- ' + text.strip())
            return '\n' + '\n'.join(items) + '\n'

        # Ordered lists
        elif tag == 'ol':
            items = []
            for i, li in enumerate(element.find_all('li', recursive=False), 1):
                text = ''.join(process_element(child) for child in li.children)
                items.append(f'{i}. ' + text.strip())
            return '\n' + '\n'.join(items) + '\n'

        # Blockquote — prefix every line with '> '
        elif tag == 'blockquote':
            text = ''.join(process_element(child) for child in element.children)
            lines = text.strip().split('\n')
            return '\n' + '\n'.join('> ' + line for line in lines) + '\n'

        # Inline code
        elif tag == 'code':
            text = element.get_text()
            return '`' + text + '`'

        # Code blocks (language is unknown, so the fence is unlabelled)
        elif tag == 'pre':
            text = element.get_text()
            return '\n```\n' + text + '\n```\n'

        # Containers and anything unrecognised: recurse into children
        else:
            return ''.join(process_element(child) for child in element.children)

    # Prefer <body> when present; fall back to the whole document.
    body = soup.find('body') or soup
    markdown = process_element(body)

    # Clean up runs of blank lines left by nested block elements.
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    return markdown.strip()
|
||||
|
||||
|
||||
def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
    """Convert an EPUB file to a single Markdown document.

    Produces, inside *output_dir*:
      - ``<stem>.md``            chapters joined by ``---`` rules
      - ``<stem>_images/``       every image found in the EPUB (if any)
      - ``<stem>_metadata.json`` Dublin Core metadata (if any is present)

    Args:
        epub_path: Path to the EPUB file.
        output_dir: Directory to save the output (default: "output").
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    base_name = Path(epub_path).stem

    print(f"Converting {epub_path} to Markdown...")

    book = epub.read_epub(epub_path)

    chapters = []
    images = {}
    image_counter = 0

    # Walk every item once, collecting document text and image payloads.
    for item in book.get_items():
        item_kind = item.get_type()

        if item_kind == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.get_content().decode('utf-8'), 'html.parser')
            chapter_md = html_to_markdown(soup).strip()
            if chapter_md:
                chapters.append(chapter_md)

        elif item_kind == ebooklib.ITEM_IMAGE:
            image_counter += 1
            img_name = item.get_name().split('/')[-1]
            if not img_name:
                # Nameless item: synthesise a name from the media type.
                img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"
            images[img_name] = item.get_content()

    # Write the combined markdown, one horizontal rule between chapters.
    output_path = out_dir / f"{base_name}.md"
    output_path.write_text("\n\n---\n\n".join(chapters), encoding="utf-8")

    print("OK Conversion complete!")
    print(f"OK Output saved to: {output_path}")
    print(f"OK Total chapters: {len(chapters)}")

    # Dump image payloads alongside the markdown.
    if images:
        images_dir = out_dir / f"{base_name}_images"
        images_dir.mkdir(exist_ok=True)
        for img_name, img_data in images.items():
            (images_dir / img_name).write_bytes(img_data)
        print(f"OK {len(images)} images saved to: {images_dir}")

    # Dublin Core metadata, serialised only when at least one field exists.
    metadata = {
        field: book.get_metadata('DC', field)
        for field in ('title', 'creator', 'language', 'publisher', 'description')
    }

    if any(metadata.values()):
        metadata_path = out_dir / f"{base_name}_metadata.json"
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"OK Metadata saved to: {metadata_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the EPUB bundled in the input directory.
    source_epub = "input/the-art-of-spending-money.epub"
    convert_epub_to_markdown(source_epub)
|
||||
Reference in New Issue
Block a user