Files
documan/convert_epub.py
minsung 8d4339302e feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
  - 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
  - file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가

소요 시간: 10분 | Context: input 18k / output 2k tokens

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 11:00:28 +09:00

206 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
EPUB to Markdown converter using ebooklib and BeautifulSoup
"""
import os
import json
import re
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
def html_to_markdown(soup):
    """Convert a parsed HTML tree to Markdown text.

    Renders the ``<body>`` element of *soup* (or *soup* itself when it has no
    ``<body>``). Headings, paragraphs, line breaks, bold/italic, links,
    images, ordered/unordered lists, blockquotes and code are mapped to their
    Markdown forms; ``<script>`` and ``<style>`` are dropped; every other tag
    contributes only the rendering of its children.

    Args:
        soup: A BeautifulSoup document or tag. Only ``name``, ``children``,
            ``get``, ``get_text``, ``find`` and ``find_all`` are used.

    Returns:
        The Markdown string, with runs of three or more newlines collapsed to
        one blank line and leading/trailing whitespace removed.
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    skipped_tags = ('script', 'style')

    def render_children(node):
        """Render and join the children of *node*.

        Spaces that end up directly before or after a line break are dropped,
        so the whitespace-only text nodes that sit between block elements do
        not leave stray spaces on otherwise empty lines.
        """
        pieces = []
        for child in node.children:
            piece = process_element(child)
            if pieces and pieces[-1].endswith('\n'):
                piece = piece.lstrip(' ')
            if not piece:
                continue
            if piece.startswith('\n'):
                # Trim trailing spaces from what was emitted so far.
                while pieces:
                    trimmed = pieces[-1].rstrip(' ')
                    if trimmed:
                        pieces[-1] = trimmed
                        break
                    pieces.pop()
            pieces.append(piece)
        return ''.join(pieces)

    def wrap_inline(text, marker):
        """Wrap *text* in *marker*, moving edge whitespace outside the markers.

        Markdown emphasis markers must touch the text they wrap, and an empty
        element must not emit bare markers (``**`` from an empty ``<i>`` would
        otherwise open a bold run).
        """
        core = text.strip()
        if not core:
            return ' ' if text else ''
        lead = ' ' if text[0].isspace() else ''
        trail = ' ' if text[-1].isspace() else ''
        return f'{lead}{marker}{core}{marker}{trail}'

    def render_list(node, marker_for):
        """Render the direct ``<li>`` children of *node* as list items.

        Continuation lines of an item (for example a nested list) are indented
        by the marker width so they stay attached to that item.
        """
        items = []
        for index, li in enumerate(node.find_all('li', recursive=False), 1):
            marker = marker_for(index)
            indent = ' ' * len(marker)
            first, *rest = render_children(li).strip().split('\n')
            rest = [indent + line if line else line for line in rest]
            items.append(marker + '\n'.join([first, *rest]))
        return '\n' + '\n'.join(items) + '\n'

    def process_element(element):
        """Return the Markdown rendering of one node (text node or tag)."""
        if isinstance(element, str):
            # Collapse ASCII whitespace the way HTML rendering does, but keep
            # a single space at the edges so words next to inline tags do not
            # run together.
            # NOTE(review): bs4 Comment/Doctype nodes are presumably str
            # subclasses as well and would be emitted as text here - confirm
            # whether they need to be filtered.
            return re.sub(r'[ \t\n\r\f]+', ' ', element)

        tag = element.name

        if tag in skipped_tags:
            return ''

        # Headers
        if tag in heading_tags:
            # A heading must stay on one line, so fold all whitespace.
            title = ' '.join(element.get_text().split())
            return '\n' + '#' * int(tag[1]) + ' ' + title + '\n'

        # Paragraphs
        if tag == 'p':
            return '\n' + render_children(element).strip() + '\n'

        # Line breaks
        if tag == 'br':
            return '\n'

        # Bold
        if tag in ('strong', 'b'):
            return wrap_inline(render_children(element), '**')

        # Italic
        if tag in ('em', 'i'):
            return wrap_inline(render_children(element), '*')

        # Links
        if tag == 'a':
            label = render_children(element).strip()
            href = element.get('href', '')
            return f'[{label}]({href})' if href else label

        # Images
        if tag == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Lists
        if tag == 'ul':
            return render_list(element, lambda _index: '- ')
        if tag == 'ol':
            return render_list(element, lambda index: f'{index}. ')

        # Blockquote
        if tag == 'blockquote':
            lines = render_children(element).strip().split('\n')
            # rstrip so blank lines become '>' rather than '> '.
            quoted = (('> ' + line).rstrip() for line in lines)
            return '\n' + '\n'.join(quoted) + '\n'

        # Code: use the raw text so whitespace inside code is preserved.
        if tag == 'code':
            return '`' + element.get_text() + '`'
        if tag == 'pre':
            return '\n```\n' + element.get_text() + '\n```\n'

        # div, span, section, article and any unknown tag: children only.
        return render_children(element)

    # Process body or entire soup
    body = soup.find('body')
    root = body if body is not None else soup
    markdown = process_element(root)

    # Clean up multiple newlines
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)
    return markdown.strip()
def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
    """Convert an EPUB file to one Markdown file plus extracted assets.

    Writes ``<stem>.md`` into *output_dir*, where ``<stem>`` is the EPUB file
    name without its extension. Image items are written to
    ``<stem>_images/``. If any of the queried Dublin Core fields (title,
    creator, language, publisher, description) is non-empty,
    ``<stem>_metadata.json`` is written as well. Progress is reported with
    ``print``.

    Args:
        epub_path: Path to the EPUB file
        output_dir: Directory to save the output (default: "output")
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Base filename without extension, used to name every output artifact
    base_name = Path(epub_path).stem

    print(f"Converting {epub_path} to Markdown...")

    # Read the EPUB file
    book = epub.read_epub(epub_path)

    chapters = []
    images = {}
    image_counter = 0

    # NOTE(review): documents are emitted in the order get_items() yields
    # them; confirm that this matches the book's reading (spine) order.
    for item in book.get_items():
        item_type = item.get_type()

        if item_type == ebooklib.ITEM_DOCUMENT:
            html_content = item.get_content().decode('utf-8')
            soup = BeautifulSoup(html_content, 'html.parser')
            markdown_content = html_to_markdown(soup).strip()
            # Skip documents that render to nothing.
            if markdown_content:
                chapters.append(markdown_content)

        elif item_type == ebooklib.ITEM_IMAGE:
            image_counter += 1
            img_name = item.get_name().split('/')[-1]
            # An empty basename, '.' or '..' cannot be used as a file name;
            # fall back to a generated one based on the media subtype.
            if img_name in ('', '.', '..'):
                img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"
            # Images are flattened into one folder, so two items with the same
            # basename would overwrite each other. Keep the first name as is
            # and give later duplicates a numeric suffix.
            if img_name in images:
                stem, ext = os.path.splitext(img_name)
                suffix = 2
                while f"{stem}_{suffix}{ext}" in images:
                    suffix += 1
                img_name = f"{stem}_{suffix}{ext}"
            images[img_name] = item.get_content()

    # Combine all chapters, separated by a horizontal rule
    full_markdown = "\n\n---\n\n".join(chapters)

    # Save as markdown
    output_path = os.path.join(output_dir, f"{base_name}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(full_markdown)

    print("OK Conversion complete!")
    print(f"OK Output saved to: {output_path}")
    print(f"OK Total chapters: {len(chapters)}")

    # Save images if any
    if images:
        images_dir = os.path.join(output_dir, f"{base_name}_images")
        os.makedirs(images_dir, exist_ok=True)

        for img_name, img_data in images.items():
            img_path = os.path.join(images_dir, img_name)
            with open(img_path, "wb") as f:
                f.write(img_data)

        print(f"OK {len(images)} images saved to: {images_dir}")

    # Save metadata if available
    metadata = {
        'title': book.get_metadata('DC', 'title'),
        'creator': book.get_metadata('DC', 'creator'),
        'language': book.get_metadata('DC', 'language'),
        'publisher': book.get_metadata('DC', 'publisher'),
        'description': book.get_metadata('DC', 'description'),
    }

    if any(metadata.values()):
        metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"OK Metadata saved to: {metadata_path}")
if __name__ == "__main__":
    import sys

    # Usage: convert_epub.py [EPUB_PATH [OUTPUT_DIR]]
    # With no arguments this converts the bundled sample in the input
    # directory into "output", exactly as before.
    epub_path = sys.argv[1] if len(sys.argv) > 1 else "input/the-art-of-spending-money.epub"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "output"
    convert_epub_to_markdown(epub_path, output_dir)