- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
206 lines
6.4 KiB
Python
206 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
EPUB to Markdown converter using ebooklib and BeautifulSoup
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
import ebooklib
|
|
from ebooklib import epub
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def html_to_markdown(soup):
    """Convert a parsed HTML tree to Markdown text.

    Handles headings, paragraphs, line breaks, bold/italic, links, images,
    ordered/unordered lists, blockquotes, inline code and preformatted
    blocks. Any other tag is transparent: only its children are rendered.
    ``<script>`` and ``<style>`` subtrees are dropped.

    Args:
        soup: A BeautifulSoup document or tag. If it contains a ``<body>``
            element only that element is converted; otherwise the whole
            tree is converted.

    Returns:
        The Markdown text, stripped of surrounding whitespace, with runs of
        three or more newlines collapsed to one blank line.
    """
    # Tags whose text is code/styling rather than document content.
    skipped_tags = ('script', 'style')
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    collapse_ws = re.compile(r'\s+').sub

    def render_children(element):
        return ''.join(process_element(child) for child in element.children)

    def render_emphasis(element, marker):
        # Markdown emphasis markers must touch the text they wrap, so any
        # whitespace at the edges of the element is moved outside them.
        raw = render_children(element)
        inner = raw.strip()
        if not inner:
            # Avoid emitting bare '****' / '**' for empty elements.
            return ' ' if raw else ''
        lead = ' ' if raw[0].isspace() else ''
        trail = ' ' if raw[-1].isspace() else ''
        return f'{lead}{marker}{inner}{marker}{trail}'

    def process_element(element):
        # Text node: collapse whitespace instead of stripping it, so the
        # spaces that separate inline elements survive
        # ("Hello <b>world</b>" must not become "Hello**world**").
        # NOTE(review): bs4 comment nodes are presumably str subclasses as
        # well and would pass through here as text - confirm and filter.
        if isinstance(element, str):
            return collapse_ws(' ', element)

        tag = element.name

        if tag in skipped_tags:
            return ''

        # Headers: a Markdown heading must stay on a single line.
        if tag in heading_tags:
            title = ' '.join(element.get_text().split())
            return '\n' + '#' * int(tag[1]) + ' ' + title + '\n'

        # Paragraphs
        if tag == 'p':
            return '\n' + render_children(element).strip() + '\n'

        # Line breaks
        if tag == 'br':
            return '\n'

        # Bold
        if tag in ('strong', 'b'):
            return render_emphasis(element, '**')

        # Italic
        if tag in ('em', 'i'):
            return render_emphasis(element, '*')

        # Links
        if tag == 'a':
            label = render_children(element).strip()
            href = element.get('href', '')
            return f'[{label}]({href})' if href else label

        # Images: nothing useful can be emitted without a source.
        if tag == 'img':
            src = element.get('src', '')
            if not src:
                return ''
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Lists: only direct <li> children belong to this list; nested
        # lists are rendered while processing the <li> that holds them.
        if tag == 'ul':
            items = [
                '- ' + render_children(li).strip()
                for li in element.find_all('li', recursive=False)
            ]
            return '\n' + '\n'.join(items) + '\n'

        if tag == 'ol':
            items = [
                f'{number}. ' + render_children(li).strip()
                for number, li in enumerate(
                    element.find_all('li', recursive=False), 1)
            ]
            return '\n' + '\n'.join(items) + '\n'

        # Blockquote: prefix every rendered line with '> '.
        if tag == 'blockquote':
            quoted = render_children(element).strip().split('\n')
            return '\n' + '\n'.join('> ' + line for line in quoted) + '\n'

        # Code: use the raw text so whitespace inside code is untouched.
        if tag == 'code':
            return '`' + element.get_text() + '`'

        if tag == 'pre':
            return '\n```\n' + element.get_text() + '\n```\n'

        # div/span/section/article and every unknown tag: render children.
        return render_children(element)

    # Process body or entire soup
    body = soup.find('body')
    if body is None:
        body = soup
    markdown = process_element(body)

    # Whitespace-only text between block elements leaves stray spaces at
    # line ends; drop them so the blank-line collapse below can see them.
    markdown = re.sub(r'[ \t]+\n', '\n', markdown)
    # Clean up multiple newlines
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    return markdown.strip()
|
|
|
|
|
|
def _flat_image_name(item_name, media_type, index, taken):
    """Return a flat file name for an EPUB image that is not yet in *taken*."""
    candidate = item_name.split('/')[-1]
    if not candidate:
        candidate = f"image_{index}.{media_type.split('/')[-1]}"
    if candidate not in taken:
        return candidate
    # Same base name in two EPUB folders: add a numeric suffix instead of
    # silently overwriting the first image.
    as_path = Path(candidate)
    serial = 2
    while f"{as_path.stem}_{serial}{as_path.suffix}" in taken:
        serial += 1
    return f"{as_path.stem}_{serial}{as_path.suffix}"


def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
    """
    Convert EPUB file to Markdown

    Writes up to three artefacts into *output_dir*, all named after the
    EPUB file's stem: ``<stem>.md`` with every non-empty document joined by
    horizontal rules, ``<stem>_images/`` with the embedded images (if any)
    and ``<stem>_metadata.json`` with the Dublin Core metadata (if any).
    ``<img src>`` values that resolve to an embedded image are rewritten to
    point at the saved copy, relative to the Markdown file.

    Args:
        epub_path: Path to the EPUB file
        output_dir: Directory to save the output (default: "output")
    """
    # Function-scope imports: these names are not imported at file level.
    import posixpath
    from urllib.parse import quote, unquote, urlsplit

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    base_name = Path(epub_path).stem
    images_subdir = f"{base_name}_images"

    print(f"Converting {epub_path} to Markdown...")

    # Read the EPUB file
    book = epub.read_epub(epub_path)
    # NOTE(review): items are taken in the order get_items() yields them,
    # which may differ from the spine (reading) order - confirm.
    items = list(book.get_items())

    # Pass 1: collect images first so that documents can be re-linked to
    # them no matter where the images sit in the item list.
    images = {}          # saved file name -> raw bytes
    saved_name_for = {}  # normalised path inside the EPUB -> saved name
    for item in items:
        if item.get_type() != ebooklib.ITEM_IMAGE:
            continue
        internal_name = item.get_name()
        saved = _flat_image_name(
            internal_name, item.media_type, len(images) + 1, images)
        images[saved] = item.get_content()
        saved_name_for[posixpath.normpath(internal_name)] = saved

    # Pass 2: convert every document to Markdown.
    chapters = []
    for item in items:
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue

        # Hand the raw bytes to BeautifulSoup so it can work out the
        # encoding itself instead of failing on non-UTF-8 documents.
        soup = BeautifulSoup(item.get_content(), 'html.parser')

        # Image sources are relative to the document's own folder inside
        # the EPUB; resolve them and point them at the saved copies.
        doc_dir = posixpath.dirname(item.get_name())
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue
            parts = urlsplit(src)
            if parts.scheme or parts.netloc:
                continue  # external or data: URL, leave untouched
            target = posixpath.normpath(
                posixpath.join(doc_dir, unquote(parts.path)))
            saved = saved_name_for.get(target)
            if saved is not None:
                img['src'] = quote(f"{images_subdir}/{saved}")

        markdown_content = html_to_markdown(soup).strip()
        if markdown_content:
            chapters.append(markdown_content)

    # Combine all chapters
    full_markdown = "\n\n---\n\n".join(chapters)

    # Save as markdown
    output_path = os.path.join(output_dir, f"{base_name}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(full_markdown)

    print("OK Conversion complete!")
    print(f"OK Output saved to: {output_path}")
    print(f"OK Total chapters: {len(chapters)}")

    # Save images if any
    if images:
        images_dir = os.path.join(output_dir, images_subdir)
        os.makedirs(images_dir, exist_ok=True)
        for img_name, img_data in images.items():
            with open(os.path.join(images_dir, img_name), "wb") as f:
                f.write(img_data)
        print(f"OK {len(images)} images saved to: {images_dir}")

    # Save metadata if available
    metadata = {
        field: book.get_metadata('DC', field)
        for field in ('title', 'creator', 'language', 'publisher',
                      'description')
    }

    if any(metadata.values()):
        metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"OK Metadata saved to: {metadata_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Usage: script.py [EPUB_PATH [OUTPUT_DIR]]
    # With no arguments the bundled sample in the input directory is
    # converted into "output", exactly as before.
    cli_args = sys.argv[1:]
    epub_path = cli_args[0] if cli_args else "input/the-art-of-spending-money.epub"
    if len(cli_args) > 1:
        convert_epub_to_markdown(epub_path, cli_args[1])
    else:
        convert_epub_to_markdown(epub_path)
|