feat: MD 파일 병합 및 이미지 경로 통합 스크립트 추가 (#1)
- merge_markdown.py: 96개 페이지별 MD를 단일 파일로 병합
- 이미지를 output/images/ 폴더로 통합, p{NN}_ prefix로 파일명 충돌 방지
- file_range 파라미터로 부분 테스트 가능
- docs/tutorial.md: merge 명령어 및 사용법 문서화
- docs/history: 작업 이력 파일 추가
소요 시간: 10분 | Context: input 18k / output 2k tokens
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
205
convert_epub.py
Normal file
205
convert_epub.py
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EPUB to Markdown converter using ebooklib and html2text
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
import ebooklib
|
||||
from ebooklib import epub
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def html_to_markdown(soup):
    """Convert a BeautifulSoup HTML tree to Markdown text.

    Args:
        soup: Parsed ``BeautifulSoup`` document (or fragment).

    Returns:
        Markdown string; runs of 3+ newlines are collapsed to 2 and
        leading/trailing whitespace is stripped.
    """
    # Local import: the rest of the module already depends on bs4; importing
    # here keeps the dependency next to the isinstance checks that need it.
    from bs4 import Comment, Doctype

    def process_element(element):
        # --- Text nodes -------------------------------------------------
        # NavigableString subclasses str, so this branch catches text nodes.
        if isinstance(element, str):
            # Comment/Doctype also subclass NavigableString; without this
            # check HTML comments would leak verbatim into the output.
            if isinstance(element, (Comment, Doctype)):
                return ""
            # Collapse internal whitespace but keep a single boundary space
            # so "Hello <b>world</b>" becomes "Hello **world**" rather than
            # "Hello**world**" (a plain .strip() glued the words together).
            collapsed = re.sub(r'\s+', ' ', element)
            if not collapsed.strip():
                return ""
            return collapsed

        tag = element.name

        # Headers: h1..h6 -> '#' * level
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level_num = int(tag[1])
            text = element.get_text().strip()
            return '\n' + '#' * level_num + ' ' + text + '\n'

        # Paragraphs
        elif tag == 'p':
            text = ''.join(process_element(child) for child in element.children)
            return '\n' + text.strip() + '\n'

        # Line breaks
        elif tag == 'br':
            return '\n'

        # Bold
        elif tag in ['strong', 'b']:
            text = ''.join(process_element(child) for child in element.children)
            return '**' + text.strip() + '**'

        # Italic
        elif tag in ['em', 'i']:
            text = ''.join(process_element(child) for child in element.children)
            return '*' + text.strip() + '*'

        # Links
        elif tag == 'a':
            text = ''.join(process_element(child) for child in element.children)
            href = element.get('href', '')
            if href:
                return f'[{text.strip()}]({href})'
            return text.strip()

        # Images — emit markdown image syntax (the src/alt lookups were
        # previously dead code: the function returned an empty f-string).
        # NOTE(review): src is kept as-is and is NOT remapped to the
        # "<book>_images" directory written by convert_epub_to_markdown;
        # confirm whether links should be rewritten to that folder.
        elif tag == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Unordered lists
        elif tag == 'ul':
            items = []
            for li in element.find_all('li', recursive=False):
                text = ''.join(process_element(child) for child in li.children)
                items.append('- ' + text.strip())
            return '\n' + '\n'.join(items) + '\n'

        # Ordered lists
        elif tag == 'ol':
            items = []
            for i, li in enumerate(element.find_all('li', recursive=False), 1):
                text = ''.join(process_element(child) for child in li.children)
                items.append(f'{i}. ' + text.strip())
            return '\n' + '\n'.join(items) + '\n'

        # Blockquote — prefix every line with '> '
        elif tag == 'blockquote':
            text = ''.join(process_element(child) for child in element.children)
            lines = text.strip().split('\n')
            return '\n' + '\n'.join('> ' + line for line in lines) + '\n'

        # Inline code
        elif tag == 'code':
            text = element.get_text()
            return '`' + text + '`'

        # Code blocks (language is unknown, so the fence is unlabelled)
        elif tag == 'pre':
            text = element.get_text()
            return '\n```\n' + text + '\n```\n'

        # Containers and anything unrecognised: recurse into children
        else:
            return ''.join(process_element(child) for child in element.children)

    # Prefer <body> when present; fall back to the whole document.
    body = soup.find('body') or soup
    markdown = process_element(body)

    # Clean up runs of blank lines left by nested block elements.
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    return markdown.strip()
|
||||
|
||||
|
||||
def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
    """Convert an EPUB file to a single Markdown document.

    Produces, inside *output_dir*:
      - ``<stem>.md``            chapters joined by ``---`` rules
      - ``<stem>_images/``       every image found in the EPUB (if any)
      - ``<stem>_metadata.json`` Dublin Core metadata (if any is present)

    Args:
        epub_path: Path to the EPUB file.
        output_dir: Directory to save the output (default: "output").
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    base_name = Path(epub_path).stem

    print(f"Converting {epub_path} to Markdown...")

    book = epub.read_epub(epub_path)

    chapters = []
    images = {}
    image_counter = 0

    # Walk every item once, collecting document text and image payloads.
    for item in book.get_items():
        item_kind = item.get_type()

        if item_kind == ebooklib.ITEM_DOCUMENT:
            soup = BeautifulSoup(item.get_content().decode('utf-8'), 'html.parser')
            chapter_md = html_to_markdown(soup).strip()
            if chapter_md:
                chapters.append(chapter_md)

        elif item_kind == ebooklib.ITEM_IMAGE:
            image_counter += 1
            img_name = item.get_name().split('/')[-1]
            if not img_name:
                # Nameless item: synthesise a name from the media type.
                img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"
            images[img_name] = item.get_content()

    # Write the combined markdown, one horizontal rule between chapters.
    output_path = out_dir / f"{base_name}.md"
    output_path.write_text("\n\n---\n\n".join(chapters), encoding="utf-8")

    print("OK Conversion complete!")
    print(f"OK Output saved to: {output_path}")
    print(f"OK Total chapters: {len(chapters)}")

    # Dump image payloads alongside the markdown.
    if images:
        images_dir = out_dir / f"{base_name}_images"
        images_dir.mkdir(exist_ok=True)
        for img_name, img_data in images.items():
            (images_dir / img_name).write_bytes(img_data)
        print(f"OK {len(images)} images saved to: {images_dir}")

    # Dublin Core metadata, serialised only when at least one field exists.
    metadata = {
        field: book.get_metadata('DC', field)
        for field in ('title', 'creator', 'language', 'publisher', 'description')
    }

    if any(metadata.values()):
        metadata_path = out_dir / f"{base_name}_metadata.json"
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"OK Metadata saved to: {metadata_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the EPUB bundled in the input directory.
    source_epub = "input/the-art-of-spending-money.epub"
    convert_epub_to_markdown(source_epub)
|
||||
Reference in New Issue
Block a user