# documan/convert_epub.py

#!/usr/bin/env python3
"""
EPUB to Markdown converter using ebooklib and BeautifulSoup
"""

import os
import json
import re
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup


def html_to_markdown(soup):
    """Convert a parsed HTML tree to Markdown text.

    The tree is walked recursively starting at the ``<body>`` element, or at
    ``soup`` itself when no body is found. Headings, paragraphs, line breaks,
    bold/italic, links, images, flat lists, blockquotes, inline code and
    ``<pre>`` blocks get Markdown syntax; every other tag is transparent and
    only its children are rendered.

    Whitespace in text nodes is collapsed to single spaces rather than
    stripped, so the spaces separating words from neighbouring inline
    elements (``Hello <b>world</b> again``) survive in the output.

    Args:
        soup: Parsed document (or any node exposing ``find``, ``name`` and
            ``children`` the way BeautifulSoup tags do).

    Returns:
        The Markdown text, stripped, with runs of three or more newlines
        reduced to a single blank line.
    """

    def split_padding(inner):
        """Split rendered inline text into (leading space, core, trailing space)."""
        core = inner.strip()
        lead = ' ' if inner[:1].isspace() else ''
        # An all-whitespace string is reported as a single leading space only.
        trail = ' ' if core and inner[-1:].isspace() else ''
        return lead, core, trail

    def render_children(element):
        """Render and join the children, tidying spaces at block boundaries."""
        pieces = []  # invariant: never contains an empty string
        for child in element.children:
            piece = process_element(child)
            if not piece:
                continue
            if piece[0] == '\n':
                # A block (or line break) starts here: spaces left at the end
                # of the preceding inline run would become trailing
                # whitespace, so drop them.
                while pieces:
                    trimmed = pieces[-1].rstrip(' ')
                    if trimmed:
                        pieces[-1] = trimmed
                        break
                    pieces.pop()
            elif piece[0] == ' ' and pieces and pieces[-1][-1] in ' \n':
                # Avoid doubled spaces and spaces at the start of a line.
                piece = piece.lstrip(' ')
                if not piece:
                    continue
            pieces.append(piece)
        return ''.join(pieces)

    def process_element(element):
        if isinstance(element, str):
            # Text node: HTML treats any whitespace run as one space. Keeping
            # that space (instead of stripping it) preserves word boundaries
            # next to inline tags; block-level callers strip the ends.
            # NOTE(review): comment/doctype nodes presumably also match this
            # isinstance check and are emitted as text - confirm with bs4.
            return re.sub(r'\s+', ' ', element)

        tag = element.name

        # Headers
        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level_num = int(tag[1])
            # A Markdown heading must stay on one line, so collapse any
            # newlines/indentation coming from the source markup.
            text = ' '.join(element.get_text().split())
            return '\n' + '#' * level_num + ' ' + text + '\n'

        # Paragraphs
        elif tag == 'p':
            return '\n' + render_children(element).strip() + '\n'

        # Line breaks
        elif tag == 'br':
            return '\n'

        # Bold
        elif tag in ['strong', 'b']:
            lead, core, trail = split_padding(render_children(element))
            # Skip the markers entirely for empty content (no stray "****").
            return lead + ('**' + core + '**' if core else '') + trail

        # Italic
        elif tag in ['em', 'i']:
            lead, core, trail = split_padding(render_children(element))
            return lead + ('*' + core + '*' if core else '') + trail

        # Links
        elif tag == 'a':
            lead, core, trail = split_padding(render_children(element))
            href = element.get('href', '')
            if href:
                return lead + f'[{core}]({href})' + trail
            # Anchor without a target: keep only its text.
            return lead + core + trail

        # Images
        elif tag == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '')
            return f'![{alt}]({src})'

        # Lists (only direct <li> children belong to this list)
        elif tag == 'ul':
            items = [
                '- ' + render_children(li).strip()
                for li in element.find_all('li', recursive=False)
            ]
            return '\n' + '\n'.join(items) + '\n'

        elif tag == 'ol':
            items = [
                f'{i}. ' + render_children(li).strip()
                for i, li in enumerate(element.find_all('li', recursive=False), 1)
            ]
            return '\n' + '\n'.join(items) + '\n'

        # Blockquote
        elif tag == 'blockquote':
            lines = render_children(element).strip().split('\n')
            return '\n' + '\n'.join('> ' + line for line in lines) + '\n'

        # Code: taken verbatim, whitespace is significant here
        elif tag == 'code':
            return '`' + element.get_text() + '`'

        elif tag == 'pre':
            return '\n```\n' + element.get_text() + '\n```\n'

        # div, span, section, article and any unknown tag - just process children
        else:
            return render_children(element)

    # Process body or entire soup (look the body up only once)
    body = soup.find('body')
    if body is None:
        body = soup
    markdown = process_element(body)

    # Clean up multiple newlines
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)

    return markdown.strip()


def convert_epub_to_markdown(epub_path: str, output_dir: str = "output"):
    """
    Convert an EPUB file to a single Markdown file.

    Every document item of the book is rendered with ``html_to_markdown``
    and the non-empty results are joined with ``---`` separators into
    ``<output_dir>/<epub stem>.md``. Image items are written to
    ``<output_dir>/<epub stem>_images/`` and the Dublin Core metadata fields
    (title, creator, language, publisher, description) are dumped to
    ``<output_dir>/<epub stem>_metadata.json`` when at least one is present.
    Progress is reported with ``print``.

    Args:
        epub_path: Path to the EPUB file
        output_dir: Directory to save the output (default: "output")
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the base filename without extension
    base_name = Path(epub_path).stem

    print(f"Converting {epub_path} to Markdown...")

    # Read the EPUB file
    book = epub.read_epub(epub_path)

    chapters = []
    images = {}  # output file name -> raw image bytes
    image_counter = 0

    # NOTE(review): items are taken in the order get_items() yields them;
    # confirm this matches the book's reading order.
    for item in book.get_items():
        item_type = item.get_type()

        if item_type == ebooklib.ITEM_DOCUMENT:
            # Decode leniently: one malformed byte in a chapter should not
            # abort the whole conversion with UnicodeDecodeError.
            html_content = item.get_content().decode('utf-8', errors='replace')

            # Parse with BeautifulSoup and convert to markdown-like format
            soup = BeautifulSoup(html_content, 'html.parser')
            markdown_content = html_to_markdown(soup).strip()

            if markdown_content:
                chapters.append(markdown_content)

        elif item_type == ebooklib.ITEM_IMAGE:
            image_counter += 1
            img_name = item.get_name().split('/')[-1]
            # An empty name, "." or ".." cannot be used as a file name.
            if img_name in ('', '.', '..'):
                img_name = f"image_{image_counter}.{item.media_type.split('/')[-1]}"

            # Images from different folders of the EPUB may share a base
            # name; add a numeric suffix instead of silently overwriting.
            if img_name in images:
                stem, ext = os.path.splitext(img_name)
                attempt = 2
                while f"{stem}_{attempt}{ext}" in images:
                    attempt += 1
                img_name = f"{stem}_{attempt}{ext}"

            images[img_name] = item.get_content()

    # Combine all chapters
    full_markdown = "\n\n---\n\n".join(chapters)

    # Save as markdown
    output_path = os.path.join(output_dir, f"{base_name}.md")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(full_markdown)

    print("OK Conversion complete!")
    print(f"OK Output saved to: {output_path}")
    print(f"OK Total chapters: {len(chapters)}")

    # Save images if any
    # NOTE: image links inside the Markdown keep their original src values;
    # they are not rewritten to point at this directory.
    if images:
        images_dir = os.path.join(output_dir, f"{base_name}_images")
        os.makedirs(images_dir, exist_ok=True)
        for img_name, img_data in images.items():
            img_path = os.path.join(images_dir, img_name)
            with open(img_path, "wb") as f:
                f.write(img_data)
        print(f"OK {len(images)} images saved to: {images_dir}")

    # Save metadata if available
    metadata = {
        field: book.get_metadata('DC', field)
        for field in ('title', 'creator', 'language', 'publisher', 'description')
    }

    # Only write the file when at least one field came back non-empty
    if any(metadata.values()):
        metadata_path = os.path.join(output_dir, f"{base_name}_metadata.json")
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        print(f"OK Metadata saved to: {metadata_path}")


if __name__ == "__main__":
    import sys

    # Usage: convert_epub.py [EPUB_PATH [OUTPUT_DIR]]
    # With no arguments, convert the sample EPUB file in the input directory
    # into the default output directory, as before.
    cli_args = sys.argv[1:]
    epub_path = cli_args[0] if cli_args else "input/the-art-of-spending-money.epub"
    if len(cli_args) > 1:
        convert_epub_to_markdown(epub_path, cli_args[1])
    else:
        convert_epub_to_markdown(epub_path)