# doc2md/converters/pdf.py

#!/usr/bin/env python3
"""
PDF → Markdown 변환기 (페이지별 분류 + 라우팅)

페이지 타입:
  text            - 텍스트 위주 → marker-pdf 추출
  text-with-photo - 텍스트 + 사진 → marker-pdf + 이미지 크롭
  diagram         - 다이어그램/도면 → 페이지 PNG 렌더링 (에이전트가 Vision으로 처리)
  image-heavy     - 텍스트 거의 없음 → 페이지 PNG 렌더링
"""
from __future__ import annotations

import io
import re
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


# ── 페이지 분류 ───────────────────────────────────────────────────────────────

def _pix_to_pil(pix: fitz.Pixmap) -> Image.Image:
    """Convert a PyMuPDF Pixmap into a PIL image.

    Pixmaps whose colorspace is not RGB (for example grayscale or CMYK) are
    converted to RGB first: their sample buffer has a different number of
    bytes per pixel and cannot be decoded with the "RGB"/"RGBA" raw modes
    used below.

    Args:
        pix: Source pixmap.

    Returns:
        An "RGBA" image when the pixmap has an alpha channel, otherwise an
        "RGB" image.
    """
    # pix.n is the number of components per pixel including alpha, so the
    # difference is the number of colour components.
    if pix.n - pix.alpha != 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    mode = "RGBA" if pix.alpha else "RGB"
    return Image.frombytes(mode, (pix.width, pix.height), pix.samples)


def _is_diagram_image(img: Image.Image) -> bool:
    """
    래스터 이미지가 다이어그램인지 판별.
    다이어그램 특성: 제한된 색상 팔레트 + 높은 흰 배경 비율.
    """
    # 너무 작은 이미지(로고, 아이콘)는 스킵
    if img.width < 100 or img.height < 100:
        return False

    # 색상 수 (64색으로 양자화 후 실제 사용 색상)
    small = img.resize((200, 200), Image.LANCZOS).convert("RGB")
    quantized = small.quantize(colors=64)
    color_count = len(set(quantized.getdata()))

    # 흰 배경 비율
    gray = small.convert("L")
    pixels = list(gray.getdata())
    white_ratio = sum(1 for p in pixels if p > 240) / len(pixels)

    return color_count < 32 and white_ratio > 0.35


def classify_page(page: fitz.Page, doc: fitz.Document) -> str:
    """Classify a PDF page so it can be routed to the right extractor.

    Decision order:
      1. Text density above 4 characters per 10,000 units of page area makes
         the page text-based: "text-with-photo" when it embeds at least one
         image larger than 150x150 px, otherwise "text".
      2. Otherwise, more than 1.5 vector drawings per 10,000 units of page
         area makes it a "diagram".
      3. Otherwise, if the page embeds raster images, up to three of them
         are inspected; the page is a "diagram" as soon as one looks like a
         diagram. If none does, it is "text-with-photo" when the page has
         more than 50 characters of text and "image-heavy" otherwise.
      4. Otherwise the page is "image-heavy" when it has no text at all and
         "text" when it has some.

    Args:
        page: The page to classify.
        doc: The document the page belongs to; used to load embedded images
            by xref.

    Returns:
        One of 'text', 'text-with-photo', 'diagram', 'image-heavy'.
    """
    text_len = len(page.get_text().strip())
    page_area = page.rect.width * page.rect.height

    drawings = page.get_drawings()
    images = page.get_images(full=True)

    if page_area > 0:
        # Characters / vector drawings per 10,000 units of page area.
        text_density = text_len / page_area * 10_000
        drawing_density = len(drawings) / page_area * 10_000
    else:
        # Degenerate (zero-area) page: avoid dividing by zero and let the
        # density-independent rules below decide.
        text_density = drawing_density = 0.0

    # 1) Enough text: a text-type page.
    if text_density > 4:
        # Each get_images(full=True) entry already carries the pixel width
        # and height at indexes 2 and 3, so the image data itself does not
        # need to be extracted. Small images (logos etc.) do not count.
        has_large_image = any(
            img[2] > 150 and img[3] > 150 for img in images
        )
        return "text-with-photo" if has_large_image else "text"

    # 2) Many vector drawings (flowcharts, CAD exports): a diagram.
    if drawing_density > 1.5:
        return "diagram"

    # 3) Raster images present: check whether any of them is a diagram.
    if images:
        for img_info in images[:3]:  # inspect at most three, for speed
            try:
                pix = fitz.Pixmap(doc, img_info[0])
                if pix.colorspace and pix.colorspace.n > 1:
                    if _is_diagram_image(_pix_to_pil(pix)):
                        return "diagram"
            except Exception:
                # Best effort: an image that cannot be loaded or decoded is
                # skipped rather than failing the whole classification.
                continue
        return "text-with-photo" if text_len > 50 else "image-heavy"

    # 4) Hardly any text and no images.
    return "image-heavy" if not text_len else "text"


# ── 페이지 PNG 렌더링 ─────────────────────────────────────────────────────────

def _render_page_png(page: fitz.Page, output_path: Path, scale: float = 2.0) -> None:
    """Rasterize *page* and save the result to *output_path*.

    The page is rendered through a uniform scaling matrix, so ``scale``
    multiplies both axes equally (default 2.0).
    """
    zoom = fitz.Matrix(scale, scale)
    page.get_pixmap(matrix=zoom).save(str(output_path))


# ── 메인 변환 함수 ────────────────────────────────────────────────────────────

def _save_marker_images(marker_images: dict, images_dir: Path) -> list[str]:
    """Write marker-extracted images into *images_dir*; return the paths actually written."""
    images_dir.mkdir(exist_ok=True)
    saved: list[str] = []
    for img_name, img_data in marker_images.items():
        try:
            img_dest = images_dir / img_name
            if isinstance(img_data, Image.Image):
                img_data.save(str(img_dest))
            elif isinstance(img_data, bytes) and img_data:
                img_dest.write_bytes(img_data)
            else:
                # Empty or unsupported payload: no file was written, so the
                # path must not be reported to the caller.
                continue
        except Exception:
            # Best effort: one unwritable image must not abort the conversion.
            continue
        saved.append(str(img_dest))
    return saved


def convert_pdf(pdf_path: Path, output_dir: Path) -> dict:
    """Convert a PDF to Markdown and describe the outcome as a dict.

    Every page is classified first. The document text is extracted with
    marker-pdf, falling back to plain PyMuPDF text extraction when marker-pdf
    cannot be imported. Pages classified as "diagram" or "image-heavy" are
    additionally rendered to PNG files and linked from a trailing section of
    the Markdown output.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Directory receiving ``<stem>.md`` and, when images are
            produced, the ``<stem>_images`` directory. Created if missing.

    Returns:
        A dict with these keys:
          status        : "ok" | "error"
          input         : str
          output        : str (path of the Markdown file)
          format        : "pdf"
          pages         : list of {n, type, image?}
          has_diagrams  : bool
          diagram_pages : list[int]
          images        : list[str]
          error?        : str (only when status is "error")

        Failures during conversion are not raised; they are reported through
        ``status``/``error`` and the traceback is printed.
    """
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    images_dir = output_dir / f"{pdf_path.stem}_images"
    md_path = output_dir / f"{pdf_path.stem}.md"

    result: dict = {
        "status": "ok",
        "input": str(pdf_path),
        "output": str(md_path),
        "format": "pdf",
        "pages": [],
        "has_diagrams": False,
        "diagram_pages": [],
        "images": [],
    }

    try:
        doc = fitz.open(str(pdf_path))
        try:
            # ── 1) Classify every page ─────────────────────────────────────
            for page in doc:
                result["pages"].append(
                    {"n": page.number + 1, "type": classify_page(page, doc)}
                )

            diagram_page_nums = [
                entry["n"] for entry in result["pages"]
                if entry["type"] in ("diagram", "image-heavy")
            ]
            result["has_diagrams"] = bool(diagram_page_nums)
            result["diagram_pages"] = diagram_page_nums

            # ── 2) Text extraction (marker-pdf) ────────────────────────────
            text_sections: list[str] = []
            try:
                from marker.converters.pdf import PdfConverter
                from marker.models import create_model_dict
                from marker.output import text_from_rendered

                converter = PdfConverter(artifact_dict=create_model_dict())
                rendered = converter(str(pdf_path))
                full_text, _, marker_images = text_from_rendered(rendered)

                # Save the images marker extracted.
                if marker_images:
                    result["images"].extend(
                        _save_marker_images(marker_images, images_dir)
                    )

                # Point non-http image links at the images directory. A
                # replacement function is used so that backslashes or group
                # references in the file stem are inserted literally instead
                # of being interpreted as a replacement template.
                link_prefix = f"{pdf_path.stem}_images/"
                full_text = re.sub(
                    r'!\[([^\]]*)\]\((?!http)([^)]+)\)',
                    lambda m: f"![{m.group(1)}]({link_prefix}{m.group(2)})",
                    full_text,
                )
                text_sections.append(full_text)

            except ImportError:
                # marker-pdf is unavailable: fall back to PyMuPDF text
                # extraction, skipping pages without text.
                pages_text = [
                    stripped for stripped in
                    (page.get_text().strip() for page in doc)
                    if stripped
                ]
                text_sections.append("\n\n---\n\n".join(pages_text))

            # ── 3) Render diagram pages to PNG ─────────────────────────────
            if diagram_page_nums:
                images_dir.mkdir(exist_ok=True)
                diagram_section_lines = ["\n\n---\n\n## 다이어그램 페이지\n"]

                for page_num in diagram_page_nums:
                    img_name = f"page_{page_num}.png"
                    img_path = images_dir / img_name
                    _render_page_png(doc[page_num - 1], img_path)
                    result["images"].append(str(img_path))
                    diagram_section_lines.append(
                        f"\n### Page {page_num}\n"
                        f"![Page {page_num} — 다이어그램]"
                        f"({pdf_path.stem}_images/{img_name})\n"
                    )
                    # Page entries were appended in page order, so entry
                    # page_num - 1 is the one whose "n" equals page_num.
                    result["pages"][page_num - 1]["image"] = str(img_path)

                text_sections.append("".join(diagram_section_lines))
        finally:
            # Release the document even when a step above fails.
            doc.close()

        # ── 4) Write the Markdown file ─────────────────────────────────────
        final_md = re.sub(r'\n{3,}', '\n\n', "\n\n".join(text_sections)).strip()
        md_path.write_text(final_md, encoding="utf-8")

    except Exception as e:
        result["status"] = "error"
        result["error"] = str(e)
        import traceback
        traceback.print_exc()

    return result