# C.E.L_Slide_test2/src/pipeline_v2.py

"""Pipeline v2: TF-IDF 기반 블록 매칭 + 렌더링 파이프라인.

기존 pipeline.py를 건드리지 않고, 새 매칭/렌더링 엔진으로 동작하는 별도 파이프라인.

입출력 계약:
  입력: MDX 텍스트 + base_path
  출력: data/runs/{run_id}/ 에 final.html + 단계별 context

흐름:
  1. MDX 정규화
  2. zone 구분 (중목차 기준)
  3. TF-IDF 블록 매칭 (direct-fit / recipe 분기)
  4. 블록 렌더링 (템플릿 로드 + 슬롯 삽입)
  5. slide-base 조립
  6. 저장
"""
from __future__ import annotations

import json
import logging
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Any

from src.block_matcher_tfidf import TfidfBlockMatcher
from src.catalog_blocks import load_blocks_catalog, find_block_by_id

logger = logging.getLogger(__name__)


def generate_slide_v2(
    mdx_content: str,
    base_path: str = "",
    catalog_path: str = "templates/catalog/blocks.yaml",
    threshold: float = 0.15,
) -> dict:
    """Run the v2 pipeline: MDX text in, assembled slide HTML out.

    The run is written below ``data/runs/{run_id}/`` (relative to the current
    working directory): per-step debug pages in ``steps/``, the assembled
    slide as ``final.html`` and a JSON dump as ``final_context.json``.
    Templates are read from the relative ``templates/`` directory.

    Args:
        mdx_content: Raw MDX text of the slide.
        base_path: Accepted for interface compatibility; not read here.
        catalog_path: Path of the blocks catalog handed to the matcher and
            to ``load_blocks_catalog``.
        threshold: Score threshold forwarded to
            ``matcher.match_with_threshold``; a falsy result sends the zone
            down the "recipe" (fallback render) path.

    Returns:
        {"run_id": str, "run_dir": str, "final_html": str, "steps": dict}

    NOTE: ``run_id`` has one-second resolution and the run directory is
    created with ``exist_ok=True``, so two runs started within the same
    second write into the same directory.
    """
    templates_dir = Path("templates")
    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = Path("data/runs") / run_id
    run_dir.mkdir(parents=True, exist_ok=True)
    steps_dir = run_dir / "steps"
    steps_dir.mkdir(exist_ok=True)

    steps = {}

    # == Step 1: normalize the MDX ==
    logger.info("[v2] Step 1: MDX 정규화")
    from src.mdx_normalizer import normalize_mdx_content
    normalized = normalize_mdx_content(mdx_content)
    steps["step1_normalize"] = {
        "title": normalized.get("title", ""),
        "sections_count": len(normalized.get("sections", [])),
        "sections": normalized.get("sections", []),
    }

    # Persist the step 1 debug page.
    _save_step_html(steps_dir / "step1_normalize.html", "Step 1: MDX 정규화", [
        f"<b>title:</b> {normalized.get('title', '')}",
        f"<b>sections:</b> {len(normalized.get('sections', []))}개",
        *[f"  level={s['level']} | {s['title']} | {len(s.get('content',''))}자"
          for s in normalized.get("sections", [])],
    ])

    # == Step 2: split into zones ==
    logger.info("[v2] Step 2: zone 구분")
    from src.section_parser import extract_major_sections
    sections = normalized.get("sections", [])
    major_sections = extract_major_sections(sections)
    steps["step2_zones"] = [
        {"title": s["title"], "sub_titles": s["sub_titles"], "content_len": len(s.get("content", ""))}
        for s in major_sections
    ]

    _save_step_html(steps_dir / "step2_zones.html", "Step 2: zone 구분", [
        *[f"<b>zone {i+1}:</b> {s['title']} | sub_titles={s['sub_titles']} | {len(s.get('content',''))}자"
          for i, s in enumerate(major_sections)],
    ])

    # == Step 3: TF-IDF block matching ==
    logger.info("[v2] Step 3: TF-IDF 블록 매칭")
    matcher = TfidfBlockMatcher(catalog_path)
    catalog = load_blocks_catalog(catalog_path)

    # match_results stays keyed by zone title because that is the shape stored
    # in the step data and in final_context.json. zone_infos keeps one entry
    # per zone in section order, so zones that share a title are still
    # rendered from their own data in step 4.
    match_results = {}
    zone_infos = []
    step3_lines = []
    for sec in major_sections:
        title = sec["title"]
        sub_titles = sec["sub_titles"]
        content = sec.get("content", "")
        # Collect the text of every "D1:" line, with markdown '*' runs removed.
        d1_items = [re.sub(r'\*+', '', d).strip()
                    for d in re.findall(r'^D1:\s*(.*)', content, re.MULTILINE)]

        top3 = matcher.match(title, sub_titles, d1_items, top_k=3)
        best = matcher.match_with_threshold(title, sub_titles, d1_items, threshold=threshold)

        info = {
            "best": best,
            "top3": top3,
            "path": "direct-fit" if best else "recipe",
            "sub_titles": sub_titles,
            "content": content,
            "d1_items": d1_items,
        }
        match_results[title] = info
        zone_infos.append(info)

        step3_lines.append(f"<b>zone:</b> {title}")
        step3_lines.append(f"  sub_titles: {sub_titles}")
        if best:
            step3_lines.append(f"  → direct-fit: {best['block_id']} (score={best['score']})")
        else:
            step3_lines.append("  → recipe 경로")
        for j, c in enumerate(top3):
            step3_lines.append(f"    #{j+1}: {c['block_id']} (score={c['score']})")
        step3_lines.append("")

    steps["step3_matching"] = match_results

    _save_step_html(steps_dir / "step3_matching.html", "Step 3: TF-IDF 블록 매칭", step3_lines)

    # == Step 4: render blocks + assemble into slide-base ==
    logger.info("[v2] Step 4: 블록 렌더링 + 조립")

    slide_title = normalized.get("title", "")
    # Extract the conclusion text and drop one leading bullet marker.
    from src.section_parser import extract_conclusion_text
    conclusion = extract_conclusion_text(mdx_content)
    conclusion = re.sub(r'^[\*•\-]\s*', '', conclusion).strip()

    # Render the block HTML of every zone.
    zone_htmls = []
    zone_csses = []
    total_zones = len(major_sections)

    for sec, info in zip(major_sections, zone_infos):
        title = sec["title"]

        if info["path"] == "direct-fit" and info["best"]:
            # direct-fit: load the matched block template.
            block_id = info["best"]["block_id"]
            # Template path: use the catalog entry when it names one,
            # otherwise fall back to the conventional structures/ location.
            block_meta = find_block_by_id(catalog, block_id)
            template_path = ""
            if block_meta:
                template_path = block_meta.get("template", "")
            if not template_path:
                template_path = f"blocks/structures/{block_id}.html"
            if (templates_dir / template_path).exists():
                html, css = _render_block_template(
                    templates_dir, template_path, title, info
                )
                zone_htmls.append((title, html))
                if css:
                    zone_csses.append(css)
                logger.info(f"[v2] {title} → block '{block_id}' 렌더")
            else:
                # Template file is missing on disk: use the plain bullet render.
                html = _render_fallback(title, info)
                zone_htmls.append((title, html))
                logger.warning(f"[v2] {title} → block '{block_id}' catalog에 없음, fallback")
        else:
            # recipe path: render directly with the fallback renderer.
            html = _render_fallback(title, info)
            zone_htmls.append((title, html))
            logger.info(f"[v2] {title} → recipe fallback render")

    # Assemble everything into slide-base.
    final_html = _assemble_slide(
        templates_dir, slide_title, conclusion,
        zone_htmls, zone_csses, total_zones,
    )

    steps["step4_render"] = {
        "zones": [{"title": t, "html_len": len(h)} for t, h in zone_htmls],
    }

    _save_step_html(steps_dir / "step4_render.html", "Step 4: 렌더링", [
        *[f"<b>{t}:</b> {len(h)}자 HTML" for t, h in zone_htmls],
    ])

    # == Step 5: save ==
    logger.info("[v2] Step 5: 저장")
    (run_dir / "final.html").write_text(final_html, encoding="utf-8")

    context = {
        "run_id": run_id,
        "title": slide_title,
        "conclusion": conclusion,
        "steps": {k: _safe_serialize(v) for k, v in steps.items()},
        "match_results": {k: _safe_serialize(v) for k, v in match_results.items()},
    }
    (run_dir / "final_context.json").write_text(
        json.dumps(context, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    logger.info(f"[v2] 완료: {run_dir}")
    # "steps" is part of the documented return contract.
    return {
        "run_id": run_id,
        "run_dir": str(run_dir),
        "final_html": final_html,
        "steps": steps,
    }


def _render_block_template(
    templates_dir: Path,
    template_path: str,
    zone_title: str,
    info: dict,
) -> tuple[str, str]:
    """Load a block template and split it into (html, css).

    The block HTML is used as-is for now; slot substitution is not done here.
    The contents of every bare ``<style>...</style>`` element are pulled out
    and returned separately, joined by newlines, so the caller can move them
    elsewhere. HTML comments are removed from the remaining markup.

    If the template file does not exist, a warning is logged and the result
    is ``(_render_fallback(zone_title, info), "")``.
    """
    source_file = templates_dir / template_path

    if source_file.exists():
        markup = source_file.read_text(encoding="utf-8")

        # Collect the CSS first, then cut the <style> elements out of the markup.
        style_chunks = re.findall(r'<style>(.*?)</style>', markup, re.DOTALL)
        without_styles = re.sub(
            r'<style>.*?</style>', '', markup, flags=re.DOTALL
        ).strip()

        # Drop HTML comments from what is left.
        without_comments = re.sub(r'<!--[\s\S]*?-->', '', without_styles).strip()

        return without_comments, "\n".join(style_chunks)

    logger.warning(f"[v2] 템플릿 없음: {source_file}")
    return _render_fallback(zone_title, info), ""


def _render_fallback(zone_title: str, info: dict) -> str:
    """Render a zone as a title plus plain ``.bul`` bullet rows.

    Bullets come from ``info["d1_items"]`` when that list is non-empty; an
    item containing ``": "`` is split once and its head is set in bold.
    Otherwise ``info["content"]`` is scanned line by line: ``D1:`` lines
    become bold bullets, ``D2:`` lines become indented bullets, and blank
    lines, image lines and anything else are skipped. The rows are joined
    with newlines.
    """
    rows = [
        f'<div class="zone-title" style="font-size:var(--font-zone-title);font-weight:700;color:var(--color-zone-title);margin-bottom:var(--heading-gap);">{zone_title}</div>'
    ]
    body_text = info.get("content", "")
    bullets = info.get("d1_items", [])

    if bullets:
        for bullet in bullets:
            head, separator, tail = bullet.partition(": ")
            if separator:
                rows.append(f'<div class="bul">• <strong>{head}</strong>: {tail}</div>')
            else:
                rows.append(f'<div class="bul">• {bullet}</div>')
        return "\n".join(rows)

    if not body_text:
        return "\n".join(rows)

    for raw_line in body_text.split("\n"):
        stripped = raw_line.strip()
        # Skip blank lines and image placeholders.
        if not stripped:
            continue
        if stripped.startswith("![") or stripped.startswith("[이미지:"):
            continue

        level1 = re.match(r'^D1:\s*(.*)', stripped)
        if level1:
            label = re.sub(r'\*+', '', level1.group(1)).strip()
            rows.append(f'<div class="bul">• <strong>{label}</strong></div>')
            continue

        level2 = re.match(r'^D2:\s*(.*)', stripped)
        if level2:
            label = re.sub(r'\*+', '', level2.group(1)).strip()
            rows.append(f'<div class="bul">  • {label}</div>')

    return "\n".join(rows)


def _assemble_slide(
    templates_dir: Path,
    title: str,
    conclusion: str,
    zone_htmls: list[tuple[str, str]],
    zone_csses: list[str],
    total_zones: int,
) -> str:
    """Assemble the rendered zones into ``blocks/slide-base.html``.

    Steps, in order:
      1. Read slide-base and strip its HTML comments.
      2. Wrap each zone's HTML in a sized container and splice the result in
         place of the literal ``{% block body %}{% endblock %}`` marker.
      3. Render the whole string through Jinja2 (loader rooted at
         ``templates_dir``) with ``title``, ``footer_text`` and
         ``footer_pill_bg``.
      4. Insert the collected block CSS before the first ``</style>``.
      5. Move any bare ``<style>`` elements found from ``<body`` onwards into
         the first ``</style>`` of the part before it.
      6. Pass the result through ``_embed_slide_assets``.
    """
    from jinja2 import Environment, FileSystemLoader

    # Load slide-base without its HTML comments.
    base_file = templates_dir / "blocks" / "slide-base.html"
    skeleton = re.sub(r'<!--[\s\S]*?-->', '', base_file.read_text(encoding="utf-8"))

    # Every zone gets the same share of 96% height; all but the last zone
    # (by total_zones) also get a 2% bottom margin.
    share = 1.0 / max(total_zones, 1)
    height_pct = int(share * 96)
    last_index = total_zones - 1
    wrappers = []
    for position, (_zone_title, fragment) in enumerate(zone_htmls):
        gap = "margin-bottom:2%;" if position < last_index else ""
        wrappers.append(
            f'<div style="height:{height_pct}%;{gap}padding-top:var(--space-xs);">'
            f'<div style="height:100%;overflow:hidden;padding:0 var(--zone-padding-right) 0 var(--zone-padding-left);">'
            f'{fragment}</div></div>'
        )

    # Substitute the body block marker with the zone markup.
    skeleton = skeleton.replace(
        "{% block body %}{% endblock %}", "\n".join(wrappers)
    )

    # Merge the per-block CSS.
    block_css = "\n".join(zone_csses)

    # Jinja2 render (the file-system loader makes includes resolvable).
    jinja_env = Environment(loader=FileSystemLoader(str(templates_dir)))
    rendered = jinja_env.from_string(skeleton).render(
        title=title,
        footer_text=conclusion,
        footer_pill_bg="",
    )

    # Put the block CSS in front of the first </style>.
    if block_css and '</style>' in rendered:
        rendered = rendered.replace('</style>', f'\n{block_css}\n</style>', 1)

    # Safety net: <style> elements left from <body onwards are moved up.
    body_at = rendered.find('<body')
    if body_at > 0:
        head_html = rendered[:body_at]
        body_html = rendered[body_at:]
        stray_styles = re.findall(r'<style>([\s\S]*?)</style>', body_html)
        if stray_styles:
            merged = "\n".join(stray_styles)
            body_html = re.sub(r'<style>[\s\S]*?</style>', '', body_html)
            head_html = head_html.replace('</style>', f'\n{merged}\n</style>', 1)
            rendered = head_html + body_html

    # Asset embedding, delegated to the block assembler.
    from src.block_assembler import _embed_slide_assets
    return _embed_slide_assets(rendered, templates_dir)


def _save_step_html(path: Path, title: str, lines: list[str]):
    """Write a small standalone debug HTML page for one pipeline step.

    ``title`` becomes a bold heading and every entry of ``lines`` becomes its
    own ``<div>``. Entries are inserted verbatim, so markup inside them
    (e.g. ``<b>``) is kept. The file is written as UTF-8.
    """
    rows = [f"<div>{entry}</div>" for entry in lines]
    stylesheet = (
        "*{margin:0;padding:0;box-sizing:border-box;}\n"
        "body{background:#e5e5e5;padding:10px;font-family:sans-serif;word-break:keep-all;font-size:12px;}\n"
        "div{margin-bottom:2px;}"
    )
    document = "\n".join([
        '<!DOCTYPE html><html><head><meta charset="UTF-8">',
        f"<style>{stylesheet}</style>",
        "</head><body>",
        f'<div style="font-size:16px;font-weight:bold;margin-bottom:8px;">{title}</div>',
        "\n".join(rows),
        "</body></html>",
    ])
    path.write_text(document, encoding="utf-8")


def _safe_serialize(obj):
    """Return a copy of ``obj`` that ``json.dumps`` can encode.

    - dict: values are converted recursively. Keys that JSON can encode
      (str, int, float, bool, None) are kept; any other key is replaced by
      ``str(key)`` so it cannot make ``json.dumps`` raise ``TypeError``.
    - list: items are converted recursively.
    - str, int, float, bool, None: returned unchanged.
    - anything else (tuples and sets included): replaced by ``str(obj)``.
    """
    json_scalars = (str, int, float, bool, type(None))

    if isinstance(obj, dict):
        return {
            (key if isinstance(key, json_scalars) else str(key)): _safe_serialize(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [_safe_serialize(item) for item in obj]
    if isinstance(obj, json_scalars):
        return obj
    return str(obj)