"""Pipeline v2: TF-IDF 기반 블록 매칭 + 렌더링 파이프라인. 기존 pipeline.py를 건드리지 않고, 새 매칭/렌더링 엔진으로 동작하는 별도 파이프라인. 입출력 계약: 입력: MDX 텍스트 + base_path 출력: data/runs/{run_id}/ 에 final.html + 단계별 context 흐름: 1. MDX 정규화 2. zone 구분 (중목차 기준) 3. TF-IDF 블록 매칭 (direct-fit / recipe 분기) 4. 블록 렌더링 (템플릿 로드 + 슬롯 삽입) 5. slide-base 조립 6. 저장 """ from __future__ import annotations import json import logging import re import time from datetime import datetime from pathlib import Path from typing import Any from src.block_matcher_tfidf import TfidfBlockMatcher from src.catalog_blocks import load_blocks_catalog, find_block_by_id logger = logging.getLogger(__name__) def generate_slide_v2( mdx_content: str, base_path: str = "", catalog_path: str = "templates/catalog/blocks.yaml", threshold: float = 0.15, ) -> dict: """v2 파이프라인: MDX → 슬라이드 HTML. Returns: {"run_id": str, "run_dir": str, "final_html": str, "steps": dict} """ templates_dir = Path("templates") run_id = datetime.now().strftime("%Y%m%d_%H%M%S") run_dir = Path("data/runs") / run_id run_dir.mkdir(parents=True, exist_ok=True) steps_dir = run_dir / "steps" steps_dir.mkdir(exist_ok=True) steps = {} # ══ Step 1: MDX 정규화 ══ logger.info("[v2] Step 1: MDX 정규화") from src.mdx_normalizer import normalize_mdx_content normalized = normalize_mdx_content(mdx_content) steps["step1_normalize"] = { "title": normalized.get("title", ""), "sections_count": len(normalized.get("sections", [])), "sections": normalized.get("sections", []), } # Step 1 저장 _save_step_html(steps_dir / "step1_normalize.html", "Step 1: MDX 정규화", [ f"title: {normalized.get('title', '')}", f"sections: {len(normalized.get('sections', []))}개", *[f" level={s['level']} | {s['title']} | {len(s.get('content',''))}자" for s in normalized.get("sections", [])], ]) # ══ Step 2: zone 구분 ══ logger.info("[v2] Step 2: zone 구분") from src.section_parser import extract_major_sections sections = normalized.get("sections", []) major_sections = extract_major_sections(sections) steps["step2_zones"] = [ {"title": s["title"], "sub_titles": s["sub_titles"], "content_len": len(s.get("content", ""))} for s in major_sections ] _save_step_html(steps_dir / "step2_zones.html", "Step 2: zone 구분", [ *[f"zone {i+1}: {s['title']} | sub_titles={s['sub_titles']} | {len(s.get('content',''))}자" for i, s in enumerate(major_sections)], ]) # ══ Step 3: TF-IDF 블록 매칭 ══ logger.info("[v2] Step 3: TF-IDF 블록 매칭") matcher = TfidfBlockMatcher(catalog_path) catalog = load_blocks_catalog(catalog_path) match_results = {} step3_lines = [] for sec in major_sections: title = sec["title"] sub_titles = sec["sub_titles"] content = sec.get("content", "") d1_items = [re.sub(r'\*+', '', d).strip() for d in re.findall(r'^D1:\s*(.*)', content, re.MULTILINE)] top3 = matcher.match(title, sub_titles, d1_items, top_k=3) best = matcher.match_with_threshold(title, sub_titles, d1_items, threshold=threshold) match_results[title] = { "best": best, "top3": top3, "path": "direct-fit" if best else "recipe", "sub_titles": sub_titles, "content": content, "d1_items": d1_items, } step3_lines.append(f"zone: {title}") step3_lines.append(f" sub_titles: {sub_titles}") if best: step3_lines.append(f" → direct-fit: {best['block_id']} (score={best['score']})") else: step3_lines.append(f" → recipe 경로") for j, c in enumerate(top3): step3_lines.append(f" #{j+1}: {c['block_id']} (score={c['score']})") step3_lines.append("") steps["step3_matching"] = match_results _save_step_html(steps_dir / "step3_matching.html", "Step 3: TF-IDF 블록 매칭", step3_lines) # ══ Step 4: 블록 렌더링 + slide-base 조립 ══ logger.info("[v2] Step 4: 블록 렌더링 + 조립") slide_title = normalized.get("title", "") # conclusion 추출 from src.section_parser import extract_conclusion_text conclusion = extract_conclusion_text(mdx_content) conclusion = re.sub(r'^[\*•\-]\s*', '', conclusion).strip() # 각 zone의 블록 HTML 렌더링 zone_htmls = [] zone_csses = [] total_zones = len(major_sections) for sec in major_sections: title = sec["title"] info = match_results[title] if info["path"] == "direct-fit" and info["best"]: # direct-fit: 블록 템플릿 로드 block_id = info["best"]["block_id"] # template 경로: catalog에 없으면 structures/ 에서 찾기 block_meta = find_block_by_id(catalog, block_id) template_path = "" if block_meta: template_path = block_meta.get("template", "") if not template_path: template_path = f"blocks/structures/{block_id}.html" if (templates_dir / template_path).exists(): html, css = _render_block_template( templates_dir, template_path, title, info ) zone_htmls.append((title, html)) if css: zone_csses.append(css) logger.info(f"[v2] {title} → block '{block_id}' 렌더") else: html = _render_fallback(title, info) zone_htmls.append((title, html)) logger.warning(f"[v2] {title} → block '{block_id}' catalog에 없음, fallback") else: # recipe 경로: direct render html = _render_fallback(title, info) zone_htmls.append((title, html)) logger.info(f"[v2] {title} → recipe fallback render") # slide-base 조립 final_html = _assemble_slide( templates_dir, slide_title, conclusion, zone_htmls, zone_csses, total_zones, ) steps["step4_render"] = { "zones": [{"title": t, "html_len": len(h)} for t, h in zone_htmls], } _save_step_html(steps_dir / "step4_render.html", "Step 4: 렌더링", [ *[f"{t}: {len(h)}자 HTML" for t, h in zone_htmls], ]) # ══ Step 5: 저장 ══ logger.info("[v2] Step 5: 저장") (run_dir / "final.html").write_text(final_html, encoding="utf-8") context = { "run_id": run_id, "title": slide_title, "conclusion": conclusion, "steps": {k: _safe_serialize(v) for k, v in steps.items()}, "match_results": {k: _safe_serialize(v) for k, v in match_results.items()}, } (run_dir / "final_context.json").write_text( json.dumps(context, ensure_ascii=False, indent=2), encoding="utf-8" ) logger.info(f"[v2] 완료: {run_dir}") return {"run_id": run_id, "run_dir": str(run_dir), "final_html": final_html} def _render_block_template( templates_dir: Path, template_path: str, zone_title: str, info: dict, ) -> tuple[str, str]: """블록 템플릿을 로드하고 그대로 반환. 현재는 블록 HTML을 그대로 사용 (슬롯 교체는 추후). CSS는 분리하여 head로 이동. """ full_path = templates_dir / template_path if not full_path.exists(): logger.warning(f"[v2] 템플릿 없음: {full_path}") return _render_fallback(zone_title, info), "" raw = full_path.read_text(encoding="utf-8") # CSS 분리 css_parts = re.findall(r'', raw, re.DOTALL) css = "\n".join(css_parts) html = re.sub(r'', '', raw, flags=re.DOTALL).strip() # HTML 주석 제거 html = re.sub(r'', '', html).strip() return html, css def _render_fallback(zone_title: str, info: dict) -> str: """매칭 안 됐을 때 기본 렌더링. .bul 구조 사용.""" sub_titles = info.get("sub_titles", []) content = info.get("content", "") d1_items = info.get("d1_items", []) parts = [] parts.append(f'

{zone_title}

') if d1_items: for item in d1_items: if ": " in item: h, d = item.split(": ", 1) parts.append(f'

• {h}: {d}

') else: parts.append(f'

• {item}

') elif content: for line in content.split("\n"): line = line.strip() if not line or line.startswith("![") or line.startswith("[이미지:"): continue d1 = re.match(r'^D1:\s*(.*)', line) d2 = re.match(r'^D2:\s*(.*)', line) if d1: text = re.sub(r'\*+', '', d1.group(1)).strip() parts.append(f'

• {text}

') elif d2: text = re.sub(r'\*+', '', d2.group(1)).strip() parts.append(f'

• {text}

') return "\n".join(parts) def _assemble_slide( templates_dir: Path, title: str, conclusion: str, zone_htmls: list[tuple[str, str]], zone_csses: list[str], total_zones: int, ) -> str: """slide-base.html에 zone들을 조립.""" from jinja2 import Environment, FileSystemLoader # slide-base 로드 slide_base_path = templates_dir / "blocks" / "slide-base.html" raw = slide_base_path.read_text(encoding="utf-8") raw = re.sub(r'', '', raw) # body HTML 구성 body_parts = [] weight = 1.0 / max(total_zones, 1) for i, (zone_title, html) in enumerate(zone_htmls): height_pct = int(weight * 96) margin = "margin-bottom:2%;" if i < total_zones - 1 else "" body_parts.append( f'

' f'

' f'{html}

' ) body_html = "\n".join(body_parts) # {% block body %} 치환 raw = raw.replace("{% block body %}{% endblock %}", body_html) # 블록 CSS 합치기 extra_css = "\n".join(zone_csses) # Jinja2 렌더 (include 지원) env = Environment(loader=FileSystemLoader(str(templates_dir))) template = env.from_string(raw) result = template.render( title=title, footer_text=conclusion, footer_pill_bg="", ) # 블록 CSS를 head의 첫 앞에 삽입 if extra_css and '' in result: result = result.replace('', f'\n{extra_css}\n', 1) # body 안에 ', body_part) if body_styles: body_part = re.sub(r'', '', body_part) head_part = head_part.replace('', f'\n{chr(10).join(body_styles)}\n', 1) result = head_part + body_part # asset 임베딩 (svg/ 경로 → base64) from src.block_assembler import _embed_slide_assets result = _embed_slide_assets(result, templates_dir) return result def _save_step_html(path: Path, title: str, lines: list[str]): """단계별 디버그 HTML 저장.""" content = "\n".join(f"

{line}

" for line in lines) html = f"""

{title}

{content} """ path.write_text(html, encoding="utf-8") def _safe_serialize(obj): """JSON 직렬화 가능하도록 변환.""" if isinstance(obj, dict): return {k: _safe_serialize(v) for k, v in obj.items()} if isinstance(obj, list): return [_safe_serialize(v) for v in obj] if isinstance(obj, (str, int, float, bool, type(None))): return obj return str(obj)