docs + V4 catalog + samples + Phase Q legacy 보존

전체 26 files (20 추가 + 6 수정), 10507 insertions. Phase Z 문서 : - docs/architecture/PHASE-Z-CHANGE-LOG.md (신설) — axis-by-axis 의사결정 history (newest-on-top). Step 7-A 부터 6 entry 박힘 + 2026-05-08 / 2026-05-08 #2 (compat 매트릭스 폐기 / 6-B 폐기 / F14 표현 정정 / label gate policy 분리). - docs/architecture/PHASE-Z-PIPELINE-OVERVIEW.md (수정) — Step 5/6/9 Gap note append (구조 무변, append-only). 6-B 폐기 사실 + Refinement F. - docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md (수정) — snapshot date 2026-05-08 갱신. §3 핵심 missing item 5 (Step 5/6/9 boundary axis breakdown + 폐기 기록). §6 한 줄 갱신 — 다음 axis 후보 A~F. Project root docs : - PLAN.md / PROGRESS.md / README.md (수정) — 토큰 체계 / 폴더 구조 / 설계 문서 / 역할 분리 반영. - IMPROVEMENT-REDESIGN.md (신설) — Phase Z 설계 핵심 문서. - PROCESS_OVERVIEW.html (신설) — 파이프라인 개요 시각. - docs/tasks/* (신설) — Phase Z task 문서. V4 catalog (Phase Z runtime 필수 의존성) : - tests/matching/v4_full32_result.yaml (신설, 4888 줄) — V4 매칭 결과 32 frame × 10 MDX section. lookup_v4_match() / lookup_v4_candidates() 가 본 파일 read. Phase Z runtime 이 *없으면 즉시 abort* — clone 후 즉시 동작 가능 보장. Samples : - samples/mdx_batch/04.mdx (신설) — MDX04 기본 sample. - samples/mdx/04. DX 지연 요인.mdx (신설) — MDX04 원본. Phase Q legacy 보존 (별 axis "Phase Q audit & salvage" 영역) : - src/block_matcher_tfidf.py / catalog_blocks.py / frame_extractor.py / pipeline_v2.py — Phase Q (옛 파이프라인) src 신규 untracked 파일들. Phase Z runtime 와 의존성 0. Phase Q audit axis 에서 검토 예정. - scripts/eval_block_matcher.py / fetch_all_frame_screenshots.py / match_17_units_my_matcher.py / match_mdx_strict.py / match_mdx_to_frames_tfidf.py / ocr_augment_texts.py / run_pipeline_v2.py / previews/ — Phase Q 작업 시 사용한 옛 script. 같이 보존. - run_mdx03_pipeline.py (수정) — Phase Q 진입점 (no flag) + Phase Z 진입점 (--phase-z2 flag) 동시 wrapper. Phase Z 만 사용 시 `python -m src.phase_z2_pipeline samples/mdx_batch/03.mdx <run_id>` 직접 호출. 비-scope : - tests/matching/ (v4_full32_result.yaml 외 ~63MB) — V4 진화 history / reports / DECK / ATTACH. Phase Q audit axis 에서 검토. 
- tests/pipeline/ (~15MB) — pipeline data. Phase Q audit 영역. - templates/catalog/blocks.yaml — 옛 block catalog. Phase Q audit. - templates/phase_z2/frames/ — 옛 frame partial 위치. Phase Q audit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 09:47:58 +09:00
parent ec83405770
commit 85c680f02a
26 changed files with 10507 additions and 46 deletions
--- a/src/block_matcher_tfidf.py
+++ b/src/block_matcher_tfidf.py
@@ -0,0 +1,185 @@
+"""TF-IDF 기반 블록 매칭 엔진.
+
+texts.md의 원본 텍스트를 직접 사용 — keywords 수동 생성 불필요.
+frame_extractor가 텍스트를 추출하고, 여기서 TF-IDF 유사도를 계산.
+
+사용법:
+    matcher = TfidfBlockMatcher()
+    result = matcher.match("DX 시행을 위한 필수 요건", ["기술(디지털)", "사람(역량)"])
+"""
+from __future__ import annotations
+
+import logging
+import math
+import re
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+from src.frame_extractor import extract_all_frames
+
+logger = logging.getLogger(__name__)
+
+
+class TfidfBlockMatcher:
+    """TF-IDF block matcher built directly on each frame's ``texts.md`` text.
+
+    One document is indexed per extracted frame: the frame's ``all_text``
+    plus, when a catalog entry's ``source_frame`` refers to that frame, the
+    entry's ``when`` and ``description`` text.  Queries are compared with
+    those documents by cosine similarity over TF-IDF vectors whose terms are
+    whitespace-separated tokens.
+    """
+
+    def __init__(
+        self,
+        blocks_dir: str | Path = "figma_to_html_agent/blocks",
+        catalog_path: str | Path = "templates/catalog/blocks.yaml",
+    ):
+        """Extract every frame, merge catalog text and precompute the index.
+
+        Args:
+            blocks_dir: Directory passed to ``extract_all_frames``.
+            catalog_path: YAML catalog file; a missing or unreadable file
+                results in an empty catalog.
+        """
+        self.frames: list[dict] = extract_all_frames(blocks_dir)
+        self.catalog = self._load_catalog(catalog_path)
+
+        # One document (and one id) per frame, kept index-aligned with
+        # ``self.frames``.
+        self.doc_texts: list[str] = []
+        self.doc_ids: list[str] = []
+        for frame in self.frames:
+            text = frame.get("all_text", "")
+            cat_entry = self._find_catalog_entry(frame["frame_id"])
+            if cat_entry:
+                # ``or ""`` keeps a YAML null from raising TypeError on
+                # string concatenation.
+                text += " " + str(cat_entry.get("when") or "")
+                text += " " + str(cat_entry.get("description") or "")
+                self.doc_ids.append(cat_entry.get("id", frame["frame_id"]))
+            else:
+                self.doc_ids.append(frame["frame_id"])
+            self.doc_texts.append(text)
+
+        self.idf = self._compute_idf(self.doc_texts)
+        # Document vectors do not depend on the query, so build them once
+        # instead of on every match() call.
+        self._doc_vecs: list[dict[str, float]] = [
+            self._tfidf_vec(text) for text in self.doc_texts
+        ]
+        logger.info(f"[tfidf] {len(self.frames)}개 프레임 인덱싱 완료 (texts.md 직접 사용)")
+
+    def _load_catalog(self, path: Path | str) -> list[dict]:
+        """Load catalog entries from *path*; return ``[]`` when unavailable.
+
+        Accepts either a top-level list or a mapping with a ``blocks`` list.
+        Read and parse errors are logged and treated as an empty catalog so
+        matching can still run on frame text alone.
+        """
+        path = Path(path)
+        if not path.exists():
+            return []
+        import yaml
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8"))
+        except (OSError, ValueError, yaml.YAMLError) as e:
+            logger.warning(f"[tfidf] catalog 로드 실패: {e}")
+            return []
+        if isinstance(data, dict):
+            data = data.get("blocks")
+        if not isinstance(data, list):
+            # Empty document, ``blocks: null`` or a scalar: nothing usable.
+            return []
+        # Non-mapping items cannot be queried with .get() later on.
+        return [entry for entry in data if isinstance(entry, dict)]
+
+    def _find_catalog_entry(self, frame_id: str) -> dict | None:
+        """Return the first catalog entry whose ``source_frame`` is *frame_id*.
+
+        Both sides are compared as strings so that an unquoted numeric
+        ``source_frame`` in YAML (parsed as int) still matches a string
+        frame id.
+        """
+        wanted = str(frame_id)
+        for entry in self.catalog:
+            source = entry.get("source_frame")
+            if source is not None and str(source) == wanted:
+                return entry
+        return None
+
+    def _compute_idf(self, documents: list[str]) -> dict[str, float]:
+        """Compute ``log(N / (df + 1))`` for every whitespace token.
+
+        NOTE(review): this weight is 0 for a term found in N-1 documents and
+        negative for a term found in all N.  It is kept as-is because the
+        score thresholds used by callers depend on it — confirm before
+        switching to a smoothed variant.
+        """
+        n_docs = len(documents)
+        doc_freq: Counter = Counter()
+        for doc in documents:
+            doc_freq.update(set(doc.split()))
+        return {w: math.log(n_docs / (freq + 1)) for w, freq in doc_freq.items()}
+
+    def _tfidf_vec(self, text: str) -> dict[str, float]:
+        """Turn *text* into a sparse ``{token: tf * idf}`` vector.
+
+        Tokens missing from the index get the weight ``log(N + 1)``.
+        """
+        words = text.split()
+        tf = Counter(words)
+        total = len(words) if words else 1
+        unseen_idf = math.log(len(self.doc_texts) + 1)
+        return {w: (count / total) * self.idf.get(w, unseen_idf) for w, count in tf.items()}
+
+    def _cosine(self, a: dict[str, float], b: dict[str, float]) -> float:
+        """Cosine similarity of two sparse vectors; 0.0 if either has zero norm."""
+        # Only shared keys contribute to the dot product, so walk the
+        # smaller vector and probe the larger one.
+        small, large = (a, b) if len(a) <= len(b) else (b, a)
+        dot = sum(v * large[k] for k, v in small.items() if k in large)
+        mag_a = math.sqrt(sum(v ** 2 for v in a.values())) if a else 0
+        mag_b = math.sqrt(sum(v ** 2 for v in b.values())) if b else 0
+        if mag_a == 0 or mag_b == 0:
+            return 0.0
+        return dot / (mag_a * mag_b)
+
+    def _preprocess_query(self, text: str) -> str:
+        """Normalize MDX query text with the same rules used for frame text."""
+        text = text.replace("S/W", "SW 소프트웨어")
+        text = text.replace("H/W", "HW 하드웨어")
+        text = re.sub(r'\bDX\b', 'DX 디지털전환', text)
+        text = re.sub(r'\bBIM\b', 'BIM 건설정보모델링', text)
+        text = text.replace("(", " ").replace(")", " ")
+        text = text.replace("[", " ").replace("]", " ")
+        text = re.sub(r'[·•→←↔×+/]', ' ', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+
+    def _doc_vectors(self) -> list[dict[str, float]]:
+        """Return cached document vectors, rebuilding if the corpus size changed.
+
+        The cache is also built lazily so instances created without
+        ``__init__`` keep working.  It is not refreshed when ``doc_texts`` or
+        ``idf`` are edited in place without changing the document count.
+        """
+        cached = getattr(self, "_doc_vecs", None)
+        if cached is None or len(cached) != len(self.doc_texts):
+            cached = [self._tfidf_vec(text) for text in self.doc_texts]
+            self._doc_vecs = cached
+        return cached
+
+    def match(
+        self,
+        zone_title: str,
+        sub_titles: list[str] | None = None,
+        d1_items: list[str] | None = None,
+        top_k: int = 3,
+    ) -> list[dict]:
+        """Rank frames against a zone title plus optional sub-titles / D1 items.
+
+        Args:
+            zone_title: Mid-level heading text of the zone.
+            sub_titles: Optional sub-headings appended to the query.
+            d1_items: Optional first-level items appended to the query.
+            top_k: Number of top-ranked documents inspected.
+
+        Returns:
+            Up to ``top_k`` dicts ordered by descending score, each with
+            ``block_id``, ``frame_id``, ``score`` (rounded to 4 decimals),
+            ``method``, ``title_text``, ``rough_structure`` and
+            ``item_count``.  Candidates scoring 0 or less are omitted.
+        """
+        if not self.doc_texts:
+            return []
+
+        parts = [zone_title]
+        if sub_titles:
+            parts.extend(sub_titles)
+        if d1_items:
+            parts.extend(d1_items)
+        query = self._preprocess_query(" ".join(parts))
+
+        query_vec = self._tfidf_vec(query)
+        scores = [
+            (i, self._cosine(query_vec, doc_vec))
+            for i, doc_vec in enumerate(self._doc_vectors())
+        ]
+
+        # Stable sort: ties keep frame order.
+        scores.sort(key=lambda x: -x[1])
+        results = []
+        for idx, score in scores[:top_k]:
+            if score <= 0:
+                continue
+            frame = self.frames[idx]
+            results.append({
+                "block_id": self.doc_ids[idx],
+                "frame_id": frame["frame_id"],
+                "score": round(score, 4),
+                "method": "tfidf",
+                "title_text": frame.get("title_text", ""),
+                "rough_structure": frame.get("rough_structure", ""),
+                "item_count": frame.get("item_count", 0),
+            })
+
+        if results:
+            logger.info(
+                f"[tfidf] '{zone_title}' → top: {results[0]['block_id']} "
+                f"(score={results[0]['score']}, frame={results[0]['frame_id']})"
+            )
+
+        return results
+
+    def match_with_threshold(
+        self,
+        zone_title: str,
+        sub_titles: list[str] | None = None,
+        d1_items: list[str] | None = None,
+        threshold: float = 0.10,
+    ) -> dict | None:
+        """Return the best match if its score reaches *threshold*, else ``None``.
+
+        ``None`` tells the caller to take the recipe path instead.
+        """
+        results = self.match(zone_title, sub_titles, d1_items, top_k=1)
+        if results and results[0]["score"] >= threshold:
+            return results[0]
+        return None
--- a/src/catalog_blocks.py
+++ b/src/catalog_blocks.py
@@ -0,0 +1,69 @@
+"""새 catalog 로더.
+
+기존 templates/catalog.yaml과 별도로,
+templates/catalog/blocks.yaml을 로드하는 모듈.
+
+기존 코드는 건드리지 않음.
+"""
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def load_blocks_catalog(
+    path: str | Path = "templates/catalog/blocks.yaml",
+) -> list[dict]:
+    """Load block entries from ``blocks.yaml``.
+
+    The document may be a top-level list of entries or a mapping holding the
+    list under ``blocks``.  A missing file, a read/parse failure, or a
+    document without a block list all yield ``[]`` (with a log message), so
+    callers never have to handle an exception here.
+
+    Returns:
+        A list of block dicts.  Per the catalog format, an entry carries
+        keys such as ``id``, ``structure_type``, ``keywords``, ``slots``,
+        ``recipe_compat``, ``not_for``, ``template`` and ``when``.
+    """
+    path = Path(path)
+    if not path.exists():
+        logger.warning(f"[catalog] blocks.yaml 없음: {path}")
+        return []
+
+    # Imported only once the file is known to exist, so the missing-file
+    # path does not require PyYAML.
+    import yaml
+
+    try:
+        data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    except (OSError, ValueError, yaml.YAMLError) as e:
+        logger.error(f"[catalog] 로드 실패: {e}")
+        return []
+
+    blocks = data.get("blocks") if isinstance(data, dict) else data
+    if not isinstance(blocks, list):
+        # Empty document, ``blocks: null`` or a scalar: treat as no blocks
+        # rather than surfacing an incidental AttributeError/TypeError.
+        blocks = []
+    logger.info(f"[catalog] {len(blocks)}개 블록 로드: {path}")
+    return blocks
+
+
+def find_block_by_id(
+    blocks: list[dict],
+    block_id: str,
+) -> dict | None:
+    """Return the first block whose ``id`` equals *block_id*, or ``None``."""
+    for candidate in blocks:
+        if candidate.get("id") == block_id:
+            return candidate
+    return None
+
+
+def filter_blocks_by_structure(
+    blocks: list[dict],
+    structure_type: str,
+) -> list[dict]:
+    """Keep only the blocks whose ``structure_type`` equals *structure_type*."""
+    selected = []
+    for block in blocks:
+        if block.get("structure_type") != structure_type:
+            continue
+        selected.append(block)
+    return selected
+
+
+def filter_blocks_by_recipe(
+    blocks: list[dict],
+    recipe: str,
+) -> list[dict]:
+    """Keep only the blocks that list *recipe* in ``recipe_compat``.
+
+    A missing or null ``recipe_compat`` means "compatible with nothing".  A
+    bare string is treated as a one-element list, so it is compared for
+    equality rather than substring-matched.
+    """
+    selected = []
+    for block in blocks:
+        compat = block.get("recipe_compat") or []
+        if isinstance(compat, str):
+            compat = [compat]
+        if recipe in compat:
+            selected.append(block)
+    return selected
--- a/src/frame_extractor.py
+++ b/src/frame_extractor.py
@@ -0,0 +1,205 @@
+"""프레임별 텍스트 + 메타 추출기.
+
+figma_to_html_agent/blocks/{frame_id}/texts.md를 파싱하여
+TF-IDF 매칭용 데이터 구조를 만든다.
+
+keywords 수동 생성 불필요 — texts.md의 원본 텍스트를 직접 사용.
+"""
+from __future__ import annotations
+
+import logging
+import re
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+def extract_frame_meta(texts_md_path: Path) -> dict:
+    """Parse one frame's ``texts.md`` into the metadata used for matching.
+
+    The frame id is the name of the directory containing the file.  The
+    file is split into sections at ``## `` headings; each section's lines
+    are then sorted into title / subtitle / body text based on keywords in
+    the heading and, for most sections, on line length.
+
+    Args:
+        texts_md_path: Path to a ``texts.md`` file.
+
+    Returns:
+        ``{}`` if the file does not exist, otherwise a dict with:
+        ``frame_id``, ``title_text``, ``subtitle_texts``, ``body_texts``,
+        ``all_text`` (title + subtitles + body, normalized for TF-IDF),
+        ``item_count`` (number of subtitle lines), ``rough_structure`` and
+        ``sections`` (``[{"heading": str, "lines": [str, ...]}, ...]``).
+    """
+    if not texts_md_path.exists():
+        return {}
+
+    content = texts_md_path.read_text(encoding="utf-8")
+    frame_id = texts_md_path.parent.name
+
+    # --- Split into sections at "## " headings ---
+    sections: list[dict] = []
+    current_heading = ""
+    current_lines: list[str] = []
+
+    for raw_line in content.split("\n"):
+        line = raw_line.strip()
+        if line.startswith("## "):
+            if current_heading or current_lines:
+                sections.append({"heading": current_heading, "lines": current_lines})
+            # Slice off the marker instead of lstrip("# "): lstrip removes a
+            # character *set* and would also eat leading '#' of the text.
+            current_heading = line[3:].strip()
+            current_lines = []
+        elif line.startswith("### "):
+            # Sub-section headings are kept as ordinary lines of the section.
+            current_lines.append(line[4:].strip())
+        elif line.startswith(("# ", ">")):
+            # Top-level title (frame id) and block quotes are skipped.
+            continue
+        elif line.startswith("- "):
+            # Drop only the bullet marker; lstrip("- ") would also strip a
+            # leading minus sign from the item text (e.g. "- -5%").
+            current_lines.append(line[2:].strip())
+        elif line and not line.startswith("-"):
+            current_lines.append(line)
+        # Remaining case: lines starting with "-" but not "- " are dropped.
+
+    if current_heading or current_lines:
+        sections.append({"heading": current_heading, "lines": current_lines})
+
+    # --- Classify section lines by heading keyword ---
+    title_text = ""
+    subtitle_texts: list[str] = []
+    body_texts: list[str] = []
+
+    def split_by_length(lines: list[str], limit: int) -> None:
+        """Lines shorter than *limit* become subtitles, the rest body text."""
+        for text in lines:
+            (subtitle_texts if len(text) < limit else body_texts).append(text)
+
+    for sec in sections:
+        heading = sec["heading"].lower()
+        lines = sec["lines"]
+
+        if "타이틀" in heading or "제목" in heading:
+            title_text = " ".join(lines)
+        elif "서브" in heading or "헤더" in heading or "카테고리" in heading:
+            subtitle_texts.extend(lines)
+        elif "열" in heading or "col" in heading:
+            # Column sections
+            split_by_length(lines, 20)
+        elif "행" in heading or "row" in heading:
+            # Row sections use a tighter subtitle limit
+            split_by_length(lines, 15)
+        elif "결론" in heading or "요약" in heading:
+            body_texts.extend(lines)
+        else:
+            # Anything else: decide by length only
+            split_by_length(lines, 20)
+
+    rough_structure = _guess_structure(sections, subtitle_texts)
+
+    # Full text for TF-IDF, normalized the same way queries are.
+    all_parts = [title_text] + subtitle_texts + body_texts
+    all_text = _preprocess_text(" ".join(all_parts))
+
+    return {
+        "frame_id": frame_id,
+        "title_text": title_text.strip(),
+        "subtitle_texts": subtitle_texts,
+        "body_texts": body_texts,
+        "all_text": all_text,
+        "item_count": len(subtitle_texts),
+        "rough_structure": rough_structure,
+        "sections": sections,
+    }
+
+
+def _guess_structure(sections: list[dict], subtitles: list[str]) -> str:
+    """Guess a coarse layout label from section headings and subtitle count.
+
+    Checks run in priority order: columns ("3col" / "2col"), rows ("rows"),
+    a left-side heading ("2col-compare"), a table heading ("table"), then
+    three or more subtitles ("list"); otherwise "unknown".
+    """
+    lowered = [sec["heading"].lower() for sec in sections]
+
+    def count_with(*needles: str) -> int:
+        # Number of headings containing at least one of the needles.
+        return sum(1 for h in lowered if any(n in h for n in needles))
+
+    columns = count_with("열", "col")
+    if columns >= 3:
+        return "3col"
+    if columns >= 2:
+        return "2col"
+
+    if count_with("행", "row") >= 2:
+        return "rows"
+
+    if count_with("좌", "left"):
+        return "2col-compare"
+
+    if count_with("표", "table"):
+        return "table"
+
+    return "list" if len(subtitles) >= 3 else "unknown"
+
+
+def _preprocess_text(text: str) -> str:
+    """Normalize text before TF-IDF tokenization.
+
+    Steps, in order: expand a few abbreviations into "abbreviation + Korean
+    term", replace round/square brackets with spaces (their content is
+    kept), replace a fixed set of symbols with spaces, then collapse runs of
+    whitespace and trim.
+    """
+    # Literal abbreviation expansions
+    for short, expanded in (("S/W", "SW 소프트웨어"), ("H/W", "HW 하드웨어")):
+        text = text.replace(short, expanded)
+
+    # Whole-word expansions
+    text = re.sub(r'\bDX\b', 'DX 디지털전환', text)
+    text = re.sub(r'\bBIM\b', 'BIM 건설정보모델링', text)
+
+    # Brackets become spaces so the enclosed words survive as tokens
+    for bracket in ("(", ")", "[", "]"):
+        text = text.replace(bracket, " ")
+
+    # Symbols become spaces; whitespace is then squeezed
+    without_symbols = re.sub(r'[·•→←↔×+/]', ' ', text)
+    return re.sub(r'\s+', ' ', without_symbols).strip()
+
+
+def extract_all_frames(
+    blocks_dir: str | Path = "figma_to_html_agent/blocks",
+) -> list[dict]:
+    """Collect metadata for every frame directory that has a ``texts.md``.
+
+    Sub-directories of *blocks_dir* are visited in sorted order; entries
+    that are not directories, lack ``texts.md``, or yield empty metadata are
+    skipped.  A missing *blocks_dir* is logged and gives ``[]``.
+
+    Returns:
+        A list of the dicts produced by ``extract_frame_meta``.
+    """
+    blocks_dir = Path(blocks_dir)
+    if not blocks_dir.exists():
+        logger.warning(f"[extractor] blocks 폴더 없음: {blocks_dir}")
+        return []
+
+    frames = []
+    frame_dirs = (entry for entry in sorted(blocks_dir.iterdir()) if entry.is_dir())
+    for frame_dir in frame_dirs:
+        texts_md = frame_dir / "texts.md"
+        if not texts_md.exists():
+            continue
+        meta = extract_frame_meta(texts_md)
+        if not meta:
+            continue
+        frames.append(meta)
+        logger.debug(f"[extractor] {meta['frame_id']}: {meta['title_text']} ({meta['rough_structure']})")
+
+    logger.info(f"[extractor] {len(frames)}개 프레임 추출 완료")
+    return frames
--- a/src/pipeline_v2.py
+++ b/src/pipeline_v2.py
@@ -0,0 +1,356 @@
+"""Pipeline v2: TF-IDF 기반 블록 매칭 + 렌더링 파이프라인.
+
+기존 pipeline.py를 건드리지 않고, 새 매칭/렌더링 엔진으로 동작하는 별도 파이프라인.
+
+입출력 계약:
+  입력: MDX 텍스트 + base_path
+  출력: data/runs/{run_id}/ 에 final.html + 단계별 context
+
+흐름:
+  1. MDX 정규화
+  2. zone 구분 (중목차 기준)
+  3. TF-IDF 블록 매칭 (direct-fit / recipe 분기)
+  4. 블록 렌더링 (템플릿 로드 + 슬롯 삽입)
+  5. slide-base 조립
+  6. 저장
+"""
+from __future__ import annotations
+
+import json
+import logging
+import re
+import time
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from src.block_matcher_tfidf import TfidfBlockMatcher
+from src.catalog_blocks import load_blocks_catalog, find_block_by_id
+
+logger = logging.getLogger(__name__)
+
+
+def generate_slide_v2(
+    mdx_content: str,
+    base_path: str = "",
+    catalog_path: str = "templates/catalog/blocks.yaml",
+    threshold: float = 0.15,
+) -> dict:
+    """Run the v2 pipeline: MDX text -> slide HTML saved under ``data/runs``.
+
+    Steps: normalize the MDX, split it into zones (major sections), match
+    each zone to a block with TF-IDF, render each zone (block template for a
+    direct fit, bullet fallback otherwise), assemble the slide, and write
+    ``final.html``, ``final_context.json`` and per-step debug HTML files.
+
+    Args:
+        mdx_content: Raw MDX text.
+        base_path: Accepted for interface compatibility; not used by this
+            function.
+        catalog_path: Block catalog YAML used for both matching and
+            template lookup.
+        threshold: Minimum TF-IDF score for a zone to take the direct-fit
+            path; lower-scoring zones take the recipe (fallback) path.
+
+    Returns:
+        {"run_id": str, "run_dir": str, "final_html": str, "steps": dict}
+    """
+    templates_dir = Path("templates")
+    # NOTE: second resolution — two runs started within the same second
+    # share one run directory.
+    run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+    run_dir = Path("data/runs") / run_id
+    run_dir.mkdir(parents=True, exist_ok=True)
+    steps_dir = run_dir / "steps"
+    steps_dir.mkdir(exist_ok=True)
+
+    steps = {}
+
+    # == Step 1: normalize MDX ==
+    logger.info("[v2] Step 1: MDX 정규화")
+    from src.mdx_normalizer import normalize_mdx_content
+    normalized = normalize_mdx_content(mdx_content)
+    sections = normalized.get("sections", [])
+    slide_title = normalized.get("title", "")
+    steps["step1_normalize"] = {
+        "title": slide_title,
+        "sections_count": len(sections),
+        "sections": sections,
+    }
+
+    _save_step_html(steps_dir / "step1_normalize.html", "Step 1: MDX 정규화", [
+        f"<b>title:</b> {slide_title}",
+        f"<b>sections:</b> {len(sections)}개",
+        *[f"  level={s['level']} | {s['title']} | {len(s.get('content',''))}자"
+          for s in sections],
+    ])
+
+    # == Step 2: split into zones ==
+    logger.info("[v2] Step 2: zone 구분")
+    from src.section_parser import extract_major_sections
+    major_sections = extract_major_sections(sections)
+    steps["step2_zones"] = [
+        {"title": s["title"], "sub_titles": s["sub_titles"], "content_len": len(s.get("content", ""))}
+        for s in major_sections
+    ]
+
+    _save_step_html(steps_dir / "step2_zones.html", "Step 2: zone 구분", [
+        *[f"<b>zone {i+1}:</b> {s['title']} | sub_titles={s['sub_titles']} | {len(s.get('content',''))}자"
+          for i, s in enumerate(major_sections)],
+    ])
+
+    # == Step 3: TF-IDF block matching ==
+    logger.info("[v2] Step 3: TF-IDF 블록 매칭")
+    # catalog_path must be passed by keyword: the matcher's first positional
+    # parameter is blocks_dir, so a positional call would point the frame
+    # extractor at the YAML file instead of the blocks directory.
+    matcher = TfidfBlockMatcher(catalog_path=catalog_path)
+    catalog = load_blocks_catalog(catalog_path)
+
+    match_results = {}
+    # Per-zone info in zone order.  match_results is keyed by title, so
+    # zones sharing a title overwrite each other there; rendering reads from
+    # this list so each zone keeps its own content.
+    zone_infos = []
+    step3_lines = []
+    for sec in major_sections:
+        title = sec["title"]
+        sub_titles = sec["sub_titles"]
+        content = sec.get("content", "")
+        d1_items = [re.sub(r'\*+', '', d).strip()
+                    for d in re.findall(r'^D1:\s*(.*)', content, re.MULTILINE)]
+
+        top3 = matcher.match(title, sub_titles, d1_items, top_k=3)
+        # The best candidate is the head of top3 (same ranking, same
+        # rounded score), so apply the threshold here instead of scoring
+        # every frame a second time via match_with_threshold().
+        best = top3[0] if top3 and top3[0]["score"] >= threshold else None
+
+        info = {
+            "best": best,
+            "top3": top3,
+            "path": "direct-fit" if best else "recipe",
+            "sub_titles": sub_titles,
+            "content": content,
+            "d1_items": d1_items,
+        }
+        match_results[title] = info
+        zone_infos.append(info)
+
+        step3_lines.append(f"<b>zone:</b> {title}")
+        step3_lines.append(f"  sub_titles: {sub_titles}")
+        if best:
+            step3_lines.append(f"  → direct-fit: {best['block_id']} (score={best['score']})")
+        else:
+            step3_lines.append(f"  → recipe 경로")
+        for j, c in enumerate(top3):
+            step3_lines.append(f"    #{j+1}: {c['block_id']} (score={c['score']})")
+        step3_lines.append("")
+
+    steps["step3_matching"] = match_results
+
+    _save_step_html(steps_dir / "step3_matching.html", "Step 3: TF-IDF 블록 매칭", step3_lines)
+
+    # == Step 4: render blocks + assemble slide-base ==
+    logger.info("[v2] Step 4: 블록 렌더링 + 조립")
+
+    # Conclusion text, with one leading bullet marker removed.
+    from src.section_parser import extract_conclusion_text
+    conclusion = extract_conclusion_text(mdx_content) or ""
+    conclusion = re.sub(r'^[\*•\-]\s*', '', conclusion).strip()
+
+    zone_htmls = []
+    zone_csses = []
+    total_zones = len(major_sections)
+
+    for sec, info in zip(major_sections, zone_infos):
+        title = sec["title"]
+
+        if info["path"] == "direct-fit" and info["best"]:
+            # Direct fit: resolve the template from the catalog entry, or
+            # fall back to the conventional structures/ location.
+            block_id = info["best"]["block_id"]
+            block_meta = find_block_by_id(catalog, block_id)
+            template_path = ""
+            if block_meta:
+                template_path = block_meta.get("template", "")
+            if not template_path:
+                template_path = f"blocks/structures/{block_id}.html"
+            if (templates_dir / template_path).exists():
+                html, css = _render_block_template(
+                    templates_dir, template_path, title, info
+                )
+                zone_htmls.append((title, html))
+                if css:
+                    zone_csses.append(css)
+                logger.info(f"[v2] {title} → block '{block_id}' 렌더")
+            else:
+                html = _render_fallback(title, info)
+                zone_htmls.append((title, html))
+                # The failing condition is a missing template file, so say so.
+                logger.warning(
+                    f"[v2] {title} → block '{block_id}' 템플릿 없음 ({template_path}), fallback"
+                )
+        else:
+            # Recipe path: render the zone directly as bullets.
+            html = _render_fallback(title, info)
+            zone_htmls.append((title, html))
+            logger.info(f"[v2] {title} → recipe fallback render")
+
+    final_html = _assemble_slide(
+        templates_dir, slide_title, conclusion,
+        zone_htmls, zone_csses, total_zones,
+    )
+
+    steps["step4_render"] = {
+        "zones": [{"title": t, "html_len": len(h)} for t, h in zone_htmls],
+    }
+
+    _save_step_html(steps_dir / "step4_render.html", "Step 4: 렌더링", [
+        *[f"<b>{t}:</b> {len(h)}자 HTML" for t, h in zone_htmls],
+    ])
+
+    # == Step 5: save ==
+    logger.info("[v2] Step 5: 저장")
+    (run_dir / "final.html").write_text(final_html, encoding="utf-8")
+
+    context = {
+        "run_id": run_id,
+        "title": slide_title,
+        "conclusion": conclusion,
+        "steps": {k: _safe_serialize(v) for k, v in steps.items()},
+        "match_results": {k: _safe_serialize(v) for k, v in match_results.items()},
+    }
+    (run_dir / "final_context.json").write_text(
+        json.dumps(context, ensure_ascii=False, indent=2), encoding="utf-8"
+    )
+
+    logger.info(f"[v2] 완료: {run_dir}")
+    # "steps" is part of the documented return contract.
+    return {
+        "run_id": run_id,
+        "run_dir": str(run_dir),
+        "final_html": final_html,
+        "steps": steps,
+    }
+
+
+def _render_block_template(
+    templates_dir: Path,
+    template_path: str,
+    zone_title: str,
+    info: dict,
+) -> tuple[str, str]:
+    """Load a block template and return ``(html, css)``.
+
+    The template markup is used as-is (no slot substitution yet).  The
+    contents of every ``<style>`` element are pulled out and returned
+    separately so the caller can move them into the document head; HTML
+    comments are removed from the markup.  If the template file is missing,
+    the fallback rendering is returned with empty CSS.
+    """
+    full_path = templates_dir / template_path
+    if not full_path.exists():
+        logger.warning(f"[v2] 템플릿 없음: {full_path}")
+        return _render_fallback(zone_title, info), ""
+
+    source = full_path.read_text(encoding="utf-8")
+
+    # Collect the CSS, then cut the <style> elements out of the markup
+    stylesheet = "\n".join(re.findall(r'<style>(.*?)</style>', source, re.DOTALL))
+    markup = re.sub(r'<style>.*?</style>', '', source, flags=re.DOTALL).strip()
+
+    # Drop HTML comments
+    markup = re.sub(r'<!--[\s\S]*?-->', '', markup).strip()
+
+    return markup, stylesheet
+
+
+def _render_fallback(zone_title: str, info: dict) -> str:
+    """Render a zone as a title plus ``.bul`` bullet rows when no block fits.
+
+    If ``info["d1_items"]`` is non-empty, each item becomes one bullet and an
+    item containing ``": "`` is split into a bold head and a description.
+    Otherwise ``info["content"]`` is scanned line by line and only ``D1:`` /
+    ``D2:`` lines are rendered (D1 bold, D2 indented); blank lines, image
+    lines and any other text are skipped.
+    """
+    rows = [
+        f'<div class="zone-title" style="font-size:var(--font-zone-title);font-weight:700;color:var(--color-zone-title);margin-bottom:var(--heading-gap);">{zone_title}</div>'
+    ]
+
+    d1_items = info.get("d1_items", [])
+    content = info.get("content", "")
+
+    if d1_items:
+        for item in d1_items:
+            if ": " not in item:
+                rows.append(f'<div class="bul">• {item}</div>')
+                continue
+            h, d = item.split(": ", 1)
+            rows.append(f'<div class="bul">• <strong>{h}</strong>: {d}</div>')
+        return "\n".join(rows)
+
+    if not content:
+        return "\n".join(rows)
+
+    for raw in content.split("\n"):
+        line = raw.strip()
+        # Skip blanks and image references
+        if not line or line.startswith("![") or line.startswith("[이미지:"):
+            continue
+        d1 = re.match(r'^D1:\s*(.*)', line)
+        if d1:
+            text = re.sub(r'\*+', '', d1.group(1)).strip()
+            rows.append(f'<div class="bul">• <strong>{text}</strong></div>')
+            continue
+        d2 = re.match(r'^D2:\s*(.*)', line)
+        if d2:
+            text = re.sub(r'\*+', '', d2.group(1)).strip()
+            rows.append(f'<div class="bul">  • {text}</div>')
+
+    return "\n".join(rows)
+
+
+def _assemble_slide(
+    templates_dir: Path,
+    title: str,
+    conclusion: str,
+    zone_htmls: list[tuple[str, str]],
+    zone_csses: list[str],
+    total_zones: int,
+) -> str:
+    """Assemble the rendered zones into ``blocks/slide-base.html``.
+
+    Args:
+        templates_dir: Template root; also used as the Jinja2 loader path.
+        title: Passed to the template as ``title``.
+        conclusion: Passed to the template as ``footer_text``.
+        zone_htmls: ``(zone_title, html)`` pairs, in display order.
+        zone_csses: CSS strings collected from block templates.
+        total_zones: Zone count used to size each zone wrapper.
+
+    Returns:
+        The final slide HTML after Jinja2 rendering, CSS relocation and
+        asset embedding.
+    """
+    from jinja2 import Environment, FileSystemLoader
+
+    # Load slide-base and strip HTML comments before any substitution.
+    slide_base_path = templates_dir / "blocks" / "slide-base.html"
+    raw = slide_base_path.read_text(encoding="utf-8")
+    raw = re.sub(r'<!--[\s\S]*?-->', '', raw)
+
+    # Build the body: every zone gets an equal integer share of 96% height,
+    # and every zone except the last gets a 2% bottom margin.
+    body_parts = []
+    weight = 1.0 / max(total_zones, 1)
+    for i, (zone_title, html) in enumerate(zone_htmls):
+        height_pct = int(weight * 96)
+        margin = "margin-bottom:2%;" if i < total_zones - 1 else ""
+        body_parts.append(
+            f'<div style="height:{height_pct}%;{margin}padding-top:var(--space-xs);">'
+            f'<div style="height:100%;overflow:hidden;padding:0 var(--zone-padding-right) 0 var(--zone-padding-left);">'
+            f'{html}</div></div>'
+        )
+
+    body_html = "\n".join(body_parts)
+
+    # Substitute the empty body block by plain text replacement.  This only
+    # takes effect if the template contains this exact marker string.
+    raw = raw.replace("{% block body %}{% endblock %}", body_html)
+
+    # Merge the CSS collected from block templates.
+    extra_css = "\n".join(zone_csses)
+
+    # Render through Jinja2; the file-system loader makes template includes
+    # resolvable.  Note the zone HTML is already part of ``raw`` here, so it
+    # is processed as template source too.
+    env = Environment(loader=FileSystemLoader(str(templates_dir)))
+    template = env.from_string(raw)
+    result = template.render(
+        title=title,
+        footer_text=conclusion,
+        footer_pill_bg="",
+    )
+
+    # Insert the block CSS just before the first </style> in the output.
+    if extra_css and '</style>' in result:
+        result = result.replace('</style>', f'\n{extra_css}\n</style>', 1)
+
+    # Safety net: move any <style> left inside the body into the head.
+    body_start = result.find('<body')
+    if body_start > 0:
+        head_part = result[:body_start]
+        body_part = result[body_start:]
+        body_styles = re.findall(r'<style>([\s\S]*?)</style>', body_part)
+        if body_styles:
+            body_part = re.sub(r'<style>[\s\S]*?</style>', '', body_part)
+            # chr(10) is "\n"; used because a backslash is not allowed
+            # inside an f-string expression on older Python versions.
+            head_part = head_part.replace('</style>', f'\n{chr(10).join(body_styles)}\n</style>', 1)
+            result = head_part + body_part
+
+    # Asset embedding via a helper defined in src.block_assembler
+    # (presumably svg/ paths -> base64, per the original note; the helper
+    # is not visible here — confirm).
+    from src.block_assembler import _embed_slide_assets
+    result = _embed_slide_assets(result, templates_dir)
+
+    return result
+
+
+def _save_step_html(path: Path, title: str, lines: list[str]):
+    """Write a small debug HTML page for one pipeline step.
+
+    Each entry of *lines* is wrapped in its own ``<div>`` below a bold
+    *title* row.  Neither the title nor the lines are escaped.
+    """
+    wrapped = [f"<div>{line}</div>" for line in lines]
+    content = "\n".join(wrapped)
+    html = f"""<!DOCTYPE html><html><head><meta charset="UTF-8">
+<style>*{{margin:0;padding:0;box-sizing:border-box;}}
+body{{background:#e5e5e5;padding:10px;font-family:sans-serif;word-break:keep-all;font-size:12px;}}
+div{{margin-bottom:2px;}}</style>
+</head><body>
+<div style="font-size:16px;font-weight:bold;margin-bottom:8px;">{title}</div>
+{content}
+</body></html>"""
+    path.write_text(html, encoding="utf-8")
+
+
+def _safe_serialize(obj):
+    """Recursively convert *obj* into JSON-serializable values.
+
+    Dicts and lists are rebuilt with converted members; ``str``, ``int``,
+    ``float``, ``bool`` and ``None`` pass through unchanged; anything else
+    (tuples and paths included) is replaced by ``str(obj)``.
+    """
+    if isinstance(obj, dict):
+        converted = {}
+        for key, value in obj.items():
+            converted[key] = _safe_serialize(value)
+        return converted
+    if isinstance(obj, list):
+        return list(map(_safe_serialize, obj))
+    is_plain = isinstance(obj, (str, int, float, bool, type(None)))
+    return obj if is_plain else str(obj)