Add Phase Z runtime foundation

- add visual fit classifier, router, retry, and failure routing modules
- add composition planner and catalog-driven mapper
- add Phase Z pipeline orchestration and architecture docs
2026-05-04 08:21:28 +09:00
parent 79f0c55745
commit e7848b602d
11 changed files with 5465 additions and 0 deletions
--- a/src/phase_z2_composition.py
+++ b/src/phase_z2_composition.py
@@ -0,0 +1,571 @@
+"""Phase Z-2 Composition Planner v0.
+
+Pipeline 의 빠진 layer = MDX 덩어리들을 *최종 zone unit* 으로 묶는 결정 layer.
+
+위치 :
+  parse_mdx → align_sections_to_v4_granularity → [본 모듈] → render
+
+원칙 (절대 룰) :
+  - 특정 MDX / frame / section 하드코딩 X (예: "04-2 면" / "F16 이면")
+  - 모든 결정 = catalog 메타 + V4 evidence parametric
+  - 같은 코드가 MDX 02/03/04/05/06... 모두 처리 — 결과는 케이스마다 다름
+  - drilling 결과 = 입력 (재료), composition planner 결과 = 출력 (zone units)
+  - slide-level layout = zone 까지만 나눔. zone 내부 분할은 frame partial 책임
+
+8 layout preset vocabulary :
+  L1 single / L2 horizontal-2 / L3 vertical-2
+  L4 top-1-bottom-2 / L5 top-2-bottom-1
+  L6 left-1-right-2 / L7 left-2-right-1
+  L8 grid-2x2
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+# ─── 8 Layout Preset Vocabulary ────────────────────────────────
+
+# Registry of the eight slide-level layout presets, keyed by preset name.
+# Every entry carries the same six keys:
+#   zones     - number of zones the preset provides
+#   topology  - short tag naming the arrangement
+#   positions - zone names, listed in the order they first appear in css_areas
+#   css_areas / css_cols / css_rows - grid strings for the slide
+#               (presumably CSS grid-template-areas / -columns / -rows;
+#               confirm against the renderer)
+LAYOUT_PRESETS: dict[str, dict] = {
+    # One zone.
+    "single": dict(
+        zones=1,
+        topology="single",
+        positions=["primary"],
+        css_areas='"primary"',
+        css_cols="1fr",
+        css_rows="1fr",
+    ),
+    # Two zones: stacked rows, then side-by-side columns.
+    "horizontal-2": dict(
+        zones=2,
+        topology="rows",
+        positions=["top", "bottom"],
+        css_areas='"top" "bottom"',
+        css_cols="1fr",
+        css_rows="1fr 1fr",
+    ),
+    "vertical-2": dict(
+        zones=2,
+        topology="cols",
+        positions=["left", "right"],
+        css_areas='"left right"',
+        css_cols="1fr 1fr",
+        css_rows="1fr",
+    ),
+    # Three zones: one zone spans a full row or column of a 2x2 grid.
+    "top-1-bottom-2": dict(
+        zones=3,
+        topology="T",
+        positions=["top", "bottom-left", "bottom-right"],
+        css_areas='"top top" "bottom-left bottom-right"',
+        css_cols="1fr 1fr",
+        css_rows="1fr 1fr",
+    ),
+    "top-2-bottom-1": dict(
+        zones=3,
+        topology="inverted-T",
+        positions=["top-left", "top-right", "bottom"],
+        css_areas='"top-left top-right" "bottom bottom"',
+        css_cols="1fr 1fr",
+        css_rows="1fr 1fr",
+    ),
+    "left-1-right-2": dict(
+        zones=3,
+        topology="side-T-left",
+        positions=["left", "right-top", "right-bottom"],
+        css_areas='"left right-top" "left right-bottom"',
+        css_cols="1fr 1fr",
+        css_rows="1fr 1fr",
+    ),
+    "left-2-right-1": dict(
+        zones=3,
+        topology="side-T-right",
+        positions=["left-top", "right", "left-bottom"],
+        css_areas='"left-top right" "left-bottom right"',
+        css_cols="1fr 1fr",
+        css_rows="1fr 1fr",
+    ),
+    # Four zones.
+    "grid-2x2": dict(
+        zones=4,
+        topology="2x2",
+        positions=["top-left", "top-right", "bottom-left", "bottom-right"],
+        css_areas='"top-left top-right" "bottom-left bottom-right"',
+        css_cols="1fr 1fr",
+        css_rows="1fr 1fr",
+    ),
+}
+
+
+# ─── CompositionUnit ────────────────────────────────────────────
+
+@dataclass
+class CompositionUnit:
+    """One zone candidate within a slide: MDX section(s) plus the matched frame.
+
+    source_section_ids : ids of the sections this unit covers — one id for a
+        single unit, two or more for a merged unit.
+    merge_type :
+      - "single"                  : a single section
+      - "parent_merged"           : the parent has its own V4 entry (v0)
+      - "parent_merged_inferred"  : the parent has no V4 entry; the frame is
+                                    inferred from child evidence (v0.1)
+    frame_template_id / frame_id / frame_number / confidence / label :
+        taken as-is from V4 evidence (no catalog metadata, nothing hardcoded).
+    phase_z_status : Phase Z status that the V4 label maps to.
+    raw_content / title : content and title carried by the unit.
+    score : overall score.
+    rationale : score breakdown, kept for traceability.
+    auto_selectable : False excludes the unit from automatic selection.
+    filter_reasons : why auto_selectable is False.
+    notes : informational signals that do not affect auto_selectable.
+    """
+    source_section_ids: list[str]
+    merge_type: str
+    frame_template_id: str
+    frame_id: str
+    frame_number: int
+    confidence: float
+    label: str                         # use_as_is / light_edit / restructure / reject
+    phase_z_status: str
+    raw_content: str
+    title: str
+    score: float = 0.0
+    rationale: dict = field(default_factory=dict)
+
+    # State for the automatic pipeline stage. There is no review/UI concept here:
+    # the unit only records an automatic decision plus an explicit failure reason.
+    # auto_selectable=False removes the unit from automatic selection, and
+    # filter_reasons says why — e.g. W1/W2/W3 on a parent_merged_inferred unit
+    # (representative status / all children reject / majority not auto-renderable).
+    # User or AI review belongs to a separate layer (interactive editor).
+    auto_selectable: bool = True
+    filter_reasons: list[str] = field(default_factory=list)
+    # Informational signals, independent of auto_selectable; a future scoring
+    # axis may turn them into scores.
+    # e.g. "children disagree on rank-1 template_id" / "minority of children non-auto-renderable"
+    notes: list[str] = field(default_factory=list)
+
+
+# ─── Heading Tree ──────────────────────────────────────────────
+
+def derive_parent_id(section_id: str) -> Optional[str]:
+    """Return the parent id implied by *section_id*, or None when it has none.
+
+    The id is read as "<mdx>-<suffix>", split at the first "-". A suffix that
+    contains "." marks a child; its parent keeps only the suffix text before
+    the first ".". Any other id has no parent.
+
+    Examples (illustrations, not rules):
+      - "04-2.1"   -> "04-2"
+      - "04-2.1.3" -> "04-2"  (every dotted level maps to the same parent)
+      - "04-1"     -> None    (top-level, no parent)
+      - "04"       -> None    (no "-" at all)
+    """
+    mdx_id, dash, suffix = section_id.partition("-")
+    if not dash or "." not in suffix:
+        return None
+    parent_suffix, _, _ = suffix.partition(".")
+    return f"{mdx_id}-{parent_suffix}"
+
+
+def build_heading_tree(sections) -> dict:
+    """Index *sections* by id and attach each one to its parent node.
+
+    Returns {section_id: {"section": <section>, "children": [child ids]}}.
+    A child is linked only when its derived parent id is itself one of the
+    given sections; otherwise it stays a root with no link recorded.
+    """
+    # First pass: one node per section id (a repeated id keeps the last section).
+    tree = {}
+    for sec in sections:
+        tree[sec.section_id] = {"section": sec, "children": []}
+
+    # Second pass: wire children, now that every possible parent node exists.
+    for sec in sections:
+        parent_id = derive_parent_id(sec.section_id)
+        node = tree.get(parent_id) if parent_id else None
+        if node is not None:
+            node["children"].append(sec.section_id)
+    return tree
+
+
+# ─── Candidate Generation ──────────────────────────────────────
+
+def _apply_capacity_fit(candidate: CompositionUnit, capacity_fit_fn) -> None:
+    """Record the capacity check on *candidate* and filter it out on a mismatch.
+
+    Does nothing when capacity_fit_fn is None. Otherwise the function is called
+    with (frame_template_id, raw_content) and its result dict is stored under
+    rationale["capacity_fit"].
+
+    fit_status 'ok' / 'no_contract' / 'unknown_source_shape' leaves
+    auto_selectable untouched (no_contract is raised separately as ValueError
+    by the catalog-only mapper). Any other status (strict_mismatch /
+    exceeds_max / below_min / exceeds_truncate) means silent content loss or a
+    mapper FitError, so the candidate gets auto_selectable=False and a
+    'C1: ...' entry in filter_reasons.
+    """
+    if capacity_fit_fn is not None:
+        verdict = capacity_fit_fn(candidate.frame_template_id, candidate.raw_content)
+        candidate.rationale["capacity_fit"] = verdict
+
+        status = verdict["fit_status"]
+        harmless = ("ok", "no_contract", "unknown_source_shape")
+        if status not in harmless:
+            reason = verdict["mismatch_reason"]
+            candidate.auto_selectable = False
+            candidate.filter_reasons.append(
+                f"C1: capacity mismatch ({status}) — {reason}"
+            )
+
+
+def _unit_from_match(match, status, section_ids, merge_type, raw_content, title,
+                     **state) -> CompositionUnit:
+    """Build a unit whose frame_* / confidence / label fields are copied from *match*."""
+    return CompositionUnit(
+        source_section_ids=section_ids,
+        merge_type=merge_type,
+        frame_template_id=match.template_id,
+        frame_id=match.frame_id,
+        frame_number=match.frame_number,
+        confidence=match.confidence,
+        label=match.label,
+        phase_z_status=status,
+        raw_content=raw_content,
+        title=title,
+        **state,
+    )
+
+
+def _inferred_merge_signals(child_matches, rep_match, rep_status, status_of,
+                            auto_renderable_statuses):
+    """Return (filter_reasons, notes) for a merge inferred from child evidence.
+
+    filter_reasons (any entry makes the merge non-auto-selectable):
+      W1 : the representative's status is not auto-renderable
+      W2 : every child is labeled 'reject'
+      W3 : more than half of the children are not auto-renderable
+    notes (informational only):
+      N1 : children disagree on their rank-1 template_id
+      N2 : some, but at most half, of the children are not auto-renderable
+    """
+    child_labels = [m.label for _, m in child_matches]
+    non_auto = [
+        lbl for lbl in child_labels
+        if status_of(lbl) not in auto_renderable_statuses
+    ]
+    non_auto_unique = sorted(set(non_auto))
+    template_ids = sorted({m.template_id for _, m in child_matches})
+    n_children = len(child_labels)
+    n_not_auto = len(non_auto)
+
+    filter_reasons: list[str] = []
+    notes: list[str] = []
+
+    if rep_status not in auto_renderable_statuses:
+        filter_reasons.append(
+            f"W1: representative status '{rep_status}' (label={rep_match.label}) "
+            f"not in auto_renderable_statuses={sorted(auto_renderable_statuses)}."
+        )
+    if all(lbl == "reject" for lbl in child_labels):
+        filter_reasons.append(
+            "W2: all children labeled 'reject' — merge has no fit basis."
+        )
+    if n_not_auto * 2 > n_children:
+        filter_reasons.append(
+            f"W3: majority of children ({n_not_auto}/{n_children}) have "
+            f"non-auto-renderable labels {non_auto_unique}."
+        )
+
+    if len(template_ids) > 1:
+        notes.append(
+            f"N1: children's rank-1 template_id differs ({template_ids}). "
+            f"representative='{rep_match.template_id}' (highest child confidence). "
+            f"top-k / family compatibility 평가는 future axis."
+        )
+    if 0 < n_not_auto <= n_children // 2:
+        # Only claim the merge proceeds when no W-signal filtered it; the old
+        # text said so unconditionally, contradicting W1/W2 on the same unit.
+        if filter_reasons:
+            outcome = "merge is filtered — see filter_reasons."
+        else:
+            outcome = "representative is auto-renderable, merge proceeds."
+        notes.append(
+            f"N2: minority ({n_not_auto}/{n_children}) of children non-auto-renderable "
+            f"({non_auto_unique}). {outcome}"
+        )
+
+    return filter_reasons, notes
+
+
+def collect_candidates(sections, v4_lookup_fn, v4_label_to_status: dict,
+                       auto_renderable_statuses: Optional[set[str]] = None,
+                       capacity_fit_fn=None):
+    """Generate composition candidates.
+
+    Candidate types, emitted in this order:
+      1. single                 : one per section that has a V4 entry
+      2. parent_merged          : the parent id itself has a V4 entry and at
+                                  least two sections derive to that parent
+      3. parent_merged_inferred : the parent id has no V4 entry; the frame is
+                                  taken from the child match with the highest
+                                  confidence, provided at least two children
+                                  have a V4 entry
+
+    Principles:
+      - no section_id / template_id / frame is hardcoded
+      - every decision comes from derive_parent_id(), V4 evidence, the
+        v4_label_to_status mapping and the injected functions
+
+    Args:
+        sections : aligned sections; each needs section_id, raw_content and
+            title. Any iterable is accepted (it is materialized once).
+        v4_lookup_fn : (section_id) -> V4 match or None. Results are cached
+            for the duration of one call, so each id is looked up once.
+        v4_label_to_status : V4 label -> Phase Z status. Unmapped labels
+            resolve to "unknown", for the representative and children alike.
+        auto_renderable_statuses : statuses allowed to render automatically
+            (input to the W1/W3 checks). None is treated as an empty set.
+        capacity_fit_fn : optional (template_id, content) -> fit dict. When
+            given it is applied to every candidate; a capacity mismatch sets
+            auto_selectable=False (prevents silent truncation / mapper FitError).
+
+    Returns:
+        list[CompositionUnit]
+    """
+    if auto_renderable_statuses is None:
+        auto_renderable_statuses = set()
+
+    # The sections are walked more than once; a one-shot iterator would
+    # otherwise silently produce no merged candidates.
+    sections = list(sections)
+
+    lookup_cache: dict = {}
+
+    def lookup(section_id):
+        """Memoized v4_lookup_fn — each id is needed by more than one branch."""
+        if section_id not in lookup_cache:
+            lookup_cache[section_id] = v4_lookup_fn(section_id)
+        return lookup_cache[section_id]
+
+    def status_of(label):
+        return v4_label_to_status.get(label, "unknown")
+
+    candidates = []
+
+    def emit(unit):
+        _apply_capacity_fit(unit, capacity_fit_fn)
+        candidates.append(unit)
+
+    # 1. single — one candidate per section with a V4 entry.
+    for section in sections:
+        match = lookup(section.section_id)
+        if match is None:
+            continue
+        emit(_unit_from_match(
+            match, status_of(match.label),
+            [section.section_id], "single",
+            section.raw_content, section.title,
+        ))
+
+    # Group children under their derived parent id (insertion order preserved).
+    parent_to_children: dict[str, list] = {}
+    for section in sections:
+        pid = derive_parent_id(section.section_id)
+        if pid:
+            parent_to_children.setdefault(pid, []).append(section)
+
+    # 2. parent_merged — the parent id itself is matched in V4.
+    for pid, children in parent_to_children.items():
+        parent_match = lookup(pid)
+        if parent_match is None or len(children) < 2:
+            continue                    # branch 3's case, or nothing to merge
+        emit(_unit_from_match(
+            parent_match, status_of(parent_match.label),
+            [child.section_id for child in children], "parent_merged",
+            "\n\n".join(child.raw_content for child in children), pid,
+        ))
+
+    # 3. parent_merged_inferred — no parent V4 entry, rely on child evidence.
+    for pid, children in parent_to_children.items():
+        if lookup(pid) is not None or len(children) < 2:
+            continue                    # already handled by branch 2, or nothing to merge
+        # Only children with a V4 match count as evidence.
+        child_matches = []
+        for child in children:
+            child_match = lookup(child.section_id)
+            if child_match is not None:
+                child_matches.append((child, child_match))
+        if len(child_matches) < 2:
+            continue                    # need at least two pieces of child evidence
+
+        # Representative = the child match with the highest confidence
+        # (first one wins on a tie). Future axes: top-k convergence,
+        # template family agreement, cardinality fit.
+        _, rep_match = max(child_matches, key=lambda cm: cm[1].confidence)
+        rep_status = status_of(rep_match.label)
+
+        filter_reasons, notes = _inferred_merge_signals(
+            child_matches, rep_match, rep_status, status_of,
+            auto_renderable_statuses,
+        )
+
+        emit(_unit_from_match(
+            rep_match, rep_status,
+            [child.section_id for child, _ in child_matches],
+            "parent_merged_inferred",
+            "\n\n".join(child.raw_content for child, _ in child_matches), pid,
+            auto_selectable=not filter_reasons,
+            filter_reasons=filter_reasons,
+            notes=notes,
+        ))
+
+    return candidates
+
+
+# ─── Scoring ───────────────────────────────────────────────────
+
+# v0 label weights: V4 label -> score multiplier.
+# To be extended when more axes (cardinality_fit / hierarchy_coherence /
+# density) are added.
+V0_LABEL_WEIGHT = dict(
+    use_as_is=1.0,
+    light_edit=0.7,
+    restructure=0.4,
+    reject=0.0,
+)
+
+
+def score_candidate(c: CompositionUnit) -> CompositionUnit:
+    """Score *c* in place with the v0 rule and return it.
+
+    score = confidence x label weight; a label missing from V0_LABEL_WEIGHT
+    weighs 0.0. The breakdown is merged into c.rationale, so entries already
+    there (e.g. capacity_fit) survive.
+
+    Axes planned for later:
+      - cardinality_fit : item count vs the frame's ideal/min/max
+      - hierarchy_coherence : how well merge_type fits
+      - density_score : content density vs zone size
+    """
+    weight = V0_LABEL_WEIGHT.get(c.label, 0.0)
+    c.score = c.confidence * weight
+    c.rationale.update(
+        frame_compat=round(c.score, 4),
+        confidence=c.confidence,
+        label=c.label,
+        label_weight=weight,
+        merge_type=c.merge_type,
+        # Reserved slots for future axes.
+        hierarchy_coherence=None,
+        density_score=None,
+    )
+    return c
+
+
+# ─── Selection ─────────────────────────────────────────────────
+
+def select_composition_units(candidates, allowed_statuses: set[str]) -> list[CompositionUnit]:
+    """Pick non-overlapping units greedily, best score first.
+
+    1. Every candidate is scored (in place), including ones filtered out next.
+    2. Keep only candidates whose phase_z_status is in allowed_statuses and
+       whose auto_selectable is True.
+    3. Rank by (score, number of covered sections), both descending. On equal
+       score the candidate covering more sections wins, so a merged candidate
+       beats a single one with the same score.
+    4. Walk the ranking and take a candidate unless one of its sections is
+       already covered by an earlier pick.
+
+    Candidates with auto_selectable=False are never returned here; they stay
+    in the caller's candidate list.
+    """
+    eligible = []
+    for cand in candidates:
+        score_candidate(cand)
+        if cand.phase_z_status in allowed_statuses and cand.auto_selectable:
+            eligible.append(cand)
+
+    ranked = sorted(
+        eligible,
+        key=lambda cand: (cand.score, len(cand.source_section_ids)),
+        reverse=True,
+    )
+
+    chosen = []
+    taken = set()
+    for cand in ranked:
+        if taken.isdisjoint(cand.source_section_ids):
+            chosen.append(cand)
+            taken.update(cand.source_section_ids)
+    return chosen
+
+
+# ─── Layout Preset Selection ───────────────────────────────────
+
+def select_layout_preset(units: list[CompositionUnit]) -> Optional[str]:
+    """Choose a layout preset from the number of units alone (v0).
+
+    0 units -> None
+    1 unit  -> single
+    2 units -> horizontal-2   (vertical-2 needs an aspect signal)
+    3 units -> top-1-bottom-2 (other 3-zone variants need a content-weight signal)
+    4 units -> grid-2x2
+
+    v0 limitation: aspect / content weight are not considered, so 2 units are
+    always horizontal-2 and 3 units always top-1-bottom-2.
+
+    Raises:
+        ValueError: for more than 4 units.
+    """
+    default_by_count = {
+        1: "single",
+        2: "horizontal-2",
+        3: "top-1-bottom-2",
+        4: "grid-2x2",
+    }
+    n = len(units)
+    if not n:
+        return None
+    preset = default_by_count.get(n)
+    if preset is None:
+        raise ValueError(
+            f"Composition v0 : layout for {n} units not supported (max 4). "
+            "Larger counts require split-into-multiple-slides decision (future)."
+        )
+    return preset
+
+
+# ─── Public entry — composition pipeline ───────────────────────
+
+def plan_composition(sections, v4_lookup_fn, v4_label_to_status: dict,
+                     allowed_statuses: set[str],
+                     capacity_fit_fn=None) -> tuple[list[CompositionUnit], Optional[str], dict]:
+    """Composition planner v0.2 entry point.
+
+    Collects candidates, scores them, selects non-overlapping units, picks a
+    layout preset from the unit count, and builds a debug report.
+
+    v0.2 change:
+      - when capacity_fit_fn is given, every candidate gets a capacity check
+        up front (prevents silent truncation / mapper FitError). A mismatch
+        sets auto_selectable=False with a 'C1: ...' filter reason.
+
+    v0.1 / v0.1.1 behaviour (kept):
+      - parent_merged_inferred candidates are generated even without a parent
+        V4 entry
+      - no review concept; auto_selectable + filter_reasons decide everything
+      - selection: score descending, then coverage as tiebreak
+
+    Args:
+        sections, v4_lookup_fn, v4_label_to_status, capacity_fit_fn : passed
+            through to collect_candidates.
+        allowed_statuses : statuses eligible for automatic selection; also
+            used as the auto-renderable set of collect_candidates.
+
+    Returns:
+        units : automatically selected composition units
+        layout_preset : one of the 8 preset names, or None without units
+        debug : all candidates with capacity_fit, filter_reasons and the
+            reasoning behind the preset
+
+    Raises:
+        ValueError: propagated from select_layout_preset when more than four
+            units are selected.
+    """
+    candidates = collect_candidates(
+        sections, v4_lookup_fn, v4_label_to_status,
+        auto_renderable_statuses=allowed_statuses,
+        capacity_fit_fn=capacity_fit_fn,
+    )
+    # Score here as well so the debug report never depends on the selection
+    # step having scored the candidates as a side effect.
+    scored_all = [score_candidate(c) for c in candidates]
+
+    units = select_composition_units(scored_all, allowed_statuses)
+    preset = select_layout_preset(units)
+
+    # Identity, not equality: CompositionUnit is a dataclass, so `c in units`
+    # compares field values and would flag a value-equal but unselected
+    # duplicate as selected (and compares whole raw_content strings each time).
+    selected_ids = {id(u) for u in units}
+
+    def _candidate_state(c: CompositionUnit) -> str:
+        if id(c) in selected_ids:
+            return "selected"
+        if c.phase_z_status not in allowed_statuses:
+            return "filtered_status"               # V4 label -> status not auto-renderable
+        if not c.auto_selectable:
+            # The filter_reasons prefix separates capacity from weak-merge filters.
+            if any(r.startswith("C") for r in c.filter_reasons):
+                return "filtered_capacity"          # C1 (capacity mismatch)
+            return "filtered_weak"                  # W1/W2/W3 (parent_merged_inferred only)
+        return "filtered_lost"                      # viable, but lost to a coverage conflict
+
+    candidates_summary = [
+        {
+            "source_section_ids": c.source_section_ids,
+            "merge_type": c.merge_type,
+            "template_id": c.frame_template_id,
+            "label": c.label,
+            "phase_z_status": c.phase_z_status,
+            "score": c.score,
+            "selection_state": _candidate_state(c),
+            "auto_selectable": c.auto_selectable,
+            "filter_reasons": list(c.filter_reasons),
+            "notes": list(c.notes),
+            "capacity_fit": c.rationale.get("capacity_fit"),
+        }
+        for c in scored_all
+    ]
+
+    merge_candidates = [
+        s for s in candidates_summary
+        if s["merge_type"] in {"parent_merged", "parent_merged_inferred"}
+    ]
+    capacity_mismatches = [
+        s for s in candidates_summary
+        if s["selection_state"] == "filtered_capacity"
+    ]
+    viable_auto_count = sum(
+        1 for c in scored_all
+        if c.phase_z_status in allowed_statuses and c.auto_selectable
+    )
+
+    debug = {
+        "planner_version": "v0.2",
+        "selection_rule": (
+            "score desc, then source_section_ids count desc (coverage tiebreak). "
+            "filter = phase_z_status ∉ allowed_statuses OR auto_selectable=False. "
+            "auto_selectable=False 사유 : C1 (capacity mismatch — silent truncate / FitError 차단), "
+            "W1 (rep not auto-renderable), W2 (all children reject), W3 (majority children non-auto-renderable)."
+        ),
+        "candidates_total": len(scored_all),
+        "candidates_viable_auto": viable_auto_count,
+        "candidates_summary": candidates_summary,
+        "merge_candidates": merge_candidates,
+        "capacity_mismatches": capacity_mismatches,
+        "selected_units_count": len(units),
+        "layout_preset": preset,
+        "layout_preset_rationale": (
+            f"v0 count-based: {len(units)} units → {preset}"
+            if preset else "no viable units"
+        ),
+    }
+
+    return units, preset, debug