C.E.L_Slide_test2/src/phase_z2_classifier.py

"""Phase Z-2 fit_classifier v0 (A1 — 분류 layer 만).

Selenium visual_runtime_check 의 결과 (clipped_inner / zone overflow) 를
spec `docs/architecture/PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md` §3 taxonomy
의 *category* 로 분류하는 layer.

본 모듈은 ***분류만***. action / router / rerender / behavior 변경 X.
출력 = debug.json 의 `fit_classification` trace.

원칙 :
  - className 이라는 raw 문자열 → semantic content_type 매핑은 *registry* 가 담당
  - excess_y (px) → line_equivalent 환산은 content_type 별 line-height 기준
  - category 결정은 spec §3.2 우선순위 그대로 적용 (frame_capacity_mismatch →
    tabular → structural_major → layout_zone_mismatch → structural_minor →
    moderate → minor → hard_visual_fail)
  - 모든 결정은 trace 에 명시 — *어느 룰이 왜 적용됐는지* debug 로 검증 가능

다음 step (별도 — A2) :
  overflow_router 가 본 module 의 category 를 받아 action 으로 매핑.
  본 step 에서 router 는 X.
"""

from __future__ import annotations

import re
from typing import Optional


# ─── §2 className → semantic content_type registry ───────────────
# Mirrors the registry in spec PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md §2.
# Patterns are tried *top to bottom* and the first match wins, so more
# specific patterns must be listed before more general ones.

CONTENT_TYPE_PATTERNS: list[tuple[str, str, str]] = [
    # (regex pattern, semantic_content_type, description)

    # transform-block / transform-row → structural_unit
    # spec: "paired comparison (an AS-IS/TO-BE pair is one semantic unit)"
    (r"^transform-block(__|$)", "structural_unit",
     "transform-block — paired comparison container"),
    (r"^transform-row(__|$)", "structural_unit",
     "transform-row — AS-IS/TO-BE pair row"),
    (r"^transform-rows$", "structural_unit",
     "transform-rows wrapper"),

    # tabular — any class token containing "table" delimited by the token
    # edges or by '-' / '_' (original note: table class or native <table>)
    (r"(^|[-_])table($|[-_])", "tabular",
     "table — tabular content"),

    # text-line family → text_flow
    (r"^text-line(--|$)", "text_flow",
     "text-line — free-flowing text/bullet"),

    # frame internal cell (a unit cell inside a frame)
    (r"^f\d+b__cell(--|$)", "frame_internal_cell",
     "frame internal cell"),
    (r"^f\d+b__pillar(--|$)", "frame_internal_cell",
     "frame internal pillar"),
    (r"^f\d+b__quadrant(--|$)", "frame_internal_cell",
     "frame internal quadrant"),

    # frame label / title / banner / ribbon
    (r"^f\d+b__title$", "frame_label",
     "frame title"),
    (r"^f\d+b__section-title", "frame_label",
     "frame section-title"),
    (r"^f\d+b__banner", "frame_label",
     "frame banner"),
    (r"^f\d+b__ribbon", "frame_label",
     "frame ribbon"),
    # unanchored: matches "__label" anywhere in the token
    (r"__label", "frame_label",
     "frame label"),

    # frame root (f29b, f13b, f16b themselves)
    (r"^f\d+b$", "frame_internal",
     "frame family root"),

    # visual asset
    (r"__bg(\b|$)", "visual_asset", "background asset"),
    (r"^bg-", "visual_asset", "background asset"),
    (r"__icon(\b|$)", "visual_asset", "icon asset"),
    (r"^img-", "visual_asset", "image asset"),
]


def classify_content_type(class_name: str) -> tuple[str, str]:
    """Map a raw className string to ``(semantic_content_type, match_reason)``.

    The string may hold several whitespace-separated class tokens. Tokens are
    visited left to right; for each token the entries of
    CONTENT_TYPE_PATTERNS are tried in registry order, and the very first
    (token, pattern) hit decides the result.

    Returns:
        ``(content_type, reason)`` for the first hit, where ``reason`` names
        the token, the pattern and the registry description.
        ``('unknown', '')`` when ``class_name`` is empty / falsy.
        ``('unknown', 'no pattern matched ...')`` when no token matches.

    Examples:
        'f29b__cell f29b__cell--left' → ('frame_internal_cell', "...")
        'transform-block' → ('structural_unit', "...")
        'text-line text-line--bullet' → ('text_flow', "...")
    """
    if not class_name:
        return ("unknown", "")

    tokens = class_name.split()
    # Lazy stream of every (token, registry entry) hit, in priority order;
    # only the first one is ever consumed.
    hits = (
        (token, pattern, ctype, desc)
        for token in tokens
        for pattern, ctype, desc in CONTENT_TYPE_PATTERNS
        if re.search(pattern, token)
    )
    first_hit = next(hits, None)
    if first_hit is None:
        return ("unknown", f"no pattern matched any of tokens {tokens}")

    token, pattern, ctype, desc = first_hit
    return (ctype, f"token '{token}' matched pattern '{pattern}' ({desc})")


# ─── line_equivalent conversion ──────────────────────────────────
# Representative *unit height* (px) per content_type — used to convert
# excess_y into a number of lines (or units).
# For structural_unit / tabular, "1 unit" = one transform-row or table row.

DEFAULT_UNIT_HEIGHTS: dict[str, float] = {
    # transform-row : padding 3+3 + line-height 11×1.45=15.95 ≈ 21.95
    "structural_unit": 21.95,
    # text-line : font 11 × line-height 1.6 = 17.6
    "text_flow": 17.6,
    # tabular row : estimate (to be calibrated once real table cases arrive)
    "tabular": 22.0,
    # frame label / title : font 13 × line-height 1.3 = 16.9
    "frame_label": 16.9,
    # frame_internal* : conservative default (same as text-line)
    "frame_internal": 17.6,
    "frame_internal_cell": 17.6,
    # visual asset : can be cropped, so the unit is not meaningful
    # (original note: line_eq is not used for this type)
    "visual_asset": 17.6,
    # unknown : text-line default
    "unknown": 17.6,
}


def compute_line_equivalent(excess_y: float, content_type: str) -> float:
    """Convert a vertical overflow in px into "how many lines / units" it is.

    The divisor is the entry for ``content_type`` in DEFAULT_UNIT_HEIGHTS
    (17.6 when the type is not listed). A non-positive unit height yields
    0.0 instead of dividing. The quotient is rounded to 2 decimal places.
    """
    unit_px = DEFAULT_UNIT_HEIGHTS.get(content_type, 17.6)
    return 0.0 if unit_px <= 0 else round(float(excess_y) / unit_px, 2)


# ─── §3 taxonomy classifier ──────────────────────────────────────

# spec §3.2 우선순위 :
#   1. frame_capacity_mismatch (composition 결과 우선)
#   2. tabular_overflow
#   3. structural_major_overflow
#   4. layout_zone_mismatch
#   5. structural_minor_overflow
#   6. moderate_overflow
#   7. minor_overflow
#   8. hard_visual_fail (fallback)


def classify_overflow(
    *,
    excess_y: float,
    excess_x: float,
    class_name: str,
    inner_content_signals: Optional[list[str]] = None,
    capacity_fit_status: Optional[str] = None,
) -> dict:
    """Classify one overflow event (clipped_inner or zone-self) into a spec §3 category.

    Args:
        excess_y / excess_x   : overflow in px as measured by Selenium.
        class_name            : className string captured by Selenium
            (may contain several tokens).
        inner_content_signals : extra *inner content* hints reported by
            Selenium (e.g. ['structural_unit'] when the clipped cell holds a
            transform-block). Only consulted when ``class_name`` resolves to
            a container type (frame_internal_cell / frame_internal) or to
            unknown, to infer what kind of content is actually being cut.
        capacity_fit_status   : fit_status from the composition stage's
            capacity_fit; checked before every other rule.

    Returns:
        dict with keys ``inputs``, ``derived``, ``category``, ``rule_applied``.
    """
    signals = list(inner_content_signals or [])
    container_type, container_match = classify_content_type(class_name)

    # A container className only says *where* the clip happened; the inner
    # signals say *what* is being clipped. Candidates are checked in the
    # spec §3.2 priority order: tabular > structural_unit > text_flow.
    effective_type = container_type
    refinement_note = None
    if signals and container_type in {"frame_internal_cell", "frame_internal", "unknown"}:
        for candidate in ("tabular", "structural_unit", "text_flow"):
            if candidate in signals:
                effective_type = candidate
                refinement_note = f"{candidate} (inner_signal)"
                break

    line_eq = compute_line_equivalent(excess_y, effective_type)
    is_flow_like = effective_type in {"text_flow", "frame_label"}

    # Rule ladder — evaluated strictly in spec §3.2 priority order.
    if capacity_fit_status in {"strict_mismatch", "exceeds_max", "below_min", "exceeds_truncate"}:
        # 1. the composition stage already reported a capacity mismatch
        category = "frame_capacity_mismatch"
        rule = (
            f"capacity_fit_status='{capacity_fit_status}' — composition 단계의 "
            "capacity_fit 가 이미 mismatch 신호 (spec §3.2 우선순위 1)"
        )
    elif effective_type == "tabular":
        # 2. tables are never trimmed, whatever the amount
        category = "tabular_overflow"
        rule = "content_type=tabular — 표는 행 단위 자르면 의미 손실 (spec §3.2 우선순위 2)"
    elif effective_type == "structural_unit" and line_eq >= 1.0:
        # 3. at least one whole structural unit is cut off
        category = "structural_major_overflow"
        rule = (
            f"content_type=structural_unit AND line_equivalent={line_eq} >= 1.0 — "
            "의미 단위 1+ 완전 잘림 (spec §3.2 우선순위 3)"
        )
    elif effective_type == "frame_internal":
        # 4. the frame root itself does not fit in its zone
        category = "layout_zone_mismatch"
        rule = (
            "content_type=frame_internal — frame root 자체가 zone 안에 못 들어감 "
            "(spec §3.2 우선순위 4)"
        )
    elif effective_type == "structural_unit":
        # 5. boundary spill — less than one whole unit (>= 1.0 was handled above)
        category = "structural_minor_overflow"
        rule = (
            f"content_type=structural_unit AND line_equivalent={line_eq} < 1.0 — "
            "boundary spill (부분 단위 잘림, 완전 단위 손실 아님) (spec §3.2 우선순위 5)"
        )
    elif is_flow_like and 1.5 < line_eq <= 4.0:
        # 6. medium amount of text / label flow
        category = "moderate_overflow"
        rule = (
            f"content_type={effective_type} AND line_equivalent={line_eq} ∈ (1.5, 4] "
            "(spec §3.2 우선순위 6)"
        )
    elif is_flow_like and line_eq <= 1.5:
        # 7. small amount of text / label flow
        category = "minor_overflow"
        rule = (
            f"content_type={effective_type} AND line_equivalent={line_eq} ≤ 1.5 "
            "(spec §3.2 우선순위 7)"
        )
    else:
        # 8. fallback — nothing above applied
        category = "hard_visual_fail"
        rule = (
            f"위 매핑 모두 미적용 (content_type={effective_type}, line_equivalent="
            f"{line_eq}) — fallback (spec §3.2 우선순위 8)"
        )

    return {
        "inputs": {
            "excess_y": float(excess_y),
            "excess_x": float(excess_x),
            "class_name": class_name,
            "inner_content_signals": signals,
            "capacity_fit_status": capacity_fit_status,
        },
        "derived": {
            # what the className alone resolved to
            "container_content_type": container_type,
            "container_match": container_match,
            # *final* type after the inner-signal refinement
            "content_type": effective_type,
            "content_type_refined_via_inner": refinement_note,
            "line_equivalent": line_eq,
            "unit_height_used": DEFAULT_UNIT_HEIGHTS.get(effective_type, 17.6),
        },
        "category": category,
        "rule_applied": rule,
    }


# ─── visual_runtime_check 결과 → 전체 fit_classification trace ────


def _build_placement_diagnostic_for_zone(
    zone_position: str,
    placement_trace: Optional[dict],
    mapper_template_id: Optional[str],
) -> dict:
    """zone 별 placement diagnostic 빌더 — placement_trace from B4 → 진단 dict.

    phase_z2_pipeline.py 의 trace-only B1→B2→B4 chain 결과 (debug_zones[i].placement_trace)
    를 per-zone surface 한 진단. classifier 는 *consume only* — placement_trace
    raw 구조는 미변경.

    Args:
        zone_position      : zone position ("top" / "bottom_l" 등)
        placement_trace    : phase_z2_pipeline.py 의 placement_trace dict 또는 None
        mapper_template_id : 기존 mapper / V4 가 선택한 frame template_id
                             (placement_trace 에 mapper_frame_template_id 누락 시 fallback)

    Returns:
        per-zone placement diagnostic dict (shape-stable, missing fields = None / 0).
    """
    if placement_trace is None:
        return {
            "zone_position": zone_position,
            "mapper_frame_template_id": mapper_template_id,
            "b4_selected_template_id": None,
            "frame_selection_matches_mapper": None,
            "frame_selection_match_note": "no placement_trace recorded",
            "region_count": 0,
            "slot_assignment_count": 0,
            "rejection_count": 0,
        }
    return {
        "zone_position": zone_position,
        "mapper_frame_template_id": (
            placement_trace.get("mapper_frame_template_id") or mapper_template_id
        ),
        "b4_selected_template_id": placement_trace.get("selected_template_id"),
        "frame_selection_matches_mapper": placement_trace.get("frame_selection_matches_mapper"),
        "frame_selection_match_note": placement_trace.get("frame_selection_match_note"),
        "region_count": len(placement_trace.get("internal_regions") or []),
        "slot_assignment_count": len(placement_trace.get("slot_assignments") or []),
        "rejection_count": len(placement_trace.get("rejection") or []),
    }


def classify_visual_runtime_check(overflow: dict, debug_zones: list[dict]) -> dict:
    """Turn a Selenium overflow report plus per-zone debug info into a fit_classification trace.

    Every overflow event (zone-self overflow / cell-level clipped_inner) is
    classified individually through ``classify_overflow``.

    Keys that are present but hold ``None`` (``zones``, ``clipped_inner``,
    ``capacity_fit``, ``excess_y`` / ``excess_x``) are treated the same as
    missing keys, so a sparse report does not abort the whole classification.

    Args:
        overflow    : overflow-check result (reads ``passed``, ``zones``,
                      ``slide``, ``slide_body``).
        debug_zones : per-zone debug dicts (reads ``position``,
                      ``placement_trace``, ``v4_template_id`` and
                      ``composition_rationale.capacity_fit.fit_status``).
                      May be None / empty.

    Returns:
        dict with:
          visual_check_passed   : whether the Selenium check passed
          classifications       : one classification dict per overflow event
          summary               : text summary (event count, categories)
          categories_seen       : sorted unique categories
          unclassified_signals  : slide / slide_body overflow, reported raw
          placement_diagnostics : per-zone placement_trace diagnostics —
                                  always built, whether or not the check passed
    """
    zone_debug_entries = debug_zones or []

    # Built before the early return on purpose: a divergence recorded in the
    # placement trace is useful diagnostics even when the visual check passed.
    placement_diagnostics = [
        _build_placement_diagnostic_for_zone(
            zone_position=dz.get("position", "?"),
            placement_trace=dz.get("placement_trace"),
            mapper_template_id=dz.get("v4_template_id"),
        )
        for dz in zone_debug_entries
    ]

    if overflow.get("passed", False):
        return {
            "visual_check_passed": True,
            "classifications": [],
            "summary": "visual check passed — no overflow to classify",
            "categories_seen": [],
            "unclassified_signals": [],
            "placement_diagnostics": placement_diagnostics,
        }

    # zone position → capacity_fit status / template id lookups.
    capacity_status_by_position: dict[str, Optional[str]] = {}
    template_id_by_position: dict[str, Optional[str]] = {}
    for dz in zone_debug_entries:
        pos = dz.get("position")
        rationale = dz.get("composition_rationale") or {}
        # `or {}` rather than a .get default: the key may exist with value None.
        capacity_fit = rationale.get("capacity_fit") or {}
        capacity_status_by_position[pos] = capacity_fit.get("fit_status")
        template_id_by_position[pos] = dz.get("v4_template_id")

    classifications: list[dict] = []

    for z in overflow.get("zones") or []:
        zone_position = z.get("position", "?")
        zone_template_id = z.get("template_id") or template_id_by_position.get(zone_position)
        capacity_fit_status = capacity_status_by_position.get(zone_position)

        # zone-self overflow (the frame root itself)
        if z.get("overflowed"):
            # Synthesize a frame-root className ("f<digits>b") from the
            # template id so the event is matched as a frame root. Without a
            # template id the placeholder "f?b" is used, which matches no
            # registry pattern.
            if zone_template_id:
                digits = re.sub(r"[^0-9]", "", str(zone_template_id))[:2] or "0"
                root_class_name = f"f{digits}b"
            else:
                root_class_name = "f?b"
            cls = classify_overflow(
                excess_y=z.get("excess_y") or 0,
                excess_x=z.get("excess_x") or 0,
                class_name=root_class_name,
                capacity_fit_status=capacity_fit_status,
            )
            cls["source"] = "zone_self_overflow"
            cls["zone_position"] = zone_position
            cls["zone_template_id"] = zone_template_id
            classifications.append(cls)

        # cell-level clipped_inner
        for c in z.get("clipped_inner") or []:
            cls = classify_overflow(
                excess_y=c.get("excess_y") or 0,
                excess_x=c.get("excess_x") or 0,
                class_name=c.get("class_name", ""),
                inner_content_signals=c.get("inner_content_signals") or [],
                capacity_fit_status=capacity_fit_status,
            )
            cls["source"] = "clipped_inner"
            cls["zone_position"] = zone_position
            cls["zone_template_id"] = zone_template_id
            cls["client_height"] = c.get("clientHeight")
            cls["scroll_height"] = c.get("scrollHeight")
            classifications.append(cls)

    # Slide-level / slide-body overflow (outside the zones) is not classified;
    # it is only surfaced raw as a secondary signal.
    unclassified: list[dict] = []
    slide_m = overflow.get("slide") or {}
    if slide_m.get("overflowed"):
        unclassified.append({
            "level": "slide",
            "excess_y": slide_m.get("excess_y"),
            "excess_x": slide_m.get("excess_x"),
            "note": "slide-level overflow — 보통 zone 단위 분류로 충분, 미분류 보고만",
        })
    body_m = overflow.get("slide_body") or {}
    if body_m.get("overflowed"):
        unclassified.append({
            "level": "slide_body",
            "excess_y": body_m.get("excess_y"),
            "excess_x": body_m.get("excess_x"),
            "note": "slide_body overflow — 위와 같음",
        })

    categories = sorted({entry["category"] for entry in classifications})
    return {
        "visual_check_passed": False,
        "classifications": classifications,
        "summary": (
            f"{len(classifications)} overflow event(s) classified, "
            f"categories: {categories or 'none'}"
        ),
        "categories_seen": categories,
        "unclassified_signals": unclassified,
        "placement_diagnostics": placement_diagnostics,
    }