"""Phase Z-2 fit_classifier v0 (A1 — 분류 layer 만). Selenium visual_runtime_check 의 결과 (clipped_inner / zone overflow) 를 spec `docs/architecture/PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md` §3 taxonomy 의 *category* 로 분류하는 layer. 본 모듈은 ***분류만***. action / router / rerender / behavior 변경 X. 출력 = debug.json 의 `fit_classification` trace. 원칙 : - className 이라는 raw 문자열 → semantic content_type 매핑은 *registry* 가 담당 - excess_y (px) → line_equivalent 환산은 content_type 별 line-height 기준 - category 결정은 spec §3.2 우선순위 그대로 적용 (frame_capacity_mismatch → tabular → structural_major → layout_zone_mismatch → structural_minor → moderate → minor → hard_visual_fail) - 모든 결정은 trace 에 명시 — *어느 룰이 왜 적용됐는지* debug 로 검증 가능 다음 step (별도 — A2) : overflow_router 가 본 module 의 category 를 받아 action 으로 매핑. 본 step 에서 router 는 X. """ from __future__ import annotations import re from typing import Optional # ─── §2 className → semantic content_type registry ─────────────── # spec PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md §2 의 registry 그대로. # 패턴은 *위에서 아래로* 첫 매칭 우선. 더 specific 한 패턴이 위에 와야 함. 
CONTENT_TYPE_PATTERNS: list[tuple[str, str, str]] = [
    # Each entry: (regex pattern, semantic_content_type, description).
    # Entries are tried top to bottom and the first match wins, so the more
    # specific patterns must stay above the more general ones.
    #
    # transform-block / transform-row → structural_unit
    # spec: "paired comparison (an AS-IS/TO-BE pair is one semantic unit)"
    (r"^transform-block(__|$)", "structural_unit", "transform-block — paired comparison container"),
    (r"^transform-row(__|$)", "structural_unit", "transform-row — AS-IS/TO-BE pair row"),
    (r"^transform-rows$", "structural_unit", "transform-rows wrapper"),
    # tabular — a table class or a native table
    (r"(^|[-_])table($|[-_])", "tabular", "table — tabular content"),
    # text-line family → text_flow
    (r"^text-line(--|$)", "text_flow", "text-line — free-flowing text/bullet"),
    # frame internal cell (a unit cell inside a frame)
    (r"^f\d+b__cell(--|$)", "frame_internal_cell", "frame internal cell"),
    (r"^f\d+b__pillar(--|$)", "frame_internal_cell", "frame internal pillar"),
    (r"^f\d+b__quadrant(--|$)", "frame_internal_cell", "frame internal quadrant"),
    # frame label / title / banner / ribbon
    (r"^f\d+b__title$", "frame_label", "frame title"),
    (r"^f\d+b__section-title", "frame_label", "frame section-title"),
    (r"^f\d+b__banner", "frame_label", "frame banner"),
    (r"^f\d+b__ribbon", "frame_label", "frame ribbon"),
    (r"__label", "frame_label", "frame label"),
    # frame root (f29b, f13b, f16b themselves)
    (r"^f\d+b$", "frame_internal", "frame family root"),
    # visual asset
    (r"__bg(\b|$)", "visual_asset", "background asset"),
    (r"^bg-", "visual_asset", "background asset"),
    (r"__icon(\b|$)", "visual_asset", "icon asset"),
    (r"^img-", "visual_asset", "image asset"),
]


def classify_content_type(class_name: str) -> tuple[str, str]:
    """Map a className string to ``(semantic_content_type, match_reason)``.

    The string is split on whitespace and each token is tested against
    ``CONTENT_TYPE_PATTERNS`` in registry order. Tokens are scanned left to
    right, and the first (token, pattern) hit is returned.

    Args:
        class_name: Raw className, possibly holding several
            whitespace-separated tokens.

    Returns:
        A ``(content_type, reason)`` tuple:
        - ``("unknown", "")`` when ``class_name`` is empty or ``None``;
        - ``("unknown", <reason>)`` when ``class_name`` is not a string or
          when no token matches any pattern;
        - otherwise the matched content type plus a reason naming the token,
          the pattern and the registry description.

    Examples:
        'f29b__cell f29b__cell--left' → ('frame_internal_cell', "...")
        'transform-block'             → ('structural_unit', "...")
        'text-line text-line--bullet' → ('text_flow', "...")
    """
    if not class_name:
        return ("unknown", "")
    if not isinstance(class_name, str):
        # A truthy non-string value would otherwise fail on ``.strip()``.
        # Report it as unknown so a single malformed entry cannot abort the
        # classification of the remaining events.
        return ("unknown", f"class_name is not a string ({type(class_name).__name__})")

    tokens = class_name.split()
    for token in tokens:
        for pattern, ctype, desc in CONTENT_TYPE_PATTERNS:
            if re.search(pattern, token):
                return (ctype, f"token '{token}' matched pattern '{pattern}' ({desc})")
    return ("unknown", f"no pattern matched any of tokens {tokens}")


# ─── line_equivalent conversion ──────────────────────────────────
# Representative unit height per content_type — used to convert excess_y into
# a number of lines (or units).
# For structural_unit / tabular, "1 unit" = one transform-row or one table row.
DEFAULT_UNIT_HEIGHTS: dict[str, float] = {
    # transform-row : padding 3+3 + line-height 11×1.45=15.95 ≈ 21.95
    "structural_unit": 21.95,
    # text-line : font 11 × line-height 1.6 = 17.6
    "text_flow": 17.6,
    # tabular row : estimate (to be calibrated once real table cases arrive)
    "tabular": 22.0,
    # frame label / title : font 13 × line-height 1.3 = 16.9
    "frame_label": 16.9,
    # frame_internal* : conservative default (text-line based)
    "frame_internal": 17.6,
    "frame_internal_cell": 17.6,
    # visual asset : croppable, the unit is meaningless (line_eq is not used)
    "visual_asset": 17.6,
    # unknown : text-line default
    "unknown": 17.6,
}

# Unit height used for a content_type that has no entry in DEFAULT_UNIT_HEIGHTS
# (same value as the text-line default).
_FALLBACK_UNIT_HEIGHT: float = 17.6


def compute_line_equivalent(excess_y: float, content_type: str) -> float:
    """Convert ``excess_y`` (px) into a line/unit count for ``content_type``.

    Args:
        excess_y: Overflow in pixels.
        content_type: Semantic content type used to look up the unit height.

    Returns:
        ``excess_y / unit_height`` rounded to 2 decimals. A content type
        missing from ``DEFAULT_UNIT_HEIGHTS`` falls back to the text-line
        height (17.6). ``0.0`` is returned only when the resolved unit height
        is zero or negative.
    """
    unit_h = DEFAULT_UNIT_HEIGHTS.get(content_type, _FALLBACK_UNIT_HEIGHT)
    if unit_h <= 0:
        return 0.0
    return round(float(excess_y) / unit_h, 2)


# ─── §3 taxonomy classifier ──────────────────────────────────────
# Priority order from spec §3.2:
#   1. frame_capacity_mismatch (the composition result takes precedence)
#   2. tabular_overflow
#   3. structural_major_overflow
#   4. layout_zone_mismatch
#   5. structural_minor_overflow
#   6. moderate_overflow
#   7. minor_overflow
#   8.
#      hard_visual_fail (fallback)


def classify_overflow(
    *,
    excess_y: float,
    excess_x: float,
    class_name: str,
    inner_content_signals: Optional[list[str]] = None,
    capacity_fit_status: Optional[str] = None,
) -> dict:
    """Classify one overflow event (clipped_inner or zone-self) into a spec §3 category.

    Args:
        excess_y: Measured vertical overflow in px.
        excess_x: Measured horizontal overflow in px.
        class_name: Captured className string (may hold several tokens).
        inner_content_signals: Optional list of content-type signals describing
            what sits inside the clipped element (e.g. ``['structural_unit']``).
            Only consulted when the className resolves to a container type
            (``frame_internal_cell`` / ``frame_internal``) or to ``unknown``.
        capacity_fit_status: Optional composition ``capacity_fit.fit_status``;
            a mismatch status takes precedence over every other rule.

    Returns:
        dict with keys ``inputs``, ``derived``, ``category`` and ``rule_applied``.
    """
    inner_content_signals = list(inner_content_signals or [])
    raw_type, type_match = classify_content_type(class_name)

    # A container className alone does not say which kind of content is being
    # clipped; the inner signals do. Refinement order follows spec §3.2:
    # tabular > structural_unit > text_flow.
    content_type = raw_type
    refined_via_inner = None
    if inner_content_signals and raw_type in {"frame_internal_cell", "frame_internal", "unknown"}:
        for candidate in ("tabular", "structural_unit", "text_flow"):
            if candidate in inner_content_signals:
                content_type = candidate
                refined_via_inner = f"{candidate} (inner_signal)"
                break

    line_equivalent = compute_line_equivalent(excess_y, content_type)

    inputs = {
        "excess_y": float(excess_y),
        "excess_x": float(excess_x),
        "class_name": class_name,
        "inner_content_signals": inner_content_signals,
        "capacity_fit_status": capacity_fit_status,
    }
    derived = {
        "container_content_type": raw_type,  # result from the className alone
        "container_match": type_match,
        "content_type": content_type,  # final type after inner-signal refinement
        "content_type_refined_via_inner": refined_via_inner,
        "line_equivalent": line_equivalent,
        "unit_height_used": DEFAULT_UNIT_HEIGHTS.get(content_type, 17.6),
    }

    def result(category: str, rule: str) -> dict:
        return {
            "inputs": inputs,
            "derived": derived,
            "category": category,
            "rule_applied": rule,
        }

    # 1. frame_capacity_mismatch — composition already reported a mismatch.
    if capacity_fit_status in {"strict_mismatch", "exceeds_max", "below_min", "exceeds_truncate"}:
        return result(
            "frame_capacity_mismatch",
            f"capacity_fit_status='{capacity_fit_status}' — composition 단계의 "
            f"capacity_fit 가 이미 mismatch 신호 (spec §3.2 우선순위 1)",
        )

    # 2. tabular_overflow — any amount of table overflow.
    if content_type == "tabular":
        return result(
            "tabular_overflow",
            "content_type=tabular — 표는 행 단위 자르면 의미 손실 (spec §3.2 우선순위 2)",
        )

    # 3. structural_major_overflow — at least one whole unit is cut off.
    if content_type == "structural_unit" and line_equivalent >= 1.0:
        return result(
            "structural_major_overflow",
            f"content_type=structural_unit AND line_equivalent={line_equivalent} >= 1.0 — "
            f"의미 단위 1+ 완전 잘림 (spec §3.2 우선순위 3)",
        )

    # 4. layout_zone_mismatch — the frame root itself overflows.
    if content_type == "frame_internal":
        return result(
            "layout_zone_mismatch",
            "content_type=frame_internal — frame root 자체가 zone 안에 못 들어감 "
            "(spec §3.2 우선순위 4)",
        )

    # 5. structural_minor_overflow — boundary spill (only part of a unit is cut).
    if content_type == "structural_unit":
        return result(
            "structural_minor_overflow",
            f"content_type=structural_unit AND line_equivalent={line_equivalent} < 1.0 — "
            f"boundary spill (부분 단위 잘림, 완전 단위 손실 아님) (spec §3.2 우선순위 5)",
        )

    is_flow = content_type in {"text_flow", "frame_label"}

    # 6. moderate_overflow — a medium amount of text/label flow.
    if is_flow and 1.5 < line_equivalent <= 4.0:
        return result(
            "moderate_overflow",
            f"content_type={content_type} AND line_equivalent={line_equivalent} ∈ (1.5, 4] "
            f"(spec §3.2 우선순위 6)",
        )

    # 7. minor_overflow — a small amount of text/label flow.
    if is_flow and line_equivalent <= 1.5:
        return result(
            "minor_overflow",
            f"content_type={content_type} AND line_equivalent={line_equivalent} ≤ 1.5 "
            f"(spec §3.2 우선순위 7)",
        )

    # 8. hard_visual_fail — fallback when none of the rules above applied.
    return result(
        "hard_visual_fail",
        f"위 매핑 모두 미적용 (content_type={content_type}, line_equivalent="
        f"{line_equivalent}) — fallback (spec §3.2 우선순위 8)",
    )


# ─── visual_runtime_check result → full fit_classification trace ────


def _build_placement_diagnostic_for_zone(
    zone_position: str,
    placement_trace: Optional[dict],
    mapper_template_id: Optional[str],
) -> dict:
    """Build the per-zone placement diagnostic from a ``placement_trace``.

    The trace is only read; it is never modified.

    Args:
        zone_position: Zone position (e.g. "top" / "bottom_l").
        placement_trace: The zone's placement_trace dict, or ``None``.
        mapper_template_id: Frame template_id chosen by the mapper; used when
            the trace has no (or an empty) ``mapper_frame_template_id``.

    Returns:
        A dict that always has the same keys; missing values are ``None`` / 0.
    """
    if placement_trace is None:
        return {
            "zone_position": zone_position,
            "mapper_frame_template_id": mapper_template_id,
            "b4_selected_template_id": None,
            "frame_selection_matches_mapper": None,
            "frame_selection_match_note": "no placement_trace recorded",
            "region_count": 0,
            "slot_assignment_count": 0,
            "rejection_count": 0,
        }

    trace_get = placement_trace.get
    return {
        "zone_position": zone_position,
        "mapper_frame_template_id": trace_get("mapper_frame_template_id") or mapper_template_id,
        "b4_selected_template_id": trace_get("selected_template_id"),
        "frame_selection_matches_mapper": trace_get("frame_selection_matches_mapper"),
        "frame_selection_match_note": trace_get("frame_selection_match_note"),
        "region_count": len(trace_get("internal_regions") or []),
        "slot_assignment_count": len(trace_get("slot_assignments") or []),
        "rejection_count": len(trace_get("rejection") or []),
    }


def _frame_root_class_name(zone_template_id: object) -> str:
    """Synthesize the frame-root className used for a zone-self overflow.

    A truthy template id yields ``f<digits>b``, where ``<digits>`` is the first
    two digits found in the id (``0`` when it has none). A missing id yields
    the placeholder ``f?b``.
    """
    if not zone_template_id:
        # NOTE(review): "f?b" does not match the frame-root pattern, so such a
        # zone is typed "unknown" rather than "frame_internal" — confirm this
        # is the intended outcome when no template id is available.
        return "f?b"
    digits = re.sub(r"[^0-9]", "", str(zone_template_id))[:2] or "0"
    return f"f{digits}b"


def classify_visual_runtime_check(overflow: dict, debug_zones: list[dict]) -> dict:
    """Classify every overflow event of a visual runtime check.

    Each zone-self overflow and each cell-level ``clipped_inner`` entry is
    classified individually; image and table events are scanned as well.

    Args:
        overflow: Overflow-check result (``passed``, ``slide``, ``slide_body``,
            ``zones``, ``image_events``, ``table_events``).
        debug_zones: Per-zone debug dicts (``position``, ``v4_template_id``,
            ``composition_rationale``, ``placement_trace``); may be ``None``.

    Returns:
        dict with:
        - ``visual_check_passed``: ``overflow["passed"]`` is truthy AND no
          classification was produced.
        - ``classifications``: one dict per classified event.
        - ``summary``: short text summary (event count, categories).
        - ``categories_seen``: sorted unique categories.
        - ``unclassified_signals``: slide / slide_body overflow, reported only.
        - ``placement_diagnostics``: per-zone placement diagnostic, always
          built regardless of the pass/fail outcome.
    """
    # Deferred import — phase_z2_pipeline imports this module at module top, so
    # a top-level `from phase_z2_pipeline import ...` would be circular. Pulled
    # in at call time so both modules are fully loaded. Tolerances are owned by
    # phase_z2_pipeline (single source of truth — see IMP-15 실행-1/2).
    from phase_z2_pipeline import IMAGE_ASPECT_DELTA_TOL, TABLE_SCROLL_TOL_PX

    zones_debug = debug_zones or []

    # Built regardless of the pass/fail outcome.
    placement_diagnostics = [
        _build_placement_diagnostic_for_zone(
            zone_position=dz.get("position", "?"),
            placement_trace=dz.get("placement_trace"),
            mapper_template_id=dz.get("v4_template_id"),
        )
        for dz in zones_debug
    ]

    # IMP-15 실행-3 (issue #47): no early-return on overflow.passed=True.
    # image_events / table_events scans below run unconditionally; the final
    # visual_check_passed is widened to: overflow.passed AND no classifications.

    # zone position → capacity_fit status / template id lookups.
    capacity_status_by_position: dict[str, Optional[str]] = {}
    template_id_by_position: dict[str, Optional[str]] = {}
    for dz in zones_debug:
        pos = dz.get("position")
        # `or {}` at both levels: a key that is present but None must not
        # break the chained lookup.
        rationale = dz.get("composition_rationale") or {}
        capacity_fit = rationale.get("capacity_fit") or {}
        capacity_status_by_position[pos] = capacity_fit.get("fit_status")
        template_id_by_position[pos] = dz.get("v4_template_id")

    classifications: list[dict] = []
    for z in overflow.get("zones") or []:
        zone_position = z.get("position", "?")
        zone_template_id = z.get("template_id") or template_id_by_position.get(zone_position)
        capacity_fit_status = capacity_status_by_position.get(zone_position)

        # zone-self overflow (the frame root itself)
        if z.get("overflowed"):
            cls = classify_overflow(
                # `or 0` also covers a key that is present with a None value.
                excess_y=z.get("excess_y") or 0,
                excess_x=z.get("excess_x") or 0,
                # The zone itself is meant to match the frame-root pattern.
                class_name=_frame_root_class_name(zone_template_id),
                capacity_fit_status=capacity_fit_status,
            )
            cls["source"] = "zone_self_overflow"
            cls["zone_position"] = zone_position
            cls["zone_template_id"] = zone_template_id
            classifications.append(cls)

        # cell-level clipped_inner
        for c in z.get("clipped_inner") or []:
            cls = classify_overflow(
                excess_y=c.get("excess_y") or 0,
                excess_x=c.get("excess_x") or 0,
                class_name=c.get("class_name", ""),
                inner_content_signals=c.get("inner_content_signals") or [],
                capacity_fit_status=capacity_fit_status,
            )
            cls["source"] = "clipped_inner"
            cls["zone_position"] = zone_position
            cls["zone_template_id"] = zone_template_id
            cls["client_height"] = c.get("clientHeight")
            cls["scroll_height"] = c.get("scrollHeight")
            classifications.append(cls)

    # IMP-15 실행-3 (issue #47): image_events scan — image_aspect_mismatch emitter.
    # delta is None ⇒ skip (image not loaded; no false positive).
    # |delta| > IMAGE_ASPECT_DELTA_TOL ⇒ emit classification.
    for ev in overflow.get("image_events") or []:
        delta = ev.get("delta")
        if delta is None or abs(delta) <= IMAGE_ASPECT_DELTA_TOL:
            continue
        classifications.append({
            "category": "image_aspect_mismatch",
            "source": "image_event",
            "zone_position": ev.get("zone_position"),
            "zone_template_id": ev.get("zone_template_id"),
            "src": ev.get("src"),
            "natural_ratio": ev.get("natural_ratio"),
            "rendered_ratio": ev.get("rendered_ratio"),
            "delta": delta,
            "rule_applied": (
                f"|delta|={abs(delta):.4f} > IMAGE_ASPECT_DELTA_TOL="
                f"{IMAGE_ASPECT_DELTA_TOL} (IMP-15 실행-3)"
            ),
        })

    # IMP-15 실행-3 (issue #47): table_events scan — tabular_overflow emitter.
    # wrapper_clipped_index is not None ⇒ skip (clipped_inner already covers this
    # case via zone cascade; honor dedup contract from pipeline producer).
    # excess_x or excess_y > TABLE_SCROLL_TOL_PX ⇒ emit tabular_overflow.
    for ev in overflow.get("table_events") or []:
        if ev.get("wrapper_clipped_index") is not None:
            continue
        excess_x = ev.get("excess_x") or 0
        excess_y = ev.get("excess_y") or 0
        if excess_x > TABLE_SCROLL_TOL_PX or excess_y > TABLE_SCROLL_TOL_PX:
            classifications.append({
                "category": "tabular_overflow",
                "source": "table_event",
                "zone_position": ev.get("zone_position"),
                "zone_template_id": ev.get("zone_template_id"),
                "excess_x": excess_x,
                "excess_y": excess_y,
                "rule_applied": (
                    f"table self-overflow — excess_x={excess_x} or excess_y="
                    f"{excess_y} > TABLE_SCROLL_TOL_PX={TABLE_SCROLL_TOL_PX} "
                    f"(wrapper not clipped; IMP-15 실행-3)"
                ),
            })

    # slide-level / slide-body overflow (outside the zones) is reported as an
    # unclassified signal only.
    unclassified: list[dict] = []
    slide_m = overflow.get("slide") or {}
    if slide_m.get("overflowed"):
        unclassified.append({
            "level": "slide",
            "excess_y": slide_m.get("excess_y"),
            "excess_x": slide_m.get("excess_x"),
            "note": "slide-level overflow — 보통 zone 단위 분류로 충분, 미분류 보고만",
        })
    body_m = overflow.get("slide_body") or {}
    if body_m.get("overflowed"):
        unclassified.append({
            "level": "slide_body",
            "excess_y": body_m.get("excess_y"),
            "excess_x": body_m.get("excess_x"),
            "note": "slide_body overflow — 위와 같음",
        })

    categories = sorted({c["category"] for c in classifications})

    # IMP-15 실행-3 (issue #47): widened semantic — overflow.passed alone is not
    # enough; any image/table classification also flips visual_check_passed.
    visual_check_passed = bool(overflow.get("passed", False)) and not classifications

    return {
        "visual_check_passed": visual_check_passed,
        "classifications": classifications,
        "summary": (
            f"{len(classifications)} overflow event(s) classified, "
            f"categories: {categories or 'none'}"
        ),
        "categories_seen": categories,
        "unclassified_signals": unclassified,
        "placement_diagnostics": placement_diagnostics,
    }