Files
C.E.L_Slide_test2/src/phase_z2_classifier.py
kyeongmin 565e6b092e Add Phase Z classifier placement diagnostics
- consume debug_zones[i].placement_trace in classify_visual_runtime_check
- surface per-zone diagnostic in fit_classification.placement_diagnostics
- preserve canonical render SHA and existing classifier output schema
2026-05-04 17:40:21 +09:00

457 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Phase Z-2 fit_classifier v0 (A1 — 분류 layer 만).
Selenium visual_runtime_check 의 결과 (clipped_inner / zone overflow) 를
spec `docs/architecture/PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md` §3 taxonomy
의 *category* 로 분류하는 layer.
본 모듈은 ***분류만***. action / router / rerender / behavior 변경 X.
출력 = debug.json 의 `fit_classification` trace.
원칙 :
- className 이라는 raw 문자열 → semantic content_type 매핑은 *registry* 가 담당
- excess_y (px) → line_equivalent 환산은 content_type 별 line-height 기준
- category 결정은 spec §3.2 우선순위 그대로 적용 (frame_capacity_mismatch →
tabular → structural_major → layout_zone_mismatch → structural_minor →
moderate → minor → hard_visual_fail)
- 모든 결정은 trace 에 명시 — *어느 룰이 왜 적용됐는지* debug 로 검증 가능
다음 step (별도 — A2) :
overflow_router 가 본 module 의 category 를 받아 action 으로 매핑.
본 step 에서 router 는 X.
"""
from __future__ import annotations
import re
from typing import Optional
# ─── §2 className → semantic content_type registry ───────────────
# spec PHASE-Z-FIT-CLASSIFIER-ROUTER-SPEC.md §2 의 registry 그대로.
# 패턴은 *위에서 아래로* 첫 매칭 우선. 더 specific 한 패턴이 위에 와야 함.
CONTENT_TYPE_PATTERNS: list[tuple[str, str, str]] = [
    # Each row: (regex pattern, semantic_content_type, description).
    # Rows are tried top to bottom against a single class token and the first
    # row whose regex matches wins, so more specific patterns must stay above
    # more general ones.

    # transform-block / transform-row → structural_unit
    # spec: "paired comparison (one AS-IS/TO-BE pair is a single semantic unit)"
    (r"^transform-block(__|$)", "structural_unit",
     "transform-block — paired comparison container"),
    (r"^transform-row(__|$)", "structural_unit",
     "transform-row — AS-IS/TO-BE pair row"),
    (r"^transform-rows$", "structural_unit",
     "transform-rows wrapper"),
    # tabular — any token containing "table" as a whole hyphen/underscore
    # delimited word. (The original note also mentions native <table>; how a
    # class-less <table> would reach this registry is not visible here.)
    (r"(^|[-_])table($|[-_])", "tabular",
     "table — tabular content"),
    # text-line family → text_flow
    (r"^text-line(--|$)", "text_flow",
     "text-line — free-flowing text/bullet"),
    # frame internal cell (a unit cell inside a frame); the bare element or a
    # "--modifier" variant matches, deeper "__child" elements do not.
    (r"^f\d+b__cell(--|$)", "frame_internal_cell",
     "frame internal cell"),
    (r"^f\d+b__pillar(--|$)", "frame_internal_cell",
     "frame internal pillar"),
    (r"^f\d+b__quadrant(--|$)", "frame_internal_cell",
     "frame internal quadrant"),
    # frame label / title / banner / ribbon
    (r"^f\d+b__title$", "frame_label",
     "frame title"),
    (r"^f\d+b__section-title", "frame_label",
     "frame section-title"),
    (r"^f\d+b__banner", "frame_label",
     "frame banner"),
    (r"^f\d+b__ribbon", "frame_label",
     "frame ribbon"),
    # Unanchored: matches "__label" anywhere inside the token.
    (r"__label", "frame_label",
     "frame label"),
    # frame root token itself (e.g. f29b, f13b, f16b)
    (r"^f\d+b$", "frame_internal",
     "frame family root"),
    # visual asset
    (r"__bg(\b|$)", "visual_asset", "background asset"),
    (r"^bg-", "visual_asset", "background asset"),
    (r"__icon(\b|$)", "visual_asset", "icon asset"),
    (r"^img-", "visual_asset", "image asset"),
]
def classify_content_type(class_name: str) -> tuple[str, str]:
    """Map a raw className string to ``(semantic_content_type, match_reason)``.

    The string is split on whitespace into class tokens. Tokens are examined
    left to right, and for each token the rows of ``CONTENT_TYPE_PATTERNS``
    are tried in registry order; the very first (token, row) pair whose regex
    matches decides the result. Token order therefore takes precedence over
    registry order when different tokens would map to different types.

    Returns:
        ``("unknown", "")`` when ``class_name`` is empty/falsy,
        ``("unknown", <reason listing the tokens>)`` when nothing matches,
        otherwise the matched content type plus a human-readable reason.

    Examples:
        'f29b__cell f29b__cell--left'  → ('frame_internal_cell', "...")
        'transform-block'              → ('structural_unit', "...")
        'text-line text-line--bullet'  → ('text_flow', "...")
    """
    if not class_name:
        return ("unknown", "")

    tokens = class_name.strip().split()

    # Lazy scan: evaluation stops at the first hit, exactly like a nested
    # for-loop with an early return.
    hits = (
        (token, pattern, ctype, desc)
        for token in tokens
        for pattern, ctype, desc in CONTENT_TYPE_PATTERNS
        if re.search(pattern, token)
    )
    first_hit = next(hits, None)

    if first_hit is None:
        return ("unknown", f"no pattern matched any of tokens {tokens}")

    token, pattern, ctype, desc = first_hit
    return (ctype, f"token '{token}' matched pattern '{pattern}' ({desc})")
# ─── line_equivalent 환산 ─────────────────────────────────────────
# content_type 별 *대표 단위 height* — excess_y 를 줄(또는 단위) 단위로 환산.
# structural_unit / tabular 의 경우는 "1 단위" = transform-row 또는 table-row.
DEFAULT_UNIT_HEIGHTS: dict[str, float] = dict(
    # transform-row: padding 3+3 + line-height 11×1.45=15.95 ≈ 21.95
    structural_unit=21.95,
    # text-line: font 11 × line-height 1.6 = 17.6
    text_flow=17.6,
    # tabular row: estimate (to be calibrated once real table cases arrive)
    tabular=22.0,
    # frame label / title: font 13 × line-height 1.3 = 16.9
    frame_label=16.9,
    # frame_internal*: conservative default (same as text-line)
    frame_internal=17.6,
    frame_internal_cell=17.6,
    # visual asset: croppable, so a per-unit height carries no real meaning;
    # the value is only a placeholder (same as text-line)
    visual_asset=17.6,
    # unknown: text-line default
    unknown=17.6,
)
def compute_line_equivalent(excess_y: float, content_type: str) -> float:
    """Convert a vertical overflow in px into a number of lines / units.

    The divisor is the representative unit height registered for
    ``content_type`` in ``DEFAULT_UNIT_HEIGHTS``; a content type that is not
    registered falls back to 17.6 px. A non-positive unit height yields 0.0
    instead of dividing. The quotient is rounded to two decimal places.
    """
    unit_height = DEFAULT_UNIT_HEIGHTS.get(content_type, 17.6)
    return 0.0 if unit_height <= 0 else round(float(excess_y) / unit_height, 2)
# ─── §3 taxonomy classifier ──────────────────────────────────────
# spec §3.2 우선순위 :
# 1. frame_capacity_mismatch (composition 결과 우선)
# 2. tabular_overflow
# 3. structural_major_overflow
# 4. layout_zone_mismatch
# 5. structural_minor_overflow
# 6. moderate_overflow
# 7. minor_overflow
# 8. hard_visual_fail (fallback)
def classify_overflow(
    *,
    excess_y: float,
    excess_x: float,
    class_name: str,
    inner_content_signals: Optional[list[str]] = None,
    capacity_fit_status: Optional[str] = None,
) -> dict:
    """Classify one overflow event into a spec §3 category.

    Args:
        excess_y / excess_x: overflow in px as measured by the visual check.
        class_name: captured className string (may hold several tokens).
        inner_content_signals: optional list of content-type names reported
            for the content inside the clipped element. Only consulted when
            the className alone resolves to ``frame_internal_cell``,
            ``frame_internal`` or ``unknown``; in that case the first of
            tabular > structural_unit > text_flow that is present replaces
            the className-derived type.
        capacity_fit_status: fit status from the composition stage; when it
            is one of the mismatch values it decides the category outright.

    Returns:
        dict with keys ``inputs``, ``derived``, ``category`` and
        ``rule_applied`` (in that order). ``excess_x`` is recorded in
        ``inputs`` but does not influence the category.
    """
    inner_content_signals = list(inner_content_signals or [])
    raw_type, type_match = classify_content_type(class_name)

    # A container className says nothing about *what* is being clipped, so
    # let the inner signals override it, highest-priority signal first.
    content_type = raw_type
    refined_via_inner = None
    refinable = raw_type in {"frame_internal_cell", "frame_internal", "unknown"}
    if refinable and inner_content_signals:
        for candidate in ("tabular", "structural_unit", "text_flow"):
            if candidate in inner_content_signals:
                content_type = candidate
                refined_via_inner = f"{candidate} (inner_signal)"
                break

    line_equivalent = compute_line_equivalent(excess_y, content_type)

    trace = {
        "inputs": {
            "excess_y": float(excess_y),
            "excess_x": float(excess_x),
            "class_name": class_name,
            "inner_content_signals": inner_content_signals,
            "capacity_fit_status": capacity_fit_status,
        },
        "derived": {
            # result of looking at the className only
            "container_content_type": raw_type,
            "container_match": type_match,
            # final type after optional refinement via inner signals
            "content_type": content_type,
            "content_type_refined_via_inner": refined_via_inner,
            "line_equivalent": line_equivalent,
            "unit_height_used": DEFAULT_UNIT_HEIGHTS.get(content_type, 17.6),
        },
    }

    flow_like = content_type in {"text_flow", "frame_label"}

    # Priority chain (spec §3.2): the first matching rule wins.
    if capacity_fit_status in {"strict_mismatch", "exceeds_max", "below_min", "exceeds_truncate"}:
        # 1. composition already flagged a capacity mismatch
        category = "frame_capacity_mismatch"
        rule = (
            f"capacity_fit_status='{capacity_fit_status}' — composition 단계의 "
            "capacity_fit 가 이미 mismatch 신호 (spec §3.2 우선순위 1)"
        )
    elif content_type == "tabular":
        # 2. tables are never trimmed row by row, whatever the amount
        category = "tabular_overflow"
        rule = "content_type=tabular — 표는 행 단위 자르면 의미 손실 (spec §3.2 우선순위 2)"
    elif content_type == "structural_unit" and line_equivalent >= 1.0:
        # 3. at least one whole structural unit is cut off
        category = "structural_major_overflow"
        rule = (
            f"content_type=structural_unit AND line_equivalent={line_equivalent} >= 1.0 — "
            "의미 단위 1+ 완전 잘림 (spec §3.2 우선순위 3)"
        )
    elif content_type == "frame_internal":
        # 4. the frame root itself does not fit its zone
        category = "layout_zone_mismatch"
        rule = (
            "content_type=frame_internal — frame root 자체가 zone 안에 못 들어감 "
            "(spec §3.2 우선순위 4)"
        )
    elif content_type == "structural_unit":
        # 5. boundary spill: less than one whole unit is cut
        category = "structural_minor_overflow"
        rule = (
            f"content_type=structural_unit AND line_equivalent={line_equivalent} < 1.0 — "
            "boundary spill (부분 단위 잘림, 완전 단위 손실 아님) (spec §3.2 우선순위 5)"
        )
    elif flow_like and 1.5 < line_equivalent <= 4.0:
        # 6. medium amount of text/label flow
        category = "moderate_overflow"
        rule = (
            f"content_type={content_type} AND line_equivalent={line_equivalent} ∈ (1.5, 4] "
            "(spec §3.2 우선순위 6)"
        )
    elif flow_like and line_equivalent <= 1.5:
        # 7. small amount of text/label flow
        category = "minor_overflow"
        rule = (
            f"content_type={content_type} AND line_equivalent={line_equivalent} ≤ 1.5 "
            "(spec §3.2 우선순위 7)"
        )
    else:
        # 8. fallback: nothing above applied
        category = "hard_visual_fail"
        rule = (
            f"위 매핑 모두 미적용 (content_type={content_type}, line_equivalent="
            f"{line_equivalent}) — fallback (spec §3.2 우선순위 8)"
        )

    trace["category"] = category
    trace["rule_applied"] = rule
    return trace
# ─── visual_runtime_check 결과 → 전체 fit_classification trace ────
def _build_placement_diagnostic_for_zone(
zone_position: str,
placement_trace: Optional[dict],
mapper_template_id: Optional[str],
) -> dict:
"""zone 별 placement diagnostic 빌더 — placement_trace from B4 → 진단 dict.
phase_z2_pipeline.py 의 trace-only B1→B2→B4 chain 결과 (debug_zones[i].placement_trace)
를 per-zone surface 한 진단. classifier 는 *consume only* — placement_trace
raw 구조는 미변경.
Args:
zone_position : zone position ("top" / "bottom_l" 등)
placement_trace : phase_z2_pipeline.py 의 placement_trace dict 또는 None
mapper_template_id : 기존 mapper / V4 가 선택한 frame template_id
(placement_trace 에 mapper_frame_template_id 누락 시 fallback)
Returns:
per-zone placement diagnostic dict (shape-stable, missing fields = None / 0).
"""
if placement_trace is None:
return {
"zone_position": zone_position,
"mapper_frame_template_id": mapper_template_id,
"b4_selected_template_id": None,
"frame_selection_matches_mapper": None,
"frame_selection_match_note": "no placement_trace recorded",
"region_count": 0,
"slot_assignment_count": 0,
"rejection_count": 0,
}
return {
"zone_position": zone_position,
"mapper_frame_template_id": (
placement_trace.get("mapper_frame_template_id") or mapper_template_id
),
"b4_selected_template_id": placement_trace.get("selected_template_id"),
"frame_selection_matches_mapper": placement_trace.get("frame_selection_matches_mapper"),
"frame_selection_match_note": placement_trace.get("frame_selection_match_note"),
"region_count": len(placement_trace.get("internal_regions") or []),
"slot_assignment_count": len(placement_trace.get("slot_assignments") or []),
"rejection_count": len(placement_trace.get("rejection") or []),
}
def classify_visual_runtime_check(overflow: dict, debug_zones: list[dict]) -> dict:
"""Selenium overflow + composition 의 zone debug → 전체 fit_classification 산출.
각 overflow event (zone-self overflow / cell-level clipped_inner) 를 개별 분류.
Args:
overflow : run_overflow_check 결과 (passed, slide, zones[], ...)
debug_zones : pipeline 의 debug_zones list (zone 별 capacity_fit / template_id 등)
Returns:
dict :
visual_check_passed : Selenium 통과 여부
classifications : 각 overflow event 의 분류 결과 list
summary : 텍스트 요약 (n events, categories seen)
categories_seen : 등장한 카테고리 unique list
unclassified_signals : 미분류 신호 (raw Selenium 결과 중 분류 안 된 것)
placement_diagnostics : per-zone placement_trace 진단 (B4 vs mapper
divergence + region / slot_assignment / rejection
count) — passed 여부 무관 항상 surface
"""
# placement_diagnostics — debug_zones[i].placement_trace 를 per-zone diagnostic 으로 surface.
# passed 여부 무관 항상 빌드 (B4 vs mapper divergence 가 passed 에서도 진단 가치).
placement_diagnostics = [
_build_placement_diagnostic_for_zone(
zone_position=dz.get("position", "?"),
placement_trace=dz.get("placement_trace"),
mapper_template_id=dz.get("v4_template_id"),
)
for dz in (debug_zones or [])
]
if overflow.get("passed", False):
return {
"visual_check_passed": True,
"classifications": [],
"summary": "visual check passed — no overflow to classify",
"categories_seen": [],
"unclassified_signals": [],
"placement_diagnostics": placement_diagnostics,
}
# zone position → debug_zones 매핑 (capacity_fit_status 추출용)
capacity_status_by_position: dict[str, Optional[str]] = {}
template_id_by_position: dict[str, Optional[str]] = {}
for dz in (debug_zones or []):
pos = dz.get("position")
capacity_status_by_position[pos] = (
(dz.get("composition_rationale") or {})
.get("capacity_fit", {})
.get("fit_status")
)
template_id_by_position[pos] = dz.get("v4_template_id")
classifications: list[dict] = []
for z in overflow.get("zones", []):
zone_position = z.get("position", "?")
zone_template_id = z.get("template_id") or template_id_by_position.get(zone_position)
capacity_fit_status = capacity_status_by_position.get(zone_position)
# zone-self overflow (frame root 자체)
if z.get("overflowed"):
cls = classify_overflow(
excess_y=z.get("excess_y", 0),
excess_x=z.get("excess_x", 0),
class_name=zone_template_id and f"f{re.sub(r'[^0-9]', '', str(zone_template_id))[:2] or '0'}b" or "f?b",
# zone 자체는 frame root 패턴 매칭 → frame_internal 으로 분류 의도
capacity_fit_status=capacity_fit_status,
)
cls["source"] = "zone_self_overflow"
cls["zone_position"] = zone_position
cls["zone_template_id"] = zone_template_id
classifications.append(cls)
# cell-level clipped_inner
for c in z.get("clipped_inner", []):
cls = classify_overflow(
excess_y=c.get("excess_y", 0),
excess_x=c.get("excess_x", 0),
class_name=c.get("class_name", ""),
inner_content_signals=c.get("inner_content_signals") or [],
capacity_fit_status=capacity_fit_status,
)
cls["source"] = "clipped_inner"
cls["zone_position"] = zone_position
cls["zone_template_id"] = zone_template_id
cls["client_height"] = c.get("clientHeight")
cls["scroll_height"] = c.get("scrollHeight")
classifications.append(cls)
# slide-level / slide-body overflow (zones 외부) 도 분류 시도 (보통 zone-level 에서 잡히지만 보조)
unclassified: list[dict] = []
slide_m = overflow.get("slide") or {}
if slide_m.get("overflowed"):
unclassified.append({
"level": "slide",
"excess_y": slide_m.get("excess_y"),
"excess_x": slide_m.get("excess_x"),
"note": "slide-level overflow — 보통 zone 단위 분류로 충분, 미분류 보고만",
})
body_m = overflow.get("slide_body") or {}
if body_m.get("overflowed"):
unclassified.append({
"level": "slide_body",
"excess_y": body_m.get("excess_y"),
"excess_x": body_m.get("excess_x"),
"note": "slide_body overflow — 위와 같음",
})
categories = sorted({c["category"] for c in classifications})
return {
"visual_check_passed": False,
"classifications": classifications,
"summary": (
f"{len(classifications)} overflow event(s) classified, "
f"categories: {categories or 'none'}"
),
"categories_seen": categories,
"unclassified_signals": unclassified,
"placement_diagnostics": placement_diagnostics,
}