From 761a43da5ed46e5f4cf46697787db26caaaf8224 Mon Sep 17 00:00:00 2001 From: kyeongmin Date: Thu, 7 May 2026 05:26:57 +0900 Subject: [PATCH] Add Phase Z B4 source-shape-aware placement - enable B1/B2/B4 source-shape-aware F13 placement behind env flag - align F13 placement_trace with mapper top_bullets cardinality - preserve canonical render output when flag is off --- src/phase_z2_content_extractor.py | 53 ++++++++++++++--- src/phase_z2_internal_region_planner.py | 76 +++++++++++++++++++++++++ src/phase_z2_pipeline.py | 12 +++- src/phase_z2_placement_planner.py | 33 ++++++----- 4 files changed, 151 insertions(+), 23 deletions(-) diff --git a/src/phase_z2_content_extractor.py b/src/phase_z2_content_extractor.py index f346a24..8c579f7 100644 --- a/src/phase_z2_content_extractor.py +++ b/src/phase_z2_content_extractor.py @@ -26,6 +26,7 @@ from __future__ import annotations import re from dataclasses import dataclass, field +from typing import Optional # B1 v0 helper 처리 정직 기록 (기존 보고 정정 — 2026-04-30) : # - `phase_z2_mapper` 미수정. 기존 mapper helper (`_extract_markdown_table` 등) move / @@ -46,12 +47,14 @@ class ContentObject: """SPEC v1 §1.1 base schema. v0 = text_block + transform_table 만 지원. Fields : - id : section 내 unique id (예: '03-2.transform-1' / '03-2.text-1') - type : "text_block" | "transform_table" - role : v0 = "summary" 만 (정밀화는 별 axis) - raw_payload : 원본 markdown (자름 / 변형 X — 원문 보존 룰) - size_estimate : type 별 (line_count / rows 등) - type_specific : type 별 detail (SPEC v1 §1.2) + id : section 내 unique id (예: '03-2.transform-1' / '03-2.text-1') + type : "text_block" | "transform_table" + role : v0 = "summary" 만 (정밀화는 별 axis) + raw_payload : 원본 markdown (자름 / 변형 X — 원문 보존 룰) + size_estimate : type 별 (line_count / rows 등) + type_specific : type 별 detail (SPEC v1 §1.2) + source_shape_index : positional index within source_shape (Option 1, optional) + source_shape_kind : "top_bullets" | "h3_subsections" | ... (Option 1, optional) """ id: str @@ -60,6 +63,8 @@ class ContentObject: raw_payload: str size_estimate: dict = field(default_factory=dict) type_specific: dict = field(default_factory=dict) + source_shape_index: Optional[int] = None + source_shape_kind: Optional[str] = None # ─── Transform table extraction ───────────────────────────────── @@ -187,7 +192,7 @@ def _detect_text_block_specific(content: str) -> tuple[dict, int]: # ─── Public entry ─────────────────────────────────────────────── -def extract_content_objects(section) -> list[ContentObject]: +def extract_content_objects(section, source_shape: Optional[str] = None) -> list[ContentObject]: """MDX section.raw_content → typed content_object list (SPEC v1 §1). v0 minimal : @@ -196,15 +201,45 @@ def extract_content_objects(section) -> list[ContentObject]: - 미지원 type (table / image / diagram / details) = 무시 (별 axis) - 원문 (raw_payload) = 자름 / 변형 X (원문 보존 룰) + Option 1 (source_shape-aware) : + - source_shape="top_bullets" : raw_content 를 mapper.split_source 로 N units 분할 → + unit 별 ContentObject 1 개 (text_block) with source_shape_index=i / source_shape_kind="top_bullets" + - source_shape=None 또는 미지원 값 (h3_subsections 등) : 기존 legacy 동작 + Args : - section : MdxSection-like 객체 (section_id, raw_content 필드 필요) + section : MdxSection-like 객체 (section_id, raw_content 필드 필요) + source_shape : "top_bullets" 시 source_shape-aware 분기. None 이면 legacy. Returns : - list[ContentObject] — 0 ~ 2 개 (content 비어 있으면 0, transform-only 면 1, mixed 면 2) + list[ContentObject] — legacy 0~2 / top_bullets N (bullet 수) """ content = section.raw_content section_id = section.section_id + if source_shape == "top_bullets": + from phase_z2_mapper import split_source + units = split_source("top_bullets", content) + objects: list[ContentObject] = [] + for i, unit in enumerate(units): + unit_text = unit if isinstance(unit, str) else str(unit) + if not unit_text.strip(): + continue + text_specific, line_count = _detect_text_block_specific(unit_text) + objects.append( + ContentObject( + id=f"{section_id}.text-{i + 1}", + type="text_block", + role="summary", + raw_payload=unit_text.strip(), + size_estimate={"line_count": line_count}, + type_specific=text_specific, + source_shape_index=i, + source_shape_kind="top_bullets", + ) + ) + return objects + + # legacy path (source_shape=None 또는 미지원 값) objects: list[ContentObject] = [] # 1. transform_table 추출 시도 (3-col with arrow) diff --git a/src/phase_z2_internal_region_planner.py b/src/phase_z2_internal_region_planner.py index 8a9547b..115c1a9 100644 --- a/src/phase_z2_internal_region_planner.py +++ b/src/phase_z2_internal_region_planner.py @@ -62,6 +62,7 @@ class InternalRegion: — B2 v0 에서 kind="frame_match" / frame_id=None / display_strategy="inline_full" 고정. 실제 frame 결정은 Step 9 / B4 책임. + source_shape_index : positional index from B1 source_shape split (Option 1, optional) """ region_id: str @@ -70,6 +71,7 @@ class InternalRegion: ratio_estimate: float content_unit_ids: list[str] frame_match_strategy: dict + source_shape_index: Optional[int] = None @dataclass @@ -146,6 +148,75 @@ _TYPE_ORDER_PRIORITY: dict[str, int] = { } +# ─── Option 1 source_shape-aware planner ───────────────────────── + + +def _plan_by_source_shape_index( + content_objects: list[ContentObject], + section_id: str, +) -> ZoneRegionPlan: + """source_shape_index 기준 *positional* region grouping (Option 1). + + 같은 source_shape_index 끼리 1 region. mapper 의 split_source 와 cardinality align — + F13 의 top_bullets 3 개 → 3 region 으로 mapper pillar_1/2/3 와 1:1 positional. + """ + groups: dict[int, list[ContentObject]] = {} + for obj in content_objects: + if obj.source_shape_index is None: + continue + groups.setdefault(obj.source_shape_index, []).append(obj) + + sorted_indices = sorted(groups.keys()) + + # size proxy + ratio (positional region 내부 size_estimate 합산) + index_sizes: dict[int, float] = {idx: sum(_size_proxy(o) for o in groups[idx]) for idx in sorted_indices} + total_size = sum(index_sizes.values()) + if total_size <= 0: + equal_share = 1.0 / max(len(sorted_indices), 1) + index_sizes = {idx: equal_share for idx in sorted_indices} + total_size = sum(index_sizes.values()) or 1.0 + + regions: list[InternalRegion] = [] + for ord_idx, sidx in enumerate(sorted_indices, start=1): + objs = groups[sidx] + # role / content_type : group 내 첫 obj 의 type 기반 (Option 1 pilot = text_block 동질) + primary_obj = objs[0] + ctype = primary_obj.type + regions.append( + InternalRegion( + region_id=f"{section_id}.region-{ord_idx}", + role=_TYPE_ROLE.get(ctype, "primary"), + content_type=ctype, + ratio_estimate=round(index_sizes[sidx] / total_size, 4), + content_unit_ids=[o.id for o in objs], + frame_match_strategy={ + "kind": "frame_match", + "frame_id": None, + "display_strategy": "inline_full", + }, + source_shape_index=sidx, + ) + ) + + region_count = len(regions) + if region_count == 1: + layout_type = "region-single" + placement = "single" + else: + layout_type = "region-vertical-stack" + placement = "vertical" + + region_order = [r.region_id for r in regions] + return ZoneRegionPlan( + internal_regions=regions, + region_layout=RegionLayout( + region_layout_type=layout_type, + region_order=region_order, + region_placement=placement, + ), + ) + + # ─── Public entry ──────────────────────────────────────────────── @@ -186,6 +257,11 @@ def plan_internal_regions( if not content_objects: return ZoneRegionPlan() + # Option 1 source_shape-aware path : ContentObjects 가 source_shape_index 보유 시 *positional* + # grouping. 같은 index 끼리 1 region. mapper 의 split_source 와 cardinality align. + if any(o.source_shape_index is not None for o in content_objects): + return _plan_by_source_shape_index(content_objects, section_id) + # 1. type 별 grouping groups = _group_by_type_preserving_order(content_objects) diff --git a/src/phase_z2_pipeline.py b/src/phase_z2_pipeline.py index d6cb393..1e6e2a7 100644 --- a/src/phase_z2_pipeline.py +++ b/src/phase_z2_pipeline.py @@ -1063,7 +1063,17 @@ def run_phase_z2_mvp1(mdx_path: Path, run_id: Optional[str] = None) -> Path: # 결과 (PlacementPlan) = debug_zones[i].placement_trace 로 *기록만*. # render path / mapper output / final.html 모두 미변경 — B5 baseline SHA 유지. # B4 frame selection = catalog declaration order (V4 evidence 미사용 — 별 axis). - content_objects = extract_content_objects(synth_section) + # Option 1 (PHASE_Z_B4_SOURCE_SHAPE_ENABLED, default OFF) : pilot = F13 top_bullets only. + b4_source_shape_enabled = ( + os.environ.get("PHASE_Z_B4_SOURCE_SHAPE_ENABLED", "").strip().lower() + in {"1", "true", "yes"} + ) + b1_source_shape = ( + contract.get("source_shape") + if b4_source_shape_enabled and contract.get("source_shape") == "top_bullets" + else None + ) + content_objects = extract_content_objects(synth_section, source_shape=b1_source_shape) placement_plan = plan_placement( content_objects=content_objects, frame_contracts=list(load_frame_contracts().values()), diff --git a/src/phase_z2_placement_planner.py b/src/phase_z2_placement_planner.py index c123e3a..2213e91 100644 --- a/src/phase_z2_placement_planner.py +++ b/src/phase_z2_placement_planner.py @@ -115,26 +115,33 @@ def _assign_region_to_sub_zone( frame_sub_zones: list[dict[str, Any]], assigned_sub_zone_ids: set[str], ) -> Optional[dict[str, Any]]: - """region 에 매칭할 sub_zone 선택 (B4 v0 narrowest-first heuristic). + """region 에 매칭할 sub_zone 선택. - rule (B4 v0 lock — F29 deadlock 방지) : + Option 1 (source_shape-aware) : + region.source_shape_index 보유 시 *positional 1:1* — frame_sub_zones[index] 채택. + accepts mismatch 또는 already assigned 시 None (rejection). + + Legacy (B4 v0 narrowest-first heuristic) : 1. not-yet-assigned 중 region.content_type 을 accepts 하는 후보 수집 2. 후보 중 accepts list 가장 *좁은* sub_zone 우선 3. 동률이면 declaration order (Python sort 의 stability 활용) - 예 (F29) : - region.content_type = text_block - candidates = [process_column(accepts=[text,transform], size 2), - product_column(accepts=[text], size 1)] - → product_column 선택 (narrowest) - - region.content_type = transform_table (이후 호출, product_column 이미 assigned) - candidates = [process_column] 만 - → process_column 선택 - Returns : - sub_zone dict 또는 None (compatible 후보 없음) + sub_zone dict 또는 None (compatible 후보 없음 / positional accepts mismatch) """ + # Option 1 positional matching + if region.source_shape_index is not None: + if region.source_shape_index >= len(frame_sub_zones): + return None + sz = frame_sub_zones[region.source_shape_index] + if sz["id"] in assigned_sub_zone_ids: + return None + accepts = sz.get("accepts") or [] + if region.content_type not in accepts: + return None + return sz + + # Legacy narrowest-first candidates: list[dict[str, Any]] = [] for sz in frame_sub_zones: if sz["id"] in assigned_sub_zone_ids: