diff --git a/src/phase_z2_content_extractor.py b/src/phase_z2_content_extractor.py index 8c579f7..261ee1e 100644 --- a/src/phase_z2_content_extractor.py +++ b/src/phase_z2_content_extractor.py @@ -48,13 +48,16 @@ class ContentObject: Fields : id : section 내 unique id (예: '03-2.transform-1' / '03-2.text-1') - type : "text_block" | "transform_table" + type : "text_block" | "transform_table" | "details" | "image" | "table" role : v0 = "summary" 만 (정밀화는 별 axis) raw_payload : 원본 markdown (자름 / 변형 X — 원문 보존 룰) size_estimate : type 별 (line_count / rows 등) type_specific : type 별 detail (SPEC v1 §1.2) source_shape_index : positional index within source_shape (Option 1, optional) source_shape_kind : "top_bullets" | "h3_subsections" | ... (Option 1, optional) + scope : "section" (default for v0) | "slide" (IMP-03 rich objects) + mdx_id : 2-digit MDX id (예: '03') — slide-level rich objects 용 + section_id : section 매핑 — slide-level rich objects 는 None """ id: str @@ -65,6 +68,9 @@ class ContentObject: type_specific: dict = field(default_factory=dict) source_shape_index: Optional[int] = None source_shape_kind: Optional[str] = None + scope: Optional[str] = None + mdx_id: Optional[str] = None + section_id: Optional[str] = None # ─── Transform table extraction ───────────────────────────────── @@ -274,6 +280,169 @@ def extract_content_objects(section, source_shape: Optional[str] = None) -> list return objects +# ─── IMP-03 (Step 3) — rich ContentObject extractor (slide-level) ─ + +# scope-lock 16 조건 (Gitea #3) : +# - SPEC v1 §1.2 의 table / image / details 3 type 추가 (diagram 제외) +# - 별 함수 분리 — v0 `extract_content_objects` signature/behavior 미터치 +# - slide-level attribution — section param 없음, id = `{mdx_id}.{type}-N`, +# ContentObject.scope='slide' / mdx_id= / section_id=None +# - transform_table dedup — arrow row 감지 시 skip (v0 가 단독 source) +# - asset row shape contract (mdx_normalizer SoT) : +# popup = {title:str, content:str} +# image = {alt:str, path:str} +# table = 
#       {headers: list[str], rows: list[list[str]]}
#   - render path is not connected — Step 3 artifact trace only
#   - plan_placement() receives *only the v0 list* (no B4 regression)


def _cell_text(cell) -> str:
    """Return the text of a table cell; ``None`` is rendered as an empty string."""
    return "" if cell is None else str(cell)


def _looks_like_transform_table(table: dict) -> bool:
    """Detect whether a normalized table has an AS-IS / arrow / TO-BE shape.

    A table is classified as a transform table as soon as any body cell or
    any header cell contains one of the module-level ``_ARROW_GLYPHS``.

    Args:
        table: ``{"headers": list[str], "rows": list[list[str]]}``; missing
            or falsy ``headers`` / ``rows`` are treated as empty.

    Returns:
        True when an arrow glyph is found (the rich extractor skips such a
        table), False for an ordinary table.
    """
    def has_arrow(cell) -> bool:
        text = _cell_text(cell)
        return any(glyph in text for glyph in _ARROW_GLYPHS)

    rows = table.get("rows") or []
    if any(has_arrow(cell) for row in rows for cell in row):
        return True
    headers = table.get("headers") or []
    return any(has_arrow(header) for header in headers)


def _reconstruct_markdown_table(headers: list, rows: list) -> str:
    """Rebuild a markdown table string from headers and rows.

    The result is used as ``raw_md`` / ``raw_payload``. ``None`` cells are
    rendered as empty text (not the literal ``"None"``), matching how
    ``_looks_like_transform_table`` reads cells. A header line and a
    ``---`` separator line are emitted only when ``headers`` is non-empty.

    Returns:
        The markdown text, or ``""`` when both ``headers`` and ``rows`` are
        empty.
    """
    if not headers and not rows:
        return ""
    lines: list[str] = []
    if headers:
        lines.append("| " + " | ".join(_cell_text(h) for h in headers) + " |")
        lines.append("|" + "|".join("---" for _ in headers) + "|")
    lines.extend(
        "| " + " | ".join(_cell_text(c) for c in row) + " |" for row in rows
    )
    return "\n".join(lines)


def extract_rich_content_objects(
    normalized_assets: Optional[dict],
    mdx_id: str,
) -> "tuple[list[ContentObject], list[dict]]":
    """IMP-03 — slide-level rich ContentObject extractor.

    Turns the flat popup / image / table lists in ``normalized_assets`` into
    typed ``ContentObject`` instances with slide-level attribution
    (``scope='slide'``, ``section_id=None``, id = ``{mdx_id}.{type}-N``).

    Tables in which ``_looks_like_transform_table`` finds an arrow glyph are
    not emitted; a diagnostic entry with reason
    ``skipped_transform_table_duplicate`` is recorded instead (per the
    scope-lock note, the v0 extractor is intended to be the only source of
    transform tables). Table entries that are not dicts are recorded with
    reason ``invalid_table_shape``.

    Popup and image entries that are not dicts are still emitted, with empty
    text fields. Numbering is 1-based per asset kind and follows the input
    position, so a skipped table leaves a gap in the ``table-N`` sequence.

    Args:
        normalized_assets: ``{"popups": [{title, content}],
            "images": [{alt, path}], "tables": [{headers, rows}]}`` or a
            falsy value (``None`` / ``{}``).
        mdx_id: MDX id used as the id prefix (for example ``'03'``).

    Returns:
        ``(rich_objects, skip_diagnostics)`` — the emitted slide-level
        objects and one dict per skipped table (``index``, ``reason``, and
        ``headers`` for transform-table skips).
    """
    if not normalized_assets:
        return [], []

    objects: list = []
    skips: list[dict] = []

    # details (popups) — numbered 1..N
    for i, popup in enumerate(normalized_assets.get("popups") or [], start=1):
        fields = popup if isinstance(popup, dict) else {}
        title = (fields.get("title") or "").strip()
        body = (fields.get("content") or "").strip()
        # An empty body counts as zero lines; otherwise newlines + 1.
        line_count = body.count("\n") + 1 if body else 0
        objects.append(ContentObject(
            id=f"{mdx_id}.details-{i}",
            type="details",
            role="summary",
            raw_payload=body,
            size_estimate={"line_count": line_count, "bytes": len(body)},
            type_specific={
                "summary": title,
                "body_raw": body,
                "display_hint": "popup",
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    # images — numbered 1..N
    for i, image in enumerate(normalized_assets.get("images") or [], start=1):
        fields = image if isinstance(image, dict) else {}
        src = (fields.get("path") or "").strip()
        alt = (fields.get("alt") or "").strip()
        objects.append(ContentObject(
            id=f"{mdx_id}.image-{i}",
            type="image",
            role="summary",
            raw_payload=src,
            size_estimate={"bytes": len(src)},
            type_specific={
                "src": src,
                "alt": alt,
                # Intrinsic geometry is not measured here; left unset.
                "aspect_ratio": None,
                "intrinsic_width_px": None,
                "intrinsic_height_px": None,
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    # tables — skipped when an arrow glyph marks them as transform tables
    for i, table in enumerate(normalized_assets.get("tables") or [], start=1):
        if not isinstance(table, dict):
            skips.append({"index": i, "reason": "invalid_table_shape"})
            continue
        if _looks_like_transform_table(table):
            skips.append({
                "index": i,
                "reason": "skipped_transform_table_duplicate",
                "headers": table.get("headers") or [],
            })
            continue
        headers = table.get("headers") or []
        rows = table.get("rows") or []
        raw_md = _reconstruct_markdown_table(headers, rows)
        # Header-less tables used to report cols == 0; fall back to the
        # widest row so the column count reflects the actual data.
        if headers:
            col_count = len(headers)
        else:
            col_count = max((len(row) for row in rows), default=0)
        objects.append(ContentObject(
            id=f"{mdx_id}.table-{i}",
            type="table",
            role="summary",
            raw_payload=raw_md,
            size_estimate={"rows": len(rows), "bytes": len(raw_md)},
            type_specific={
                "rows": len(rows),
                "cols": col_count,
                "header_present": bool(headers),
                "is_transform": False,
                "raw_md": raw_md,
            },
            scope="slide",
            mdx_id=mdx_id,
            section_id=None,
        ))

    return objects, skips


# ─── Self-test (B1 v0 correctness check) ────────────────────────


def _run_rich_self_test():
    """IMP-03 (Step 3) — self-test for the rich ContentObject extractor.

    Cases:
        1. popup -> details ContentObject
        2. image -> image ContentObject
        3. table (non-transform) -> table ContentObject
        4. table (arrow) -> skipped (transform_table dedup)
        5. empty / None normalized_assets -> empty result
        6. header-less table with a None cell -> cols from rows, no "None" text
    """

    # ─── Test 1 : popup → details ───
    assets1 = {
        "popups": [{"title": "F13 안", "content": "정책 사례 정리 ...\n2 번째 줄"}],
        "images": [],
        "tables": [],
    }
    rich1, skips1 = extract_rich_content_objects(assets1, mdx_id="03")
    assert len(rich1) == 1 and not skips1, f"popup → 1 obj, got rich={len(rich1)} skips={len(skips1)}"
    o = rich1[0]
    assert o.id == "03.details-1"
    assert o.type == "details" and o.role == "summary"
    assert o.scope == "slide" and o.mdx_id == "03" and o.section_id is None
    assert o.type_specific["summary"] == "F13 안"
    assert o.type_specific["display_hint"] == "popup"
    assert "정책 사례" in o.type_specific["body_raw"]
    print("[OK] Rich Test 1 (popup → details) passed.")

    # ─── Test 2 : image ───
    assets2 = {
        "popups": [],
        "images": [{"alt": "BIM 모델", "path": "img/bim.png"}],
        "tables": [],
    }
    rich2, skips2 = extract_rich_content_objects(assets2, mdx_id="03")
    assert len(rich2) == 1 and not skips2
    o = rich2[0]
    assert o.id == "03.image-1"
    assert o.type == "image" and o.scope == "slide"
    assert o.type_specific["src"] == "img/bim.png"
    assert o.type_specific["alt"] == "BIM 모델"
    assert o.type_specific["aspect_ratio"] is None
    print("[OK] Rich Test 2 (image) passed.")

    # ─── Test 3 : non-transform table ───
    assets3 = {
        "popups": [],
        "images": [],
        "tables": [{"headers": ["분류", "내용"], "rows": [["기술", "BIM"], ["인력", "전문가"]]}],
    }
    rich3, skips3 = extract_rich_content_objects(assets3, mdx_id="03")
    assert len(rich3) == 1 and not skips3
    o = rich3[0]
    assert o.id == "03.table-1"
    assert o.type == "table" and o.scope == "slide"
    assert o.type_specific["rows"] == 2 and o.type_specific["cols"] == 2
    assert o.type_specific["header_present"] is True
    assert o.type_specific["is_transform"] is False
    assert "기술" in o.type_specific["raw_md"]
    print("[OK] Rich Test 3 (non-transform table) passed.")

    # ─── Test 4 : arrow transform table → skip ───
    assets4 = {
        "popups": [],
        "images": [],
        "tables": [{"headers": ["AS-IS", "➜", "TO-BE"],
                    "rows": [["도면 중심", "➜", "BIM 중심"]]}],
    }
    rich4, skips4 = extract_rich_content_objects(assets4, mdx_id="03")
    assert len(rich4) == 0, f"arrow table → 0 rich obj 기대, got {len(rich4)}"
    assert len(skips4) == 1
    assert skips4[0]["reason"] == "skipped_transform_table_duplicate"
    print("[OK] Rich Test 4 (arrow table → skip) passed.")

    # ─── Test 5 : empty normalized_assets → empty ───
    rich5, skips5 = extract_rich_content_objects(None, mdx_id="03")
    assert rich5 == [] and skips5 == []
    rich6, skips6 = extract_rich_content_objects({}, mdx_id="03")
    assert rich6 == [] and skips6 == []
    print("[OK] Rich Test 5 (empty) passed.")

    # ─── Test 6 : header-less table with a None cell ───
    assets7 = {
        "popups": [],
        "images": [],
        "tables": [{"headers": [], "rows": [["a", None], ["b", "c"]]}],
    }
    rich7, skips7 = extract_rich_content_objects(assets7, mdx_id="03")
    assert len(rich7) == 1 and not skips7
    o = rich7[0]
    assert o.type_specific["header_present"] is False
    assert o.type_specific["rows"] == 2 and o.type_specific["cols"] == 2
    assert "None" not in o.type_specific["raw_md"]
    print("[OK] Rich Test 6 (header-less table) passed.")

    print("\n=== IMP-03 rich extractor self-test PASS ===")


if __name__ == "__main__":
    _run_self_test()
    _run_rich_self_test()
enrich_retry_trace_with_failure_classificati # trace-only runtime 연결 v0 — B1 → B4 chain. # final.html / mapper / render path 미영향. debug_zones[i].placement_trace 만 기록. -from phase_z2_content_extractor import extract_content_objects +from phase_z2_content_extractor import extract_content_objects, extract_rich_content_objects from phase_z2_placement_planner import plan_placement @@ -223,7 +223,7 @@ def _stage0_chained_adapter( legacy_slide_title: str, legacy_sections: list[MdxSection], legacy_footer: Optional[str], -) -> tuple[str, list[MdxSection], Optional[str], dict]: +) -> tuple[str, list[MdxSection], Optional[str], dict, dict]: """IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input. Chain: mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections @@ -233,7 +233,9 @@ def _stage0_chained_adapter( output with diagnostics indicating disabled. When ON, runs adapter chain; on any hard contract failure or exception, falls back to legacy and records fallback_reason. - Returns (slide_title, sections, footer, diagnostics). + Returns (slide_title, sections, footer, diagnostics, normalized_assets). + normalized_assets = {"popups": [...], "images": [...], "tables": [...]} + — IMP-03 Step 3 handoff. env=OFF or hard fallback 시 빈 list. """ diagnostics: dict = { "enabled": False, @@ -243,12 +245,14 @@ def _stage0_chained_adapter( "adapter_counts": None, "legacy_counts": {"sections": len(legacy_sections)}, } + # IMP-03 — Step 3 handoff. env=OFF / fallback 시 모든 list 가 비어 있음. + normalized_assets: dict = {"popups": [], "images": [], "tables": []} raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower() enabled = raw_flag in {"1", "true", "yes"} diagnostics["enabled"] = enabled if not enabled: - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets try: # Defer imports — legacy path must not depend on these modules. 
@@ -259,12 +263,12 @@ def _stage0_chained_adapter( normalized = normalize_mdx_content(raw_mdx) if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list): diagnostics["fallback_reason"] = "MISSING_INVALID_IDS" - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets majors = extract_major_sections(normalized["sections"]) if not majors: diagnostics["fallback_reason"] = "NO_USABLE_SECTIONS" - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title conclusion = extract_conclusion_text(raw_mdx) @@ -304,10 +308,10 @@ def _stage0_chained_adapter( if section_num <= 0: diagnostics["fallback_reason"] = "NON_POSITIVE_SECTION_NUM" - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets if section_num in used_nums: diagnostics["fallback_reason"] = "DUPLICATE_IDS" - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets used_nums.add(section_num) diagnostics["id_reconstruction_log"].append({ @@ -337,12 +341,19 @@ def _stage0_chained_adapter( "footer_match": adapter_footer == legacy_footer, } diagnostics["used"] = True - return adapter_title, adapter_sections, adapter_footer, diagnostics + # IMP-03 — populate Step 3 handoff (success path only). + # All fallback paths leave normalized_assets as empty lists (defined at fn top). 
+ normalized_assets = { + "popups": normalized.get("popups", []) or [], + "images": normalized.get("images", []) or [], + "tables": normalized.get("tables", []) or [], + } + return adapter_title, adapter_sections, adapter_footer, diagnostics, normalized_assets except Exception as exc: # noqa: BLE001 — adapter must never break legacy path diagnostics["fallback_reason"] = "ADAPTER_EXCEPTION" diagnostics["exception"] = repr(exc) - return legacy_slide_title, legacy_sections, legacy_footer, diagnostics + return legacy_slide_title, legacy_sections, legacy_footer, diagnostics, normalized_assets # ─── V4 lookup ────────────────────────────────────────────────── @@ -1480,7 +1491,14 @@ def run_phase_z2_mvp1( # (mdx_normalizer + section_parser) replaces legacy parse_mdx output; # on any contract failure or exception, falls back to legacy with # fallback_reason recorded in stage0_adapter_diagnostics. - slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter( + # IMP-03 — 5-tuple return adds stage0_normalized_assets (Step 3 handoff). + ( + slide_title, + sections, + slide_footer, + stage0_adapter_diagnostics, + stage0_normalized_assets, + ) = _stage0_chained_adapter( mdx_path, legacy_slide_title, legacy_sections, legacy_footer, ) _adapter_tag = ( @@ -1516,6 +1534,10 @@ def run_phase_z2_mvp1( # IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction # trace + count diff. Out of scope: V4 / align / composition. "stage0_adapter_diagnostics": stage0_adapter_diagnostics, + # IMP-03 — Step 3 handoff (slide-level rich asset list). + # env=OFF / fallback 시 모든 list 가 비어 있음. consumer = Step 3 + # rich extractor (PHASE_Z_STEP3_RICH_OBJECTS_ENABLED canary). + "stage0_normalized_assets": stage0_normalized_assets, }, step_status="partial", pipeline_path_connected=True, @@ -1525,7 +1547,8 @@ def run_phase_z2_mvp1( "parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. 
" "heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). " "orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. " - "stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary)." + "stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary). " + "stage0_normalized_assets = IMP-03 Step 3 slide-level handoff (popups/images/tables list)." ), ) @@ -1891,6 +1914,49 @@ def run_phase_z2_mvp1( }) # ─── Step 3: Content Object 추출 (B1, trace-only) ─── + # IMP-03 — slide-level rich ContentObject 추출 (default OFF canary). + # scope-lock 16 조건 (Gitea #3) : + # - 별 함수 (extract_rich_content_objects) — v0 extract_content_objects unchanged + # - slide-level — section_id=None, id=`{mdx_id}.{type}-N`, scope='slide' + # - root-level once (per-zone duplication X) + # - plan_placement() 는 v0 list 만 받음 (B4 회귀 X) — 본 rich 결과는 artifact only + # - transform_table dedup : arrow row 감지 시 skip + rich_flag = os.environ.get("PHASE_Z_STEP3_RICH_OBJECTS_ENABLED", "").strip().lower() + rich_enabled_flag = rich_flag in {"1", "true", "yes"} + _assets_total = ( + len(stage0_normalized_assets.get("popups") or []) + + len(stage0_normalized_assets.get("images") or []) + + len(stage0_normalized_assets.get("tables") or []) + ) + rich_disabled_reason: Optional[str] = None + if not rich_enabled_flag: + rich_disabled_reason = "FLAG_OFF" + elif _assets_total == 0: + rich_disabled_reason = "NO_NORMALIZED_ASSETS" + + rich_objects: list = [] + rich_skips: list = [] + if rich_disabled_reason is None: + mdx_num_match = re.match(r"(\d+)", mdx_path.stem) + rich_mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00" + rich_objects, rich_skips = extract_rich_content_objects( + stage0_normalized_assets, mdx_id=rich_mdx_id, + ) + + # Count/list invariant check (IMP-02 ↔ IMP-03 chain) — soft warning, no fail. 
+ invariant_warnings: list[dict] = [] + _adapter_counts = (stage0_adapter_diagnostics or {}).get("adapter_counts") or {} + if _adapter_counts: + for key in ("popups", "images", "tables"): + expected = _adapter_counts.get(key) + actual = len(stage0_normalized_assets.get(key) or []) + if expected is not None and expected != actual: + invariant_warnings.append({ + "field": key, + "adapter_counts": expected, + "stage0_normalized_assets_len": actual, + }) + _write_step_artifact( run_dir, 3, "content_objects", data={ @@ -1902,12 +1968,25 @@ def run_phase_z2_mvp1( } for dz in debug_zones ], + # IMP-03 — slide-level rich trace (additive, trace-only). + "rich_content_objects": [asdict(o) for o in rich_objects], + "rich_content_objects_enabled": rich_disabled_reason is None, + "rich_content_objects_scope": "slide", + "rich_content_objects_source": "stage0_normalized_assets", + "rich_content_objects_disabled_reason": rich_disabled_reason, + "rich_content_objects_skips": rich_skips, + "rich_content_objects_invariant_warnings": invariant_warnings, }, step_status="trace-only", pipeline_path_connected=False, inputs=["step02_normalized.json"], outputs=["step03_content_objects.json"], - note="현재는 trace 로 기록되지만 render payload 를 직접 만들지는 않음. mapper.py 가 별도로 MDX 직접 파싱.", + note=( + "현재는 trace 로 기록되지만 render payload 를 직접 만들지는 않음. " + "mapper.py 가 별도로 MDX 직접 파싱. " + "IMP-03 rich_content_objects = slide-level popup/image/table trace " + "(PHASE_Z_STEP3_RICH_OBJECTS_ENABLED canary, default OFF)." + ), ) # ─── Step 4: Section Internal Composition (B2, trace-only) ───