# IMP-02 (Phase Z Step 2) — Stage 0 normalize chained adapter.
# Scope-lock: 7 conditions (Gitea #2):
#   1. inline helper near parse_mdx()
#   2. PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches PHASE_Z_B4_*)
#   3. env=1 sample verification required (in review loop)
#   4. fallback_reason: str | None flat — 5 hard cases
#   5. verify normalize_mdx_content(raw_mdx)["sections"] is list
#   6. preserve Step 2 existing fields; stage0_adapter_diagnostics additive only
#   7. out of scope: V4 / align / composition / AI/Kei / frame selection / status semantics
_STAGE0_FALLBACK_REASONS = frozenset({
    "ADAPTER_EXCEPTION",
    "NO_USABLE_SECTIONS",
    "MISSING_INVALID_IDS",
    "DUPLICATE_IDS",
    "NON_POSITIVE_SECTION_NUM",
})

# Flag values (after strip + lower) that switch the adapter on.
_STAGE0_ENABLED_VALUES = frozenset({"1", "true", "yes"})

# Raw `## N. Title` heading. `[^\S\r\n]` means "whitespace that is not a line
# break": plain `\s` also matches "\n", which let an empty heading such as
# `## 5.` capture the *following* line as its title.
_STAGE0_RAW_HEADING_RE = re.compile(
    r"^##[^\S\r\n]+(\d+)\.[^\S\r\n]+(.+?)$", re.MULTILINE
)

# Leading `N.` that is still present inside an already-extracted title.
_STAGE0_INLINE_NUM_RE = re.compile(r"^(\d+)\.\s*(.+)$")


def _stage0_scan_raw_headings(raw_mdx: str) -> dict[str, int]:
    """Map each raw `## N. Title` heading title (stripped) to its number N."""
    # A title that occurs more than once keeps the number of its last
    # occurrence; the caller's duplicate-id check then rejects the slide.
    return {
        found.group(2).strip(): int(found.group(1))
        for found in _STAGE0_RAW_HEADING_RE.finditer(raw_mdx)
    }


def _stage0_resolve_section_num(
    mtitle: str,
    position: int,
    raw_heading_map: dict[str, int],
) -> tuple[int, str, str]:
    """Return (section_num, clean_title, reuse_source) for one major section."""
    if mtitle in raw_heading_map:
        return raw_heading_map[mtitle], mtitle, "raw_heading"
    inline = _STAGE0_INLINE_NUM_RE.match(mtitle)
    if inline:
        return int(inline.group(1)), inline.group(2).strip(), "raw_heading_inline"
    # No number recoverable from the text: use the 1-based position instead.
    return position, mtitle, "order_fallback"


def _stage0_chained_adapter(
    mdx_path: Path,
    legacy_slide_title: str,
    legacy_sections: list[MdxSection],
    legacy_footer: Optional[str],
) -> tuple[str, list[MdxSection], Optional[str], dict]:
    """IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.

    Chain: mdx_normalizer.normalize_mdx_content
    + section_parser.extract_major_sections
    + section_parser.extract_conclusion_text → rebuilt MdxSection list.

    The adapter is a canary and is OFF unless the environment variable
    PHASE_Z_STAGE0_ADAPTER_ENABLED is `1`, `true` or `yes` (case-insensitive,
    surrounding whitespace ignored).

    Args:
        mdx_path: MDX file to re-read; the leading digits of its stem become
            the zero-padded prefix of every section_id ("00" when absent).
        legacy_slide_title: Title produced by the legacy parse_mdx().
        legacy_sections: Sections produced by the legacy parse_mdx().
        legacy_footer: Footer produced by the legacy parse_mdx().

    Returns:
        (slide_title, sections, footer, diagnostics). The first three are the
        legacy arguments, returned unchanged, whenever the adapter is disabled
        or falls back. diagnostics always carries `enabled`, `used`,
        `fallback_reason`, `id_reconstruction_log`, `adapter_counts` and
        `legacy_counts`; `diff_vs_legacy` is added on success and `exception`
        on ADAPTER_EXCEPTION.

    Never raises: any exception inside the chain is recorded as
    ADAPTER_EXCEPTION and the legacy values are returned.
    """
    diagnostics: dict = {
        "enabled": False,
        "used": False,
        "fallback_reason": None,
        "id_reconstruction_log": [],
        "adapter_counts": None,
        "legacy_counts": {"sections": len(legacy_sections)},
    }

    def fall_back(
        reason: Optional[str] = None,
    ) -> tuple[str, list[MdxSection], Optional[str], dict]:
        # Single exit for every "keep the legacy result" path.
        if reason is not None:
            diagnostics["fallback_reason"] = reason
        return legacy_slide_title, legacy_sections, legacy_footer, diagnostics

    raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
    enabled = raw_flag in _STAGE0_ENABLED_VALUES
    diagnostics["enabled"] = enabled
    if not enabled:
        return fall_back()

    try:
        # Defer imports — legacy path must not depend on these modules.
        from mdx_normalizer import normalize_mdx_content
        from section_parser import extract_conclusion_text, extract_major_sections

        raw_mdx = mdx_path.read_text(encoding="utf-8")
        normalized = normalize_mdx_content(raw_mdx)
        # Scope-lock 5: the normalizer must hand back a dict whose "sections"
        # entry is a list; anything else is reported as MISSING_INVALID_IDS.
        if not isinstance(normalized, dict) or not isinstance(
            normalized.get("sections"), list
        ):
            return fall_back("MISSING_INVALID_IDS")

        majors = extract_major_sections(normalized["sections"])
        if not majors:
            return fall_back("NO_USABLE_SECTIONS")

        adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
        adapter_footer = extract_conclusion_text(raw_mdx) or None

        mdx_num_match = re.match(r"(\d+)", mdx_path.stem)
        mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00"

        # Pre-scan raw MDX `## N. Title` headings → {title: section_num} map.
        # Required to make scope-lock §5 "raw heading reuse first" functionally
        # reachable, since extract_major_sections strips the leading `N.` from
        # its level=2 group titles (Codex implementation review #6 catch).
        raw_heading_map = _stage0_scan_raw_headings(raw_mdx)

        adapter_sections: list[MdxSection] = []
        used_nums: set[int] = set()
        for idx, major in enumerate(majors, start=1):
            mtitle = (major.get("title") or "").strip()
            content = (major.get("content") or "").strip()

            section_num, clean_title, reuse_source = _stage0_resolve_section_num(
                mtitle, idx, raw_heading_map
            )

            # Hard contract: ids must be positive and unique within the slide.
            if section_num <= 0:
                return fall_back("NON_POSITIVE_SECTION_NUM")
            if section_num in used_nums:
                return fall_back("DUPLICATE_IDS")
            used_nums.add(section_num)

            diagnostics["id_reconstruction_log"].append({
                "input_title": mtitle,
                "section_num": section_num,
                "reuse_source": reuse_source,
            })

            adapter_sections.append(MdxSection(
                section_id=f"{mdx_id}-{section_num}",
                section_num=section_num,
                title=f"{section_num}. {clean_title}",
                raw_content=content,
            ))

        diagnostics["adapter_counts"] = {
            "sections": len(adapter_sections),
            "majors": len(majors),
            "normalized_sections": len(normalized["sections"]),
            "popups": len(normalized.get("popups", []) or []),
            "images": len(normalized.get("images", []) or []),
            "tables": len(normalized.get("tables", []) or []),
        }
        diagnostics["diff_vs_legacy"] = {
            "title_match": adapter_title == legacy_slide_title,
            "count_match": len(adapter_sections) == len(legacy_sections),
            "footer_match": adapter_footer == legacy_footer,
        }
        diagnostics["used"] = True
        return adapter_title, adapter_sections, adapter_footer, diagnostics

    except Exception as exc:  # noqa: BLE001 — adapter must never break legacy path
        diagnostics["exception"] = repr(exc)
        return fall_back("ADAPTER_EXCEPTION")
+ slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter( + mdx_path, legacy_slide_title, legacy_sections, legacy_footer, + ) + _adapter_tag = ( + "adapter-used" if stage0_adapter_diagnostics["used"] + else f"legacy({stage0_adapter_diagnostics['fallback_reason'] or 'disabled'})" + ) print(f" parsed : title='{slide_title}', sections={len(sections)} " - f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}") + f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}, " + f"stage0={_adapter_tag}") # ─── Step 2: MDX 정규화 ─── # orphans / details 필드는 schema lock — 빈 배열이라도 박혀야 @@ -1355,6 +1513,9 @@ def run_phase_z2_mvp1( ], "orphans": [], # schema lock — 중목차에 안 속한 텍스트 (감지 미구현) "details": [], # schema lock —
팝업 콘텐츠 (감지 미구현) + # IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction + # trace + count diff. Out of scope: V4 / align / composition. + "stage0_adapter_diagnostics": stage0_adapter_diagnostics, }, step_status="partial", pipeline_path_connected=True, @@ -1363,7 +1524,8 @@ def run_phase_z2_mvp1( note=( "parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. " "heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). " - "orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker." + "orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. " + "stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary)." ), )