feat(step2): chained adapter for Stage 0 normalize (IMP-02 #2)

- Add _stage0_chained_adapter() helper near parse_mdx() chaining
  mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
  + section_parser.extract_conclusion_text → reconstructed MdxSection list
- PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches
  PHASE_Z_B4_SOURCE_SHAPE_ENABLED / PHASE_Z_B4_GATEKEEPER pattern)
- Section ID reconstruction priority: raw_heading (pre-scan raw MDX
  `## N. Title` heading → title→num map lookup) → raw_heading_inline
  → order_fallback. All paths logged in id_reconstruction_log
- 5 hard fallback enums: ADAPTER_EXCEPTION / NO_USABLE_SECTIONS /
  MISSING_INVALID_IDS / DUPLICATE_IDS / NON_POSITIVE_SECTION_NUM
- Additive step02_normalized.json field stage0_adapter_diagnostics
  (enabled / used / fallback_reason / id_reconstruction_log /
  adapter_counts / diff_vs_legacy / legacy_counts)
- Preserve Step 2 existing 7 fields (slide_title / slide_footer /
  sections_count / sections / orphans / details) — additive only
- Defer mdx_normalizer / section_parser imports so legacy default-OFF
  path does not depend on those modules
- V4 / align_sections_to_v4_granularity / composition / AI/Kei /
  frame selection / status semantics unchanged

env OFF: legacy path PASS on MDX 03, no regression
env=1 : adapter path activates, IDs 03-1/03-2, raw_heading reuse
        triggered, downstream composition_planner abort surfaces
        as canary finding (out of scope for IMP-02)

Refs Gitea #2 (IMP-02 A-1 Stage 0 normalize chained adapter)
This commit is contained in:
2026-05-13 00:22:57 +09:00
parent 1dc81e0692
commit bac13c09c4

View File

@@ -200,6 +200,151 @@ def parse_mdx(mdx_path: Path) -> tuple[str, list[MdxSection], Optional[str]]:
return slide_title, sections, footer_text
# IMP-02 (Phase Z Step 2) — Stage 0 normalize chained adapter.
# Seven scope-lock conditions (Gitea #2):
#   1. inline helper near parse_mdx()
#   2. PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches PHASE_Z_B4_*)
#   3. env=1 sample verification required (in review loop)
#   4. fallback_reason: str | None flat — 5 hard cases
#   5. verify normalize_mdx_content(raw_mdx)["sections"] is list
#   6. preserve Step 2 existing fields; stage0_adapter_diagnostics additive only
#   7. out of scope: V4 / align / composition / AI/Kei / frame selection / status semantics

# The five hard-fallback reason codes (condition 4 above), listed in the order
# the adapter checks them, with the catch-all exception code last.
_STAGE0_FALLBACK_REASONS = {
    "MISSING_INVALID_IDS",
    "NO_USABLE_SECTIONS",
    "NON_POSITIVE_SECTION_NUM",
    "DUPLICATE_IDS",
    "ADAPTER_EXCEPTION",
}
def _stage0_chained_adapter(
    mdx_path: Path,
    legacy_slide_title: str,
    legacy_sections: list[MdxSection],
    legacy_footer: Optional[str],
) -> tuple[str, list[MdxSection], Optional[str], dict]:
    """IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.

    Chain: ``mdx_normalizer.normalize_mdx_content`` →
    ``section_parser.extract_major_sections`` +
    ``section_parser.extract_conclusion_text`` → reconstructed ``MdxSection`` list.

    The adapter is a default-OFF canary controlled by the
    ``PHASE_Z_STAGE0_ADAPTER_ENABLED`` environment variable (``1`` / ``true`` /
    ``yes``, case-insensitive, surrounding whitespace ignored). When OFF the
    legacy ``parse_mdx`` output is returned untouched. When ON the chain runs;
    any hard contract failure or exception falls back to the legacy output and
    records the cause in ``diagnostics["fallback_reason"]``.

    Section numbers are reconstructed with this priority, and each decision is
    appended to ``diagnostics["id_reconstruction_log"]``:

    1. ``raw_heading`` — the major title matches a raw ``## N. Title`` heading.
    2. ``raw_heading_inline`` — the major title itself starts with ``N.``.
    3. ``order_fallback`` — the 1-based position among the majors.

    Args:
        mdx_path: MDX file to re-read; the leading digits of its stem become
            the zero-padded (width 2) section-id prefix, ``"00"`` if absent.
        legacy_slide_title: Title from the legacy parser (also used when the
            normalizer yields no title).
        legacy_sections: Sections from the legacy parser.
        legacy_footer: Footer from the legacy parser.

    Returns:
        ``(slide_title, sections, footer, diagnostics)``. ``diagnostics`` always
        carries ``enabled``, ``used``, ``fallback_reason``,
        ``id_reconstruction_log``, ``adapter_counts``, ``legacy_counts`` and
        ``diff_vs_legacy`` (the last is ``None`` unless the adapter output was
        used). An ``exception`` key is added only on ``ADAPTER_EXCEPTION``.
    """
    diagnostics: dict = {
        "enabled": False,
        "used": False,
        "fallback_reason": None,
        "id_reconstruction_log": [],
        "adapter_counts": None,
        "legacy_counts": {"sections": len(legacy_sections)},
        # Pre-seeded so the emitted key set is the same on every path; kept
        # last so the success-path key order matches the previous output.
        "diff_vs_legacy": None,
    }

    def _legacy(reason: Optional[str] = None):
        """Return the legacy result, recording *reason* when one is given."""
        if reason is not None:
            diagnostics["fallback_reason"] = reason
        return legacy_slide_title, legacy_sections, legacy_footer, diagnostics

    raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
    enabled = raw_flag in {"1", "true", "yes"}
    diagnostics["enabled"] = enabled
    if not enabled:
        return _legacy()

    try:
        # Defer imports — legacy path must not depend on these modules.
        from mdx_normalizer import normalize_mdx_content
        from section_parser import extract_conclusion_text, extract_major_sections

        raw_mdx = mdx_path.read_text(encoding="utf-8")
        normalized = normalize_mdx_content(raw_mdx)
        # A malformed normalizer result is reported under MISSING_INVALID_IDS;
        # the flat reason set has no dedicated code for a shape failure.
        if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list):
            return _legacy("MISSING_INVALID_IDS")

        majors = extract_major_sections(normalized["sections"])
        if not majors:
            return _legacy("NO_USABLE_SECTIONS")

        adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
        adapter_footer = extract_conclusion_text(raw_mdx) or None

        mdx_num_match = re.match(r"(\d+)", mdx_path.stem)
        mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00"

        # Pre-scan raw MDX `## N. Title` headings → {title: section_num} map.
        # Required to make scope-lock §5 "raw heading reuse first" functionally
        # reachable, since extract_major_sections strips the leading `N.` from
        # its level=2 group titles (Codex implementation review #6 catch).
        # Separators are limited to spaces/tabs: `\s` also matches newlines and
        # would let a bare `## 3.` line adopt the following line as its title.
        # When two headings share a title the later one wins.
        raw_heading_map: dict[str, int] = {
            h.group(2).strip(): int(h.group(1))
            for h in re.finditer(r"^##[ \t]+(\d+)\.[ \t]+(.+?)$", raw_mdx, re.MULTILINE)
        }

        inline_num_re = re.compile(r"^(\d+)\.\s*(.+)$")
        adapter_sections: list[MdxSection] = []
        used_nums: set[int] = set()
        for idx, m in enumerate(majors, start=1):
            mtitle = (m.get("title") or "").strip()
            content = (m.get("content") or "").strip()

            if mtitle in raw_heading_map:
                section_num = raw_heading_map[mtitle]
                clean_title = mtitle
                reuse_source = "raw_heading"
            else:
                inline_match = inline_num_re.match(mtitle)
                if inline_match:
                    section_num = int(inline_match.group(1))
                    clean_title = inline_match.group(2).strip()
                    reuse_source = "raw_heading_inline"
                else:
                    section_num = idx
                    clean_title = mtitle
                    reuse_source = "order_fallback"

            # Hard contract checks. On failure the log keeps the entries of the
            # sections accepted so far, which shows where reconstruction stopped.
            if section_num <= 0:
                return _legacy("NON_POSITIVE_SECTION_NUM")
            if section_num in used_nums:
                return _legacy("DUPLICATE_IDS")
            used_nums.add(section_num)

            diagnostics["id_reconstruction_log"].append({
                "input_title": mtitle,
                "section_num": section_num,
                "reuse_source": reuse_source,
            })
            adapter_sections.append(MdxSection(
                section_id=f"{mdx_id}-{section_num}",
                section_num=section_num,
                title=f"{section_num}. {clean_title}",
                raw_content=content,
            ))

        diagnostics["adapter_counts"] = {
            "sections": len(adapter_sections),
            "majors": len(majors),
            "normalized_sections": len(normalized["sections"]),
            "popups": len(normalized.get("popups", []) or []),
            "images": len(normalized.get("images", []) or []),
            "tables": len(normalized.get("tables", []) or []),
        }
        diagnostics["diff_vs_legacy"] = {
            "title_match": adapter_title == legacy_slide_title,
            "count_match": len(adapter_sections) == len(legacy_sections),
            "footer_match": adapter_footer == legacy_footer,
        }
        diagnostics["used"] = True
        return adapter_title, adapter_sections, adapter_footer, diagnostics
    except Exception as exc:  # noqa: BLE001 — adapter must never break legacy path
        diagnostics["exception"] = repr(exc)
        return _legacy("ADAPTER_EXCEPTION")
# ─── V4 lookup ──────────────────────────────────────────────────
def load_v4_result() -> dict:
@@ -1329,9 +1474,22 @@ def run_phase_z2_mvp1(
)
# 1. Parse MDX (V4 무관)
slide_title, sections, slide_footer = parse_mdx(mdx_path)
legacy_slide_title, legacy_sections, legacy_footer = parse_mdx(mdx_path)
# IMP-02 — Stage 0 chained adapter dispatch (default OFF canary).
# When env PHASE_Z_STAGE0_ADAPTER_ENABLED=1/true/yes the adapter chain
# (mdx_normalizer + section_parser) replaces legacy parse_mdx output;
# on any contract failure or exception, falls back to legacy with
# fallback_reason recorded in stage0_adapter_diagnostics.
slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter(
mdx_path, legacy_slide_title, legacy_sections, legacy_footer,
)
_adapter_tag = (
"adapter-used" if stage0_adapter_diagnostics["used"]
else f"legacy({stage0_adapter_diagnostics['fallback_reason'] or 'disabled'})"
)
print(f" parsed : title='{slide_title}', sections={len(sections)} "
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}")
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}, "
f"stage0={_adapter_tag}")
# ─── Step 2: MDX 정규화 ───
# orphans / details 필드는 schema lock — 빈 배열이라도 박혀야
@@ -1355,6 +1513,9 @@ def run_phase_z2_mvp1(
],
"orphans": [], # schema lock — 중목차에 안 속한 텍스트 (감지 미구현)
"details": [], # schema lock — <details> 팝업 콘텐츠 (감지 미구현)
# IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction
# trace + count diff. Out of scope: V4 / align / composition.
"stage0_adapter_diagnostics": stage0_adapter_diagnostics,
},
step_status="partial",
pipeline_path_connected=True,
@@ -1363,7 +1524,8 @@ def run_phase_z2_mvp1(
note=(
"parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. "
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker."
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. "
"stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary)."
),
)