feat(step2): chained adapter for Stage 0 normalize (IMP-02 #2)
- Add _stage0_chained_adapter() helper near parse_mdx() chaining
mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
+ section_parser.extract_conclusion_text → reconstructed MdxSection list
- PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches
PHASE_Z_B4_SOURCE_SHAPE_ENABLED / PHASE_Z_B4_GATEKEEPER pattern)
- Section ID reconstruction priority: raw_heading (pre-scan raw MDX
`## N. Title` heading → title→num map lookup) → raw_heading_inline
→ order_fallback. All paths logged in id_reconstruction_log
- 5 hard fallback enums: ADAPTER_EXCEPTION / NO_USABLE_SECTIONS /
MISSING_INVALID_IDS / DUPLICATE_IDS / NON_POSITIVE_SECTION_NUM
- Additive step02_normalized.json field stage0_adapter_diagnostics
(enabled / used / fallback_reason / id_reconstruction_log /
adapter_counts / diff_vs_legacy / legacy_counts)
- Preserve Step 2 existing 7 fields (slide_title / slide_footer /
sections_count / sections / orphans / details) — additive only
- Defer mdx_normalizer / section_parser imports so legacy default-OFF
path does not depend on those modules
- V4 / align_sections_to_v4_granularity / composition / AI/Kei /
frame selection / status semantics unchanged
env OFF: legacy path PASS on MDX 03, no regression
env=1 : adapter path activates, IDs 03-1/03-2, raw_heading reuse
triggered, downstream composition_planner abort surfaces
as canary finding (out of scope for IMP-02)
Refs Gitea #2 (IMP-02 A-1 Stage 0 normalize chained adapter)
This commit is contained in:
@@ -200,6 +200,151 @@ def parse_mdx(mdx_path: Path) -> tuple[str, list[MdxSection], Optional[str]]:
|
||||
return slide_title, sections, footer_text
|
||||
|
||||
|
||||
# IMP-02 (Phase Z Step 2) — Stage 0 normalize chained adapter.
# Scope-lock: 7 conditions (Gitea #2):
#   1. inline helper near parse_mdx()
#   2. PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches PHASE_Z_B4_*)
#   3. env=1 sample verification required (in review loop)
#   4. fallback_reason: str | None, flat — 5 hard cases
#   5. verify normalize_mdx_content(raw_mdx)["sections"] is a list
#   6. preserve existing Step 2 fields; stage0_adapter_diagnostics is additive only
#   7. out of scope: V4 / align / composition / AI/Kei / frame selection / status semantics
|
||||
# Closed set of values that _stage0_chained_adapter() writes to
# diagnostics["fallback_reason"] when it returns the legacy parse_mdx() output
# instead of the adapter result. Frozen so the enum cannot be mutated at runtime.
_STAGE0_FALLBACK_REASONS = frozenset({
    "ADAPTER_EXCEPTION",
    "NO_USABLE_SECTIONS",
    "MISSING_INVALID_IDS",
    "DUPLICATE_IDS",
    "NON_POSITIVE_SECTION_NUM",
})
|
||||
|
||||
|
||||
def _stage0_reconstruct_section_num(
    major_title: str,
    order_index: int,
    raw_heading_map: dict[str, int],
) -> tuple[int, str, str]:
    """Resolve the section number for one major-section title.

    Priority (first hit wins):
      1. ``raw_heading``        — title found in the pre-scanned raw heading map.
      2. ``raw_heading_inline`` — title itself still starts with ``N.``.
      3. ``order_fallback``     — 1-based position among the major sections.

    Returns ``(section_num, clean_title, reuse_source)``. No validation is
    done here; the caller rejects non-positive or duplicate numbers.
    """
    if major_title in raw_heading_map:
        return raw_heading_map[major_title], major_title, "raw_heading"

    inline_match = re.match(r"^(\d+)\.\s*(.+)$", major_title)
    if inline_match:
        return (
            int(inline_match.group(1)),
            inline_match.group(2).strip(),
            "raw_heading_inline",
        )

    return order_index, major_title, "order_fallback"


def _stage0_chained_adapter(
    mdx_path: Path,
    legacy_slide_title: str,
    legacy_sections: list[MdxSection],
    legacy_footer: Optional[str],
) -> tuple[str, list[MdxSection], Optional[str], dict]:
    """IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.

    Chain: ``mdx_normalizer.normalize_mdx_content`` →
    ``section_parser.extract_major_sections`` +
    ``section_parser.extract_conclusion_text`` → reconstructed ``MdxSection`` list.

    The adapter is a default-OFF canary controlled by the environment variable
    ``PHASE_Z_STAGE0_ADAPTER_ENABLED`` (enabled when its stripped, lower-cased
    value is ``1``, ``true`` or ``yes``).

    Args:
        mdx_path: Path of the MDX file; re-read as UTF-8 when the adapter runs.
            The leading digits of its stem become the section-id prefix.
        legacy_slide_title: Title from the legacy parse, returned on fallback
            and used when the normalizer yields no title.
        legacy_sections: Sections from the legacy parse, returned on fallback.
        legacy_footer: Footer from the legacy parse, returned on fallback.

    Returns:
        ``(slide_title, sections, footer, diagnostics)``. When the flag is OFF,
        or on any contract failure or exception, the three legacy values are
        returned unchanged. ``diagnostics`` always carries the keys ``enabled``,
        ``used``, ``fallback_reason``, ``id_reconstruction_log``,
        ``adapter_counts``, ``legacy_counts`` and ``diff_vs_legacy``; an
        ``exception`` key (``repr`` of the error) is added only for
        ``ADAPTER_EXCEPTION``.

    Never raises for errors inside the adapter chain: every ``Exception`` is
    converted into an ``ADAPTER_EXCEPTION`` fallback.
    """
    diagnostics: dict = {
        "enabled": False,
        "used": False,
        "fallback_reason": None,
        "id_reconstruction_log": [],
        "adapter_counts": None,
        "legacy_counts": {"sections": len(legacy_sections)},
        # Present on every path (None unless the adapter result is used) so
        # consumers of stage0_adapter_diagnostics see a stable schema.
        "diff_vs_legacy": None,
    }

    def _legacy(reason: Optional[str] = None):
        """Return the untouched legacy result, recording *reason* if given."""
        if reason is not None:
            diagnostics["fallback_reason"] = reason
        return legacy_slide_title, legacy_sections, legacy_footer, diagnostics

    raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
    enabled = raw_flag in {"1", "true", "yes"}
    diagnostics["enabled"] = enabled
    if not enabled:
        return _legacy()

    try:
        # Deferred imports — the default-OFF legacy path must not depend on
        # these modules being importable.
        from mdx_normalizer import normalize_mdx_content
        from section_parser import extract_conclusion_text, extract_major_sections

        raw_mdx = mdx_path.read_text(encoding="utf-8")
        normalized = normalize_mdx_content(raw_mdx)
        # Shape contract: a dict whose "sections" entry is a list. A violation
        # is reported under the existing MISSING_INVALID_IDS reason.
        if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list):
            return _legacy("MISSING_INVALID_IDS")

        majors = extract_major_sections(normalized["sections"])
        if not majors:
            return _legacy("NO_USABLE_SECTIONS")

        adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
        adapter_footer = extract_conclusion_text(raw_mdx) or None

        # Section-id prefix: leading digits of the file stem, zero-padded to
        # two characters; "00" when the stem does not start with a digit.
        mdx_num_match = re.match(r"(\d+)", mdx_path.stem)
        mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00"

        # Pre-scan raw MDX `## N. Title` headings → {title: section_num}.
        # Needed so that "raw heading reuse first" is reachable: per the
        # original implementation note, extract_major_sections strips the
        # leading `N.` from its level-2 titles.
        # NOTE: a later heading with the same title overwrites the earlier
        # entry; majors sharing that title then collide on DUPLICATE_IDS.
        raw_heading_map: dict[str, int] = {}
        for heading in re.finditer(r"^##\s+(\d+)\.\s+(.+?)$", raw_mdx, re.MULTILINE):
            raw_heading_map[heading.group(2).strip()] = int(heading.group(1))

        adapter_sections: list[MdxSection] = []
        used_nums: set[int] = set()
        for idx, major in enumerate(majors, start=1):
            mtitle = (major.get("title") or "").strip()
            content = (major.get("content") or "").strip()

            section_num, clean_title, reuse_source = _stage0_reconstruct_section_num(
                mtitle, idx, raw_heading_map,
            )

            # Hard contract checks — any violation abandons the adapter result.
            if section_num <= 0:
                return _legacy("NON_POSITIVE_SECTION_NUM")
            if section_num in used_nums:
                return _legacy("DUPLICATE_IDS")
            used_nums.add(section_num)

            diagnostics["id_reconstruction_log"].append({
                "input_title": mtitle,
                "section_num": section_num,
                "reuse_source": reuse_source,
            })
            adapter_sections.append(MdxSection(
                section_id=f"{mdx_id}-{section_num}",
                section_num=section_num,
                title=f"{section_num}. {clean_title}",
                raw_content=content,
            ))

        diagnostics["adapter_counts"] = {
            "sections": len(adapter_sections),
            "majors": len(majors),
            "normalized_sections": len(normalized["sections"]),
            "popups": len(normalized.get("popups", []) or []),
            "images": len(normalized.get("images", []) or []),
            "tables": len(normalized.get("tables", []) or []),
        }
        diagnostics["diff_vs_legacy"] = {
            "title_match": adapter_title == legacy_slide_title,
            "count_match": len(adapter_sections) == len(legacy_sections),
            "footer_match": adapter_footer == legacy_footer,
        }
        diagnostics["used"] = True
        return adapter_title, adapter_sections, adapter_footer, diagnostics

    except Exception as exc:  # noqa: BLE001 — adapter must never break legacy path
        diagnostics["exception"] = repr(exc)
        return _legacy("ADAPTER_EXCEPTION")
|
||||
|
||||
|
||||
# ─── V4 lookup ──────────────────────────────────────────────────
|
||||
|
||||
def load_v4_result() -> dict:
|
||||
@@ -1329,9 +1474,22 @@ def run_phase_z2_mvp1(
|
||||
)
|
||||
|
||||
# 1. Parse MDX (V4 무관)
|
||||
slide_title, sections, slide_footer = parse_mdx(mdx_path)
|
||||
legacy_slide_title, legacy_sections, legacy_footer = parse_mdx(mdx_path)
|
||||
# IMP-02 — Stage 0 chained adapter dispatch (default OFF canary).
|
||||
# When env PHASE_Z_STAGE0_ADAPTER_ENABLED=1/true/yes the adapter chain
|
||||
# (mdx_normalizer + section_parser) replaces legacy parse_mdx output;
|
||||
# on any contract failure or exception, falls back to legacy with
|
||||
# fallback_reason recorded in stage0_adapter_diagnostics.
|
||||
slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter(
|
||||
mdx_path, legacy_slide_title, legacy_sections, legacy_footer,
|
||||
)
|
||||
_adapter_tag = (
|
||||
"adapter-used" if stage0_adapter_diagnostics["used"]
|
||||
else f"legacy({stage0_adapter_diagnostics['fallback_reason'] or 'disabled'})"
|
||||
)
|
||||
print(f" parsed : title='{slide_title}', sections={len(sections)} "
|
||||
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}")
|
||||
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}, "
|
||||
f"stage0={_adapter_tag}")
|
||||
|
||||
# ─── Step 2: MDX 정규화 ───
|
||||
# orphans / details 필드는 schema lock — 빈 배열이라도 박혀야
|
||||
@@ -1355,6 +1513,9 @@ def run_phase_z2_mvp1(
|
||||
],
|
||||
"orphans": [], # schema lock — 중목차에 안 속한 텍스트 (감지 미구현)
|
||||
"details": [], # schema lock — <details> 팝업 콘텐츠 (감지 미구현)
|
||||
# IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction
|
||||
# trace + count diff. Out of scope: V4 / align / composition.
|
||||
"stage0_adapter_diagnostics": stage0_adapter_diagnostics,
|
||||
},
|
||||
step_status="partial",
|
||||
pipeline_path_connected=True,
|
||||
@@ -1363,7 +1524,8 @@ def run_phase_z2_mvp1(
|
||||
note=(
|
||||
"parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. "
|
||||
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
|
||||
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker."
|
||||
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. "
|
||||
"stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary)."
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user