feat(step2): chained adapter for Stage 0 normalize (IMP-02 #2)
- Add _stage0_chained_adapter() helper near parse_mdx() chaining
mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
+ section_parser.extract_conclusion_text → reconstructed MdxSection list
- PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches
PHASE_Z_B4_SOURCE_SHAPE_ENABLED / PHASE_Z_B4_GATEKEEPER pattern)
- Section ID reconstruction priority: raw_heading (pre-scan raw MDX
`## N. Title` heading → title→num map lookup) → raw_heading_inline
→ order_fallback. All paths logged in id_reconstruction_log
- 5 hard fallback enums: ADAPTER_EXCEPTION / NO_USABLE_SECTIONS /
MISSING_INVALID_IDS / DUPLICATE_IDS / NON_POSITIVE_SECTION_NUM
- Additive step02_normalized.json field stage0_adapter_diagnostics
(enabled / used / fallback_reason / id_reconstruction_log /
adapter_counts / diff_vs_legacy / legacy_counts)
- Preserve Step 2 existing 7 fields (slide_title / slide_footer /
sections_count / sections / orphans / details) — additive only
- Defer mdx_normalizer / section_parser imports so legacy default-OFF
path does not depend on those modules
- V4 / align_sections_to_v4_granularity / composition / AI/Kei /
frame selection / status semantics unchanged
env OFF: legacy path PASS on MDX 03, no regression
env=1 : adapter path activates, IDs 03-1/03-2, raw_heading reuse
triggered, downstream composition_planner abort surfaces
as canary finding (out of scope for IMP-02)
Refs Gitea #2 (IMP-02 A-1 Stage 0 normalize chained adapter)
This commit is contained in:
@@ -200,6 +200,151 @@ def parse_mdx(mdx_path: Path) -> tuple[str, list[MdxSection], Optional[str]]:
|
|||||||
return slide_title, sections, footer_text
|
return slide_title, sections, footer_text
|
||||||
|
|
||||||
|
|
||||||
|
# IMP-02 (Phase Z Step 2) — Stage 0 normalize chained adapter.
|
||||||
|
# scope-lock: 7 conditions (Gitea #2):
|
||||||
|
# 1. inline helper near parse_mdx()
|
||||||
|
# 2. PHASE_Z_STAGE0_ADAPTER_ENABLED env flag, default OFF (canary, matches PHASE_Z_B4_*)
|
||||||
|
# 3. env=1 sample verification required (in review loop)
|
||||||
|
# 4. fallback_reason: str | None flat — 5 hard cases
|
||||||
|
# 5. verify normalize_mdx_content(raw_mdx)["sections"] is list
|
||||||
|
# 6. preserve Step 2 existing fields; stage0_adapter_diagnostics additive only
|
||||||
|
# 7. out of scope: V4 / align / composition / AI/Kei / frame selection / status semantics
|
||||||
|
# Closed vocabulary of `fallback_reason` values that _stage0_chained_adapter()
# records in its diagnostics when it hands back the legacy parse_mdx() output.
_STAGE0_FALLBACK_REASONS = {
    # Any exception raised inside the adapter chain (including deferred imports).
    "ADAPTER_EXCEPTION",
    # A reconstructed section number was already used by an earlier section.
    "DUPLICATE_IDS",
    # normalize_mdx_content() did not return a dict whose "sections" is a list.
    "MISSING_INVALID_IDS",
    # A reconstructed section number was zero or negative.
    "NON_POSITIVE_SECTION_NUM",
    # extract_major_sections() produced no major sections.
    "NO_USABLE_SECTIONS",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _stage0_chained_adapter(
    mdx_path: Path,
    legacy_slide_title: str,
    legacy_sections: list[MdxSection],
    legacy_footer: Optional[str],
) -> tuple[str, list[MdxSection], Optional[str], dict]:
    """IMP-02 — chained adapter for Stage 0 normalize → Phase Z Step 2 input.

    Chain: mdx_normalizer.normalize_mdx_content + section_parser.extract_major_sections
    + section_parser.extract_conclusion_text → reconstructed MdxSection list.

    The adapter is a canary controlled by the PHASE_Z_STAGE0_ADAPTER_ENABLED
    environment variable. It is enabled only when the stripped, lower-cased
    value is one of ``1`` / ``true`` / ``yes``; anything else (including unset)
    leaves it OFF and the legacy values are returned untouched.

    When ON, the raw MDX is re-read from ``mdx_path`` and run through the
    chain. Each major section gets a section number, chosen in this order:

    1. ``raw_heading`` — the section title matches a ``## N. Title`` heading
       found in the raw MDX; ``N`` is reused.
    2. ``raw_heading_inline`` — the title itself still starts with ``N.``.
    3. ``order_fallback`` — the 1-based position among the major sections.

    Any contract failure or exception falls back to the legacy values and
    records the cause in ``diagnostics["fallback_reason"]``.

    Args:
        mdx_path: Path of the MDX file; its stem's leading digits (zero-padded
            to two) become the section-id prefix, ``"00"`` when there are none.
        legacy_slide_title: Title produced by the legacy parser.
        legacy_sections: Sections produced by the legacy parser.
        legacy_footer: Footer text produced by the legacy parser, if any.

    Returns:
        ``(slide_title, sections, footer, diagnostics)``. ``diagnostics``
        always contains ``enabled``, ``used``, ``fallback_reason``,
        ``id_reconstruction_log``, ``adapter_counts``, ``legacy_counts`` and
        ``diff_vs_legacy``; ``exception`` is added only when the adapter chain
        raised.
    """
    diagnostics: dict = {
        "enabled": False,
        "used": False,
        "fallback_reason": None,
        "id_reconstruction_log": [],
        "adapter_counts": None,
        "legacy_counts": {"sections": len(legacy_sections)},
        # Pre-seeded (like adapter_counts) so the serialized diagnostics have
        # the same keys on the disabled, fallback and success paths.
        "diff_vs_legacy": None,
    }

    def _fallback(
        reason: Optional[str],
    ) -> tuple[str, list[MdxSection], Optional[str], dict]:
        """Return the legacy values, recording *reason* (None = disabled)."""
        diagnostics["fallback_reason"] = reason
        return legacy_slide_title, legacy_sections, legacy_footer, diagnostics

    raw_flag = os.environ.get("PHASE_Z_STAGE0_ADAPTER_ENABLED", "").strip().lower()
    enabled = raw_flag in {"1", "true", "yes"}
    diagnostics["enabled"] = enabled
    if not enabled:
        return _fallback(None)

    try:
        # Defer imports — legacy path must not depend on these modules.
        from mdx_normalizer import normalize_mdx_content
        from section_parser import extract_conclusion_text, extract_major_sections

        raw_mdx = mdx_path.read_text(encoding="utf-8")
        normalized = normalize_mdx_content(raw_mdx)
        if not isinstance(normalized, dict) or not isinstance(normalized.get("sections"), list):
            return _fallback("MISSING_INVALID_IDS")

        majors = extract_major_sections(normalized["sections"])
        if not majors:
            return _fallback("NO_USABLE_SECTIONS")

        adapter_title = (normalized.get("title") or "").strip() or legacy_slide_title
        conclusion = extract_conclusion_text(raw_mdx)
        adapter_footer = conclusion if conclusion else None

        mdx_num_match = re.match(r"(\d+)", mdx_path.stem)
        mdx_id = mdx_num_match.group(1).zfill(2) if mdx_num_match else "00"

        # Pre-scan raw MDX `## N. Title` headings → {title: section_num} map.
        # Required to make scope-lock §5 "raw heading reuse first" functionally
        # reachable, since extract_major_sections strips the leading `N.` from
        # its level=2 group titles (Codex implementation review #6 catch).
        raw_heading_map: dict[str, int] = {}
        for h in re.finditer(r"^##\s+(\d+)\.\s+(.+?)$", raw_mdx, re.MULTILINE):
            raw_heading_map[h.group(2).strip()] = int(h.group(1))

        adapter_sections: list[MdxSection] = []
        used_nums: set[int] = set()
        for idx, major in enumerate(majors, start=1):
            mtitle = (major.get("title") or "").strip()
            content = (major.get("content") or "").strip()

            if mtitle in raw_heading_map:
                section_num = raw_heading_map[mtitle]
                clean_title = mtitle
                reuse_source = "raw_heading"
            else:
                inline_match = re.match(r"^(\d+)\.\s*(.+)$", mtitle)
                if inline_match:
                    section_num = int(inline_match.group(1))
                    clean_title = inline_match.group(2).strip()
                    reuse_source = "raw_heading_inline"
                else:
                    section_num = idx
                    clean_title = mtitle
                    reuse_source = "order_fallback"

            # Log before validating so that the section which triggers a
            # NON_POSITIVE_SECTION_NUM / DUPLICATE_IDS fallback is visible as
            # the last entry of the trace.
            diagnostics["id_reconstruction_log"].append({
                "input_title": mtitle,
                "section_num": section_num,
                "reuse_source": reuse_source,
            })

            if section_num <= 0:
                return _fallback("NON_POSITIVE_SECTION_NUM")
            if section_num in used_nums:
                return _fallback("DUPLICATE_IDS")
            used_nums.add(section_num)

            adapter_sections.append(MdxSection(
                section_id=f"{mdx_id}-{section_num}",
                section_num=section_num,
                title=f"{section_num}. {clean_title}",
                raw_content=content,
            ))

        diagnostics["adapter_counts"] = {
            "sections": len(adapter_sections),
            "majors": len(majors),
            "normalized_sections": len(normalized["sections"]),
            "popups": len(normalized.get("popups", []) or []),
            "images": len(normalized.get("images", []) or []),
            "tables": len(normalized.get("tables", []) or []),
        }
        diagnostics["diff_vs_legacy"] = {
            "title_match": adapter_title == legacy_slide_title,
            "count_match": len(adapter_sections) == len(legacy_sections),
            "footer_match": adapter_footer == legacy_footer,
        }
        diagnostics["used"] = True
        return adapter_title, adapter_sections, adapter_footer, diagnostics

    except Exception as exc:  # noqa: BLE001 — adapter must never break legacy path
        diagnostics["exception"] = repr(exc)
        return _fallback("ADAPTER_EXCEPTION")
|
||||||
|
|
||||||
|
|
||||||
# ─── V4 lookup ──────────────────────────────────────────────────
|
# ─── V4 lookup ──────────────────────────────────────────────────
|
||||||
|
|
||||||
def load_v4_result() -> dict:
|
def load_v4_result() -> dict:
|
||||||
@@ -1329,9 +1474,22 @@ def run_phase_z2_mvp1(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# 1. Parse MDX (V4 무관)
|
# 1. Parse MDX (V4 무관)
|
||||||
slide_title, sections, slide_footer = parse_mdx(mdx_path)
|
legacy_slide_title, legacy_sections, legacy_footer = parse_mdx(mdx_path)
|
||||||
|
# IMP-02 — Stage 0 chained adapter dispatch (default OFF canary).
|
||||||
|
# When env PHASE_Z_STAGE0_ADAPTER_ENABLED=1/true/yes the adapter chain
|
||||||
|
# (mdx_normalizer + section_parser) replaces legacy parse_mdx output;
|
||||||
|
# on any contract failure or exception, falls back to legacy with
|
||||||
|
# fallback_reason recorded in stage0_adapter_diagnostics.
|
||||||
|
slide_title, sections, slide_footer, stage0_adapter_diagnostics = _stage0_chained_adapter(
|
||||||
|
mdx_path, legacy_slide_title, legacy_sections, legacy_footer,
|
||||||
|
)
|
||||||
|
_adapter_tag = (
|
||||||
|
"adapter-used" if stage0_adapter_diagnostics["used"]
|
||||||
|
else f"legacy({stage0_adapter_diagnostics['fallback_reason'] or 'disabled'})"
|
||||||
|
)
|
||||||
print(f" parsed : title='{slide_title}', sections={len(sections)} "
|
print(f" parsed : title='{slide_title}', sections={len(sections)} "
|
||||||
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}")
|
f"({[s.section_id for s in sections]}), footer={'yes' if slide_footer else 'no'}, "
|
||||||
|
f"stage0={_adapter_tag}")
|
||||||
|
|
||||||
# ─── Step 2: MDX 정규화 ───
|
# ─── Step 2: MDX 정규화 ───
|
||||||
# orphans / details 필드는 schema lock — 빈 배열이라도 박혀야
|
# orphans / details 필드는 schema lock — 빈 배열이라도 박혀야
|
||||||
@@ -1355,6 +1513,9 @@ def run_phase_z2_mvp1(
|
|||||||
],
|
],
|
||||||
"orphans": [], # schema lock — 중목차에 안 속한 텍스트 (감지 미구현)
|
"orphans": [], # schema lock — 중목차에 안 속한 텍스트 (감지 미구현)
|
||||||
"details": [], # schema lock — <details> 팝업 콘텐츠 (감지 미구현)
|
"details": [], # schema lock — <details> 팝업 콘텐츠 (감지 미구현)
|
||||||
|
# IMP-02 — additive only. enabled/used/fallback_reason + id reconstruction
|
||||||
|
# trace + count diff. Out of scope: V4 / align / composition.
|
||||||
|
"stage0_adapter_diagnostics": stage0_adapter_diagnostics,
|
||||||
},
|
},
|
||||||
step_status="partial",
|
step_status="partial",
|
||||||
pipeline_path_connected=True,
|
pipeline_path_connected=True,
|
||||||
@@ -1363,7 +1524,8 @@ def run_phase_z2_mvp1(
|
|||||||
note=(
|
note=(
|
||||||
"parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. "
|
"parse_mdx 결과: title / sections / footer 분리 + raw_content 보존. "
|
||||||
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
|
"heading tree 미생성, orphan / details 감지 미완 (Step 2 ⚠ partial — 별 axis). "
|
||||||
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker."
|
"orphans / details 필드는 schema lock — 빈 배열이라도 'detection 미수행' marker. "
|
||||||
|
"stage0_adapter_diagnostics = IMP-02 chained adapter trace (default OFF canary)."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user