From 842a46144c14738ff6aa4d56ecb8ac156a987538 Mon Sep 17 00:00:00 2001 From: kyeongmin Date: Sat, 23 May 2026 20:40:54 +0900 Subject: [PATCH] feat(#87): IMP-87 u1~u5 empty_shell honesty gate + BLOCKED exit EMPTY_SHELL_NO_CONTENT overall enum + 3-marker detection (frame_template_id="__empty__" OR label="empty_shell" OR merge_type="empty_shell") routes empty-placeholder-only slides to BLOCKED CLI exit 1 + red final_status.html, blocking fake PASS reports (feedback_artifact_status_naming). Coverage accounting split: legacy covered_section_ids preserved + new content_rendered_section_ids / empty_shell_section_ids. mdx05 Case B (zero V4 evidence) honestly classified instead of synthesizing fabricated rank-1 reject frames. IMP-30 u6/u7 stale empty-shell PASS assertions inverted (29 tests). IMP-85 smoke parametrize: mdx05 removed from exit-0 list + dedicated BLOCKED exit test added (4 tests). No production behavior change for chain_exhausted Case A; no AI route activation; no mdx-id hardcoding. 53 targeted + 76 adjacent Phase Z tests PASS. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/phase_z2_pipeline.py | 166 ++++++- tests/test_phase_z2_imp30_first_render.py | 26 +- ...test_phase_z2_imp87_empty_shell_honesty.py | 460 ++++++++++++++++++ tests/test_pipeline_smoke_imp85.py | 97 +++- 4 files changed, 731 insertions(+), 18 deletions(-) create mode 100644 tests/test_phase_z2_imp87_empty_shell_honesty.py diff --git a/src/phase_z2_pipeline.py b/src/phase_z2_pipeline.py index 20460ab..1ba04cf 100644 --- a/src/phase_z2_pipeline.py +++ b/src/phase_z2_pipeline.py @@ -2936,6 +2936,71 @@ def write_overflow_error(run_dir: Path, overflow: dict) -> Path: # ─── Debug.json (single slide + zones[]) ─────────────────────── + +def _is_empty_shell_unit(u: CompositionUnit) -> bool: + """IMP-87 u1 — true when a CompositionUnit is the IMP-30 u4 empty-shell + placeholder (frame_template_id="__empty__" / label="empty_shell" / + merge_type="empty_shell"). + + Used by compute_slide_status to redefine `full_mdx_coverage` over + rendered content units only: an empty-shell unit attaches the aligned + section_ids to a ``__empty__`` frame for layout purposes, but the slide + surface carries no MDX content for those sections. Counting it as + coverage would violate feedback_artifact_status_naming (overall / + coverage must reflect actual content state, not pipeline completion). + + The three markers are checked as independent OR-branches so a + CompositionUnit synthesised by any of the IMP-30 u4 entry points + (phase A / phase B / e2e) is classified consistently. + """ + if getattr(u, "frame_template_id", None) == "__empty__": + return True + if getattr(u, "label", None) == "empty_shell": + return True + if getattr(u, "merge_type", None) == "empty_shell": + return True + return False + + +def _final_status_html_class(overall: str) -> str: + """IMP-87 u3 — map ``overall`` enum string → CSS class for the step20 + final_status.html dashboard. + + EMPTY_SHELL_NO_CONTENT MUST resolve to ``"fail"`` (red) so the Case B + honesty defect (Stage 1 mdx05) surfaces in the same colour band as visual + failures and regressions, not the legacy ``"partial"`` amber band. The + explicit check runs BEFORE the legacy substring-based mapping because + the literal ``"EMPTY_SHELL_NO_CONTENT"`` contains neither ``"PASS"`` nor + ``"FAIL"`` / ``"REGRESSION"`` and would otherwise default to + ``"partial"`` (Stage 2 axis A5 lock). + + All other enums preserve pre-IMP-87 substring semantics so the legacy + PASS / RENDERED_WITH_VISUAL_REGRESSION / PARTIAL_COVERAGE / etc. paths + keep the colour they had before u3. + """ + if overall == "EMPTY_SHELL_NO_CONTENT": + return "fail" + if "PASS" in overall: + return "pass" + if "FAIL" in overall or "REGRESSION" in overall: + return "fail" + return "partial" + + +def _is_blocked_overall(overall: str) -> bool: + """IMP-87 u3 — true iff ``overall`` warrants a BLOCKED CLI exit + (returncode 1) independent of the visual_check / full_mdx_coverage axes. + + Currently the single blocked enum is EMPTY_SHELL_NO_CONTENT (Stage 1 + mdx05 Case B). The CLI consults this helper BEFORE the legacy + visual_fail / partial_coverage branches so a content-empty placeholder + slide that happens to pass Selenium overflow checks (no content → no + overflow) cannot silently return a content-empty artifact without an + exit signal (Stage 2 axis A4 lock). + """ + return overall == "EMPTY_SHELL_NO_CONTENT" + + def compute_slide_status(sections: list[MdxSection], units: list[CompositionUnit], comp_debug: dict, @@ -2955,6 +3020,13 @@ def compute_slide_status(sections: list[MdxSection], u4 empty-shell — needs user/AI adaptation 신호) overall enum : + EMPTY_SHELL_NO_CONTENT — IMP-87 u2 : every selected unit is an + IMP-30 u4 empty-shell placeholder + (no content-rendered units). Takes + precedence over the 4-way ladder below + because such a slide can technically + pass Selenium overflow checks but + carries no MDX content. PASS — visual OK + full coverage + adapter_needed=0 RENDERED_WITH_VISUAL_REGRESSION — full coverage 이지만 visual fail PARTIAL_COVERAGE — 일부 section 필터됨, 렌더된 부분만 visual OK @@ -2964,13 +3036,43 @@ def compute_slide_status(sections: list[MdxSection], Stage 1 Q3 + Codex #10 D4 lock.) """ aligned_ids = [s.section_id for s in sections] - covered = set() + # IMP-87 u1 — split coverage into legacy display vs honesty axis. + # ``covered`` (legacy, preserved for downstream display / IMP-05 / IMP-06 + # readers) still tracks every section attached to any selected unit. + # ``content_covered`` is the new honesty axis: only non-empty-shell units + # count as "rendered with content". Sections attached solely to an + # empty-shell placeholder (IMP-30 u4 frame_template_id="__empty__") + # are routed into ``filtered_section_ids`` so an EMPTY-SHELL-only slide + # cannot inherit full_mdx_coverage=True (Case B honesty defect lock, + # Stage 1 anchor c53722ad). + covered: set = set() + content_covered: set = set() for u in units: covered.update(u.source_section_ids) - filtered_ids = sorted(set(aligned_ids) - covered) + if not _is_empty_shell_unit(u): + content_covered.update(u.source_section_ids) + filtered_ids = sorted(set(aligned_ids) - content_covered) full_coverage = len(filtered_ids) == 0 visual_passed = bool(overflow.get("passed", False)) + # IMP-87 u2 — Additive empty/content accounting (used by the overall enum + # precedence block below and surfaced on the return dict for downstream + # introspection). ``content_rendered_section_ids`` mirrors the new honesty + # axis (content_covered) at the surface level so consumers can ask "how + # much real MDX content actually rendered" without re-running the helper. + # ``empty_shell_section_ids`` exposes the placeholder counterpart for the + # same reason. Both pairs are purely additive — no existing field is + # removed or repurposed (legacy ``covered_section_ids`` display semantics + # locked by u1). + empty_shell_units_list = [u for u in units if _is_empty_shell_unit(u)] + content_units_list = [u for u in units if not _is_empty_shell_unit(u)] + empty_shell_section_ids = sorted({ + sid + for u in empty_shell_units_list + for sid in (u.source_section_ids or []) + }) + content_rendered_section_ids = sorted(content_covered) + adapter_needed_units = list(adapter_needed_units or []) content_truncated = [] fallback_selections = [] @@ -3057,7 +3159,24 @@ def compute_slide_status(sections: list[MdxSection], "position": source_position, }) - if full_coverage and visual_passed: + # IMP-87 u2 — EMPTY_SHELL_NO_CONTENT precedence over the legacy 4-way + # ladder (Stage 2 axis A3). If the slide has aligned MDX sections but + # every selected unit is an IMP-30 u4 empty-shell placeholder (zero + # content units selected, at least one empty-shell unit selected), the + # visible artifact carries no real content — ``overall`` MUST report + # EMPTY_SHELL_NO_CONTENT so the u3 CLI exit / final_status.html styling + # can branch on it. The check runs before the visual/coverage ladder + # because a content-empty placeholder slide can technically pass Selenium + # overflow checks (no content → no overflow); without this precedence the + # ladder would mislabel it PASS (Stage 1 mdx05 Case B honesty defect). + is_empty_shell_only = ( + len(aligned_ids) > 0 + and len(empty_shell_units_list) > 0 + and len(content_units_list) == 0 + ) + if is_empty_shell_only: + overall = "EMPTY_SHELL_NO_CONTENT" + elif full_coverage and visual_passed: overall = "PASS" elif full_coverage and not visual_passed: overall = "RENDERED_WITH_VISUAL_REGRESSION" @@ -3102,6 +3221,11 @@ def compute_slide_status(sections: list[MdxSection], "full_mdx_coverage": full_coverage, "aligned_section_ids": aligned_ids, "covered_section_ids": sorted(covered), + # IMP-87 u2 — additive empty/content accounting (overall enum precedence above). + "content_rendered_section_ids": content_rendered_section_ids, + "content_rendered_unit_count": len(content_units_list), + "empty_shell_section_ids": empty_shell_section_ids, + "empty_shell_unit_count": len(empty_shell_units_list), "filtered_section_ids": filtered_ids, "filtered_section_reasons": filtered_section_reasons, "selection_path": "fallback_used" if fallback_selections else "rank_1", @@ -5929,7 +6053,11 @@ def run_phase_z2_mvp1( ) # Step 20 HTML — 최종 판정 시각 보고 _overall = slide_status.get("overall", "?") - _ov_class = "pass" if "PASS" in _overall else "fail" if "FAIL" in _overall or "REGRESSION" in _overall else "partial" + # IMP-87 u3 — route the dashboard CSS class through _final_status_html_class + # so EMPTY_SHELL_NO_CONTENT renders as "fail" (red) instead of the legacy + # substring-default "partial" amber. Other enums keep their pre-IMP-87 + # colour (Stage 2 axis A5 lock). + _ov_class = _final_status_html_class(_overall) _vfs = slide_status.get("visual_fail_reasons") or [] _vfs_html = ( "" @@ -6048,6 +6176,36 @@ def run_phase_z2_mvp1( f"(impl_status={nap.get('next_action_implementation_status')})") # 13. Exit 정책 — visual fail 은 abort, partial coverage 는 abort 안 하지만 PASS 도 아님 + # IMP-87 u3 — BLOCKED exit on EMPTY_SHELL_NO_CONTENT precedes the legacy + # visual_fail / partial_coverage branches (Stage 2 axis A4). A slide whose + # every selected unit is an IMP-30 u4 empty-shell placeholder carries no + # MDX content; without this branch a Case B run could pass Selenium + # overflow checks (nothing to overflow) and silently return overall + # EMPTY_SHELL_NO_CONTENT without any CLI exit signal, violating + # feedback_artifact_status_naming. + if _is_blocked_overall(overall): + _aligned = slide_status.get("aligned_section_ids") or [] + _empty_ids = slide_status.get("empty_shell_section_ids") or [] + _empty_count = slide_status.get("empty_shell_unit_count", 0) + _content_count = slide_status.get("content_rendered_unit_count", 0) + print( + f"\n[Phase Z-2 IMP-87 u3] BLOCKED @ empty_shell_no_content ({overall})", + file=sys.stderr, + ) + print( + " reason : every selected unit is IMP-30 u4 empty-shell placeholder " + "(no MDX content rendered)", + file=sys.stderr, + ) + print(f" aligned : {_aligned}", file=sys.stderr) + print(f" shell : {_empty_ids}", file=sys.stderr) + print( + f" units : empty_shell_unit_count={_empty_count} " + f"content_rendered_unit_count={_content_count}", + file=sys.stderr, + ) + sys.exit(1) + if not slide_status["visual_check_passed"]: err_path = write_overflow_error(run_dir, overflow) print(f"\n[Phase Z-2 MVP-1.5b] FAIL @ visual_runtime_check ({overall})", file=sys.stderr) diff --git a/tests/test_phase_z2_imp30_first_render.py b/tests/test_phase_z2_imp30_first_render.py index fee5e4f..d3cd965 100644 --- a/tests/test_phase_z2_imp30_first_render.py +++ b/tests/test_phase_z2_imp30_first_render.py @@ -1012,11 +1012,15 @@ def test_u6_empty_shell_unit_listed_with_empty_identifiers(): assert entry["selection_path"] == "empty_shell" assert entry["fallback_reason"] == "no_v4_rank_1_for_any_section" assert entry["v4_rank"] is None - # full_mdx_coverage holds because shell.source_section_ids covers every - # aligned section id — u4 deliberately sets this so coverage stays True - # under the terminal first-render invariant. - assert status["full_mdx_coverage"] is True - assert status["overall"] == "PASS" + # IMP-87 u4 — honesty defect inversion. The shell.source_section_ids + # still feeds legacy covered_section_ids for display, but the content- + # rendered axis (u1) excludes empty-shell units, so full_mdx_coverage + # MUST flip to False. Overall (u2) MUST elevate to + # EMPTY_SHELL_NO_CONTENT before the legacy ladder, otherwise a slide + # whose sole rendered unit is __empty__ would be reported as PASS — + # the exact Stage 1 mdx05 honesty defect this issue exists to fix. + assert status["full_mdx_coverage"] is False + assert status["overall"] == "EMPTY_SHELL_NO_CONTENT" # ─── u6 case 4 : mixed selection — provisional + normal units coexist ── @@ -1357,10 +1361,14 @@ def test_u7_e2e_zero_v4_empty_shell_status_surface(u7_patch_selector_deps): assert shell_entry["phase_z_status"] == "empty_shell" assert shell_entry["frame_template_id"] == "__empty__" assert shell_entry["source_section_ids"] == ["S1", "S2"] - # Coverage check — both sections counted as covered by the shell unit - # (rendered=True path; PASS enum unchanged by provisional qualifier). - assert status["full_mdx_coverage"] is True - assert status["overall"] == "PASS" + # IMP-87 u4 — honesty defect inversion. The shell unit still attaches + # both sections to legacy covered_section_ids (display preserved), but + # the content-rendered axis (u1) excludes empty-shell units, so + # full_mdx_coverage MUST flip to False. Overall (u2) MUST elevate to + # EMPTY_SHELL_NO_CONTENT before the legacy 4-way ladder, so a zero-V4 + # slide cannot disguise itself as PASS through visual-overflow alone. + assert status["full_mdx_coverage"] is False + assert status["overall"] == "EMPTY_SHELL_NO_CONTENT" # ─── u7 case 3 : e2e normal path unchanged when opt-in flags both on ───── diff --git a/tests/test_phase_z2_imp87_empty_shell_honesty.py b/tests/test_phase_z2_imp87_empty_shell_honesty.py new file mode 100644 index 0000000..2393195 --- /dev/null +++ b/tests/test_phase_z2_imp87_empty_shell_honesty.py @@ -0,0 +1,460 @@ +"""IMP-87 empty-shell honesty regression tests — u1+u2+u3 scope. + +Anchors the Stage 1 contract that an empty-shell-only run MUST NOT report +full_mdx_coverage=True (u1), MUST surface a distinct ``overall`` enum plus +additive empty/content accounting (u2), AND MUST resolve to a BLOCKED CLI +exit (returncode 1) with a red ``"fail"`` CSS class on the step20 +final_status.html dashboard (u3). + +u1 (baseline): + 1) ``_is_empty_shell_unit`` helper — three independent OR-branches over + ``frame_template_id == "__empty__"`` / ``label == "empty_shell"`` / + ``merge_type == "empty_shell"``. Any IMP-30 u4 phase A / phase B / e2e + synthesised placeholder is classified consistently. + 2) ``compute_slide_status`` coverage split — the legacy ``covered`` + accumulator (display semantics, preserved for IMP-05 / IMP-06 readers) + still tracks every section attached to any selected unit, while the new + ``content_covered`` axis counts only non-empty-shell units. The + ``filtered_section_ids`` / ``full_mdx_coverage`` axes derive from + ``content_covered`` so an EMPTY-SHELL-only slide cannot inherit + full_mdx_coverage=True (Case B honesty defect lock, Stage 1 anchor + c53722ad). + +u2: + 3) ``overall == "EMPTY_SHELL_NO_CONTENT"`` precedence over the legacy + 4-way ladder when every selected unit is an empty-shell placeholder + (zero content units selected, at least one empty-shell unit selected, + at least one aligned section). The precedence runs BEFORE the + visual/coverage ladder so a content-empty slide that happens to pass + Selenium overflow checks cannot be mislabelled PASS. + 4) Additive return-dict fields ``content_rendered_section_ids`` / + ``content_rendered_unit_count`` / ``empty_shell_section_ids`` / + ``empty_shell_unit_count``. No legacy field is removed or repurposed. + +u3 (added by this revision): + 5) ``_final_status_html_class`` helper — EMPTY_SHELL_NO_CONTENT resolves + to ``"fail"`` (red) so the step20 final_status.html dashboard surfaces + the Case B honesty defect in the same colour band as visual failures + and regressions, not the legacy ``"partial"`` amber band. Existing + PASS / RENDERED_WITH_VISUAL_REGRESSION / PARTIAL_COVERAGE / etc. + enums keep their pre-IMP-87 substring-based colour (Stage 2 axis A5). + 6) ``_is_blocked_overall`` helper — true iff ``overall`` warrants a + BLOCKED CLI exit (returncode 1) independent of the visual_check / + full_mdx_coverage axes. Currently only EMPTY_SHELL_NO_CONTENT is + blocked; every other enum returns False so the legacy CLI ladder + (visual_fail → exit 1, partial_coverage → return, PASS → return) is + preserved bit-for-bit (Stage 2 axis A4). + +Per feedback_scope_qualified_verification, each test declares which case it +covers (A=content only, B=empty-shell only, C=mixed) and whether the +assertion is the u1 surface (helper/coverage), the u2 surface (overall enum +/ accounting fields), or the u3 surface (CLI exit helper / HTML class). +""" +from __future__ import annotations + +from src.phase_z2_composition import CompositionUnit +from src.phase_z2_pipeline import ( + MdxSection, + _final_status_html_class, + _is_blocked_overall, + _is_empty_shell_unit, + compute_slide_status, +) + + +def _mk_section(section_id: str) -> MdxSection: + return MdxSection( + section_id=section_id, + section_num=int(section_id.lstrip("S") or "0"), + title=f"Section {section_id}", + raw_content=f"raw {section_id}", + ) + + +def _mk_content_unit(*, section_ids: list[str], **overrides) -> CompositionUnit: + base = dict( + source_section_ids=list(section_ids), + merge_type="single", + frame_template_id="MOCK_template_direct_a", + frame_id="MOCK_frame_001", + frame_number=1, + confidence=0.9, + label="use_as_is", + phase_z_status="matched_zone", + raw_content="alpha", + title="MOCK content", + v4_rank=1, + selection_path="rank_1", + fallback_reason=None, + score=1.0, + provisional=False, + ) + base.update(overrides) + return CompositionUnit(**base) + + +def _mk_empty_shell_unit(*, section_ids: list[str]) -> CompositionUnit: + """Mirror of IMP-30 u4 phase B empty-shell synthesis. All three marker + fields set so _is_empty_shell_unit returns True via every branch + independently — the per-marker tests below override individually.""" + return CompositionUnit( + source_section_ids=list(section_ids), + merge_type="empty_shell", + frame_template_id="__empty__", + frame_id="__empty__", + frame_number=0, + confidence=0.0, + label="empty_shell", + phase_z_status="empty_shell", + raw_content="\n\n".join(f"raw {sid}" for sid in section_ids), + title=" / ".join(f"Section {sid}" for sid in section_ids), + v4_rank=None, + selection_path="empty_shell", + fallback_reason="no_v4_rank_1_for_any_section", + score=0.0, + provisional=True, + ) + + +# ════════════════════════════════════════════════════════════════════════ +# Helper unit tests — _is_empty_shell_unit detection (3-marker OR) +# ════════════════════════════════════════════════════════════════════════ + + +def test_helper_detects_unit_via_frame_template_id_marker_only(): + """Empty-shell classifier triggers on frame_template_id="__empty__" + even when label / merge_type look like a normal content unit. Guards + a future code path that only sets the frame_template_id field.""" + u = _mk_content_unit( + section_ids=["S1"], + frame_template_id="__empty__", + ) + assert _is_empty_shell_unit(u) is True + + +def test_helper_detects_unit_via_label_marker_only(): + """Classifier triggers on label="empty_shell" alone.""" + u = _mk_content_unit(section_ids=["S1"], label="empty_shell") + assert _is_empty_shell_unit(u) is True + + +def test_helper_detects_unit_via_merge_type_marker_only(): + """Classifier triggers on merge_type="empty_shell" alone.""" + u = _mk_content_unit(section_ids=["S1"], merge_type="empty_shell") + assert _is_empty_shell_unit(u) is True + + +def test_helper_rejects_normal_content_unit(): + """A vanilla rank-1 use_as_is unit is NOT an empty shell.""" + u = _mk_content_unit(section_ids=["S1"]) + assert _is_empty_shell_unit(u) is False + + +# ════════════════════════════════════════════════════════════════════════ +# Case B — empty-shell only (mdx05 Stage 1 reproduction, u1 surface axes) +# ════════════════════════════════════════════════════════════════════════ + + +def test_case_b_empty_shell_only_breaks_full_mdx_coverage(): + """Stage 1 mdx05 reproduction at the u1 surface. With one empty-shell + unit covering every aligned section, the new content_covered axis is + empty, so ``full_mdx_coverage`` MUST be False and every aligned + section MUST surface in ``filtered_section_ids``. The legacy + ``covered_section_ids`` field (display semantics) still includes the + shell-covered sections — preserved for IMP-05 / IMP-06 readers.""" + sections = [_mk_section("S1"), _mk_section("S2"), _mk_section("S3")] + shell = _mk_empty_shell_unit(section_ids=["S1", "S2", "S3"]) + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, [shell], comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["full_mdx_coverage"] is False + assert status["filtered_section_ids"] == ["S1", "S2", "S3"] + # Legacy covered axis preserved — IMP-87 u1 does NOT change display + # semantics for downstream readers; it only redefines the honesty axis. + assert status["covered_section_ids"] == ["S1", "S2", "S3"] + + +# ════════════════════════════════════════════════════════════════════════ +# Case C — mixed empty + content units (partial coverage realism) +# ════════════════════════════════════════════════════════════════════════ + + +def test_case_c_mixed_empty_and_content_filters_shell_sections_only(): + """A content unit covers S1; an empty_shell placeholder covers S2. + Only S2 should surface as filtered (the shell does not count as content + coverage), but the legacy ``covered_section_ids`` field still lists + both sections (display semantics unchanged by u1).""" + sections = [_mk_section("S1"), _mk_section("S2")] + units = [ + _mk_content_unit(section_ids=["S1"]), + _mk_empty_shell_unit(section_ids=["S2"]), + ] + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, units, comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["full_mdx_coverage"] is False + assert status["filtered_section_ids"] == ["S2"] + assert status["covered_section_ids"] == ["S1", "S2"] + + +# ════════════════════════════════════════════════════════════════════════ +# Case A — normal content-only PASS preserved +# ════════════════════════════════════════════════════════════════════════ + + +def test_case_a_normal_content_only_preserves_full_coverage(): + """IMP-05 / IMP-30 regression guard at the u1 surface. A slide whose + every selected unit is non-empty-shell content must continue to report + ``full_mdx_coverage`` == True with an empty ``filtered_section_ids``. + No behavioral change vs pre-IMP-87 baseline — u1 must be additive.""" + sections = [_mk_section("S1"), _mk_section("S2")] + units = [ + _mk_content_unit(section_ids=["S1"]), + _mk_content_unit(section_ids=["S2"], frame_id="MOCK_frame_002"), + ] + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, units, comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["full_mdx_coverage"] is True + assert status["filtered_section_ids"] == [] + assert status["covered_section_ids"] == ["S1", "S2"] + + +# ════════════════════════════════════════════════════════════════════════ +# IMP-87 u2 — EMPTY_SHELL_NO_CONTENT overall + additive accounting fields +# ════════════════════════════════════════════════════════════════════════ +# +# These tests assert the u2 surface ONLY: +# - ``overall`` enum precedence (EMPTY_SHELL_NO_CONTENT before the 4-way +# ladder) for Case B; legacy ladder preserved for Cases A and C. +# - Additive return-dict fields populate correctly across the three cases. +# CLI exit codes and final_status.html styling are u3 scope and remain +# unasserted here. + + +def test_case_b_u2_overall_is_empty_shell_no_content_even_when_visual_passes(): + """Case B (empty-shell only) — Stage 1 mdx05 reproduction at the u2 + surface. The precedence check MUST fire before the visual/coverage + ladder: even though ``overflow_pass={passed: True}`` would otherwise + push the ladder to PASS (the content-empty slide has nothing to + overflow), the u2 precedence forces ``overall`` to + EMPTY_SHELL_NO_CONTENT. Guards the Stage 1 honesty defect.""" + sections = [_mk_section("S1"), _mk_section("S2"), _mk_section("S3")] + shell = _mk_empty_shell_unit(section_ids=["S1", "S2", "S3"]) + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, [shell], comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["overall"] == "EMPTY_SHELL_NO_CONTENT" + + +def test_case_b_u2_overall_remains_empty_shell_no_content_when_visual_fails(): + """Case B — precedence is unconditional on visual_passed. A visual fail + must NOT downgrade EMPTY_SHELL_NO_CONTENT to a ladder enum; the empty- + shell signal dominates because the slide carries no MDX content to + rescue regardless of overflow status.""" + sections = [_mk_section("S1"), _mk_section("S2")] + shell = _mk_empty_shell_unit(section_ids=["S1", "S2"]) + overflow_fail = {"passed": False, "fail_reasons": ["mock_overflow"]} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, [shell], comp_debug, overflow_fail, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["overall"] == "EMPTY_SHELL_NO_CONTENT" + + +def test_case_b_u2_accounting_fields_populate_for_empty_shell_only(): + """Case B — additive accounting fields. All aligned sections appear in + ``empty_shell_section_ids`` and none in ``content_rendered_section_ids``. + Counts mirror the unit partition (1 shell unit, 0 content units).""" + sections = [_mk_section("S1"), _mk_section("S2"), _mk_section("S3")] + shell = _mk_empty_shell_unit(section_ids=["S1", "S2", "S3"]) + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, [shell], comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["empty_shell_section_ids"] == ["S1", "S2", "S3"] + assert status["content_rendered_section_ids"] == [] + assert status["empty_shell_unit_count"] == 1 + assert status["content_rendered_unit_count"] == 0 + + +def test_case_a_u2_overall_pass_preserved_for_content_only(): + """Case A — content-only slide. u2 precedence MUST NOT fire (zero + empty-shell units) so the legacy ladder still reports PASS when visual + + full coverage line up. Regression guard against precedence over-firing.""" + sections = [_mk_section("S1"), _mk_section("S2")] + units = [ + _mk_content_unit(section_ids=["S1"]), + _mk_content_unit(section_ids=["S2"], frame_id="MOCK_frame_002"), + ] + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, units, comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["overall"] == "PASS" + assert status["empty_shell_section_ids"] == [] + assert status["content_rendered_section_ids"] == ["S1", "S2"] + assert status["empty_shell_unit_count"] == 0 + assert status["content_rendered_unit_count"] == 2 + + +def test_case_c_u2_mixed_falls_to_legacy_partial_coverage_ladder(): + """Case C — mixed empty + content. u2 precedence MUST NOT fire (one + content unit exists), so the legacy ladder takes over: not-full-coverage + + visual-pass → PARTIAL_COVERAGE. Accounting fields split the unit + partition cleanly (S1 = content, S2 = shell).""" + sections = [_mk_section("S1"), _mk_section("S2")] + units = [ + _mk_content_unit(section_ids=["S1"]), + _mk_empty_shell_unit(section_ids=["S2"]), + ] + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug = {"candidates_summary": []} + + status = compute_slide_status( + sections, units, comp_debug, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + + assert status["overall"] == "PARTIAL_COVERAGE" + assert status["empty_shell_section_ids"] == ["S2"] + assert status["content_rendered_section_ids"] == ["S1"] + assert status["empty_shell_unit_count"] == 1 + assert status["content_rendered_unit_count"] == 1 + + +# ════════════════════════════════════════════════════════════════════════ +# IMP-87 u3 — _final_status_html_class CSS class mapping +# ════════════════════════════════════════════════════════════════════════ +# +# These tests assert the u3 dashboard CSS-class surface ONLY. CLI exit +# decisions are covered by the _is_blocked_overall tests further down. End- +# to-end pipeline integration (full run → BLOCKED returncode, red HTML in +# the actual artifact file) is intentionally left to the smoke layer in +# u5; here the helper-level guarantee is enough because the call site +# (``_ov_class = _final_status_html_class(_overall)``) is a direct +# substitution of the previous inline expression. + + +def test_u3_html_class_empty_shell_no_content_is_fail_red(): + """Case B u3 surface — EMPTY_SHELL_NO_CONTENT MUST map to ``"fail"`` so + the dashboard surfaces the honesty defect in the red colour band. Stage + 2 axis A5 lock: the literal does NOT contain ``"PASS"`` / ``"FAIL"`` / + ``"REGRESSION"`` so without the explicit branch the helper would default + to ``"partial"`` (the amber legacy band), which is the exact mislabel + u3 corrects.""" + assert _final_status_html_class("EMPTY_SHELL_NO_CONTENT") == "fail" + + +def test_u3_html_class_pass_preserves_legacy_substring_mapping(): + """Case A u3 surface — the legacy substring rule for ``"PASS"`` MUST + stay intact so existing dashboards keep the green band for healthy + runs. Guards against the u3 explicit branch over-firing on any string + that happens to contain ``"PASS"``.""" + assert _final_status_html_class("PASS") == "pass" + + +def test_u3_html_class_regression_and_fail_substrings_preserved(): + """u3 surface — the legacy ``"FAIL"`` / ``"REGRESSION"`` substring rule + MUST stay intact for the existing visual-regression enums so dashboards + keep the red band for overflow / regression failures. The pre-IMP-87 + expression is preserved verbatim in the fallback branch; this test + guards that preservation.""" + assert ( + _final_status_html_class("RENDERED_WITH_VISUAL_REGRESSION") == "fail" + ) + assert ( + _final_status_html_class("PARTIAL_COVERAGE_WITH_VISUAL_REGRESSION") + == "fail" + ) + + +def test_u3_html_class_partial_coverage_remains_amber_partial(): + """u3 surface — ``"PARTIAL_COVERAGE"`` MUST stay in the amber + ``"partial"`` band: it carries some rendered content (unlike Case B) + and does not deserve the red band. Guards against the EMPTY_SHELL + branch accidentally widening to the partial enum.""" + assert _final_status_html_class("PARTIAL_COVERAGE") == "partial" + + +def test_u3_html_class_unknown_enum_falls_back_to_partial(): + """u3 surface — defensive default. An unrecognised enum string (e.g. + legacy ``"?"`` from the ``slide_status.get("overall", "?")`` fallback + at the call site) MUST keep the legacy amber default so the dashboard + does not silently shift colour bands when a new enum is added without + updating the helper.""" + assert _final_status_html_class("?") == "partial" + assert _final_status_html_class("UNKNOWN_FUTURE_ENUM") == "partial" + + +# ════════════════════════════════════════════════════════════════════════ +# IMP-87 u3 — _is_blocked_overall CLI exit gating +# ════════════════════════════════════════════════════════════════════════ + + +def test_u3_blocked_overall_true_for_empty_shell_no_content(): + """Case B u3 surface — EMPTY_SHELL_NO_CONTENT MUST be flagged for the + BLOCKED CLI exit branch. The CLI gates ``sys.exit(1)`` on this helper + BEFORE the legacy visual_fail / partial_coverage branches so a + content-empty slide that passes Selenium overflow checks cannot + silently return without an exit signal (Stage 2 axis A4 lock, + feedback_artifact_status_naming guardrail).""" + assert _is_blocked_overall("EMPTY_SHELL_NO_CONTENT") is True + + +def test_u3_blocked_overall_false_for_pass_and_legacy_failure_enums(): + """Case A u3 surface — every pre-IMP-87 enum MUST stay False so the + legacy CLI ladder is preserved bit-for-bit (visual_fail → exit 1, + partial_coverage → return without exit, PASS → return without exit). + Regression guard against the BLOCKED branch widening beyond the Case B + honesty defect.""" + for enum in ( + "PASS", + "RENDERED_WITH_VISUAL_REGRESSION", + "PARTIAL_COVERAGE", + "PARTIAL_COVERAGE_WITH_VISUAL_REGRESSION", + ): + assert _is_blocked_overall(enum) is False, ( + f"{enum} must NOT trigger the IMP-87 u3 BLOCKED exit; only " + f"EMPTY_SHELL_NO_CONTENT is gated." + ) + + +def test_u3_blocked_overall_false_for_defensive_unknown_enum(): + """u3 surface — defensive default. An unknown / sentinel enum string + MUST stay out of the BLOCKED branch so a future enum addition does + not accidentally enable exit-1 behaviour without an explicit opt-in + here (Stage 2 axis A4 narrow allow-list lock).""" + assert _is_blocked_overall("?") is False + assert _is_blocked_overall("UNKNOWN_FUTURE_ENUM") is False diff --git a/tests/test_pipeline_smoke_imp85.py b/tests/test_pipeline_smoke_imp85.py index 2ec68bf..7389d46 100644 --- a/tests/test_pipeline_smoke_imp85.py +++ b/tests/test_pipeline_smoke_imp85.py @@ -14,13 +14,29 @@ invariant + runtime VP gate end-to-end against real MDX inputs: *downstream* of that routing (e.g. layout_css zone aggregation when all live zones are adapter_needed) is a separate axis and out of scope for this issue (see follow_up_issue_candidates). - * mdx05 — non-VP rank-1 path stays clean (exit 0). + * mdx05 — IMP-#87 u5 inversion. mdx05 has ZERO V4 evidence for any + section (``judgments_full32 = 0``, Case B per IMP-#87 Stage 1), + so the composition planner emits an IMP-#30 u4 EMPTY-SHELL + placeholder for the whole slide. Before IMP-#87 the pipeline + reported ``overall=PASS`` + ``full_mdx_coverage=True`` for this + state — the honesty defect this issue fixes. After IMP-#87 u2/u3 + the same run elevates ``overall`` to + ``EMPTY_SHELL_NO_CONTENT`` and the CLI exits 1 (BLOCKED). The old + exit-0 mdx05 smoke is therefore stale; this module now (a) keeps + mdx03 in the exit-0 non-VP parametrization, (b) adds a dedicated + mdx05 blocked-exit assertion that verifies the new + ``EMPTY_SHELL_NO_CONTENT`` status surface, and (c) preserves the + IMP-#85 crash-marker guard on the mdx05 path so future + regressions cannot re-introduce the original uncaught + ``BuilderMissingError`` propagation under cover of the blocked + exit. Each subprocess gets a unique run_id so the runs do not collide on disk when pytest is invoked concurrently or with -x retry. """ from __future__ import annotations +import json import subprocess import sys import uuid @@ -30,6 +46,7 @@ import pytest REPO_ROOT = Path(__file__).resolve().parents[1] SAMPLES_DIR = REPO_ROOT / "samples" / "mdx_batch" +RUNS_DIR = REPO_ROOT / "data" / "runs" # Original IMP-#85 crash signature (issue body verbatim). u1 converted # the uncaught ``ValueError`` raised from the mapper's missing-builder @@ -65,15 +82,22 @@ def _unique_run_id(prefix: str) -> str: "mdx_name,prefix", [ ("03.mdx", "mdx03"), - ("05.mdx", "mdx05"), ], ) def test_non_vp_smoke_runs_clean(mdx_name: str, prefix: str) -> None: - """mdx03 / mdx05 hit non-VP rank-1 frames; the pipeline runs to exit 0. + """mdx03 hits non-VP rank-1 frames; the pipeline runs to exit 0. Non-VP rank-1 selection is the normal Phase Z path and the - primary regression guard that u1-u6 do not perturb mapper / - pipeline behaviour for non-VP routes. + primary regression guard that IMP-#85 u1-u6 do not perturb + mapper / pipeline behaviour for non-VP routes. + + IMP-#87 u5 — mdx05 was removed from this parametrization because + its V4 evidence is empty for every aligned section (Case B, + Stage 1 lock). The IMP-#87 u2 ``EMPTY_SHELL_NO_CONTENT`` enum + + u3 BLOCKED CLI exit make the post-IMP-#87 mdx05 run exit 1, + not 0, so an exit-0 parametrization would now be stale. The + dedicated mdx05 blocked-exit coverage lives in + ``test_mdx05_blocked_exit_empty_shell_no_content`` below. """ cp = _run_pipeline(mdx_name, _unique_run_id(prefix)) assert cp.returncode == 0, ( @@ -83,6 +107,69 @@ def test_non_vp_smoke_runs_clean(mdx_name: str, prefix: str) -> None: ) +def test_mdx05_blocked_exit_empty_shell_no_content() -> None: + """mdx05 must exit 1 (BLOCKED) with ``overall=EMPTY_SHELL_NO_CONTENT``. + + IMP-#87 u5 — mdx05 is the canonical Case B fixture (zero V4 + evidence for any aligned section per Stage 1; ``judgments_full32 = 0`` + in step05). The pre-IMP-#87 pipeline mislabelled this state as + ``overall=PASS`` + ``full_mdx_coverage=True`` because the only + rendered unit was an IMP-#30 u4 EMPTY-SHELL placeholder + (``frame_template_id="__empty__"``) which trivially passes the + Selenium overflow check. IMP-#87 u1 splits content-rendered + coverage from legacy ``covered_section_ids``, u2 elevates the + overall enum to ``EMPTY_SHELL_NO_CONTENT`` before the legacy + ladder, and u3 routes that enum to a BLOCKED CLI exit (1). + + This smoke pins the post-IMP-#87 contract on the real mdx05 + pipeline run: + + * subprocess returncode == 1 (BLOCKED, u3 axis A4). + * ``step20_slide_status.json`` ``overall`` == + ``"EMPTY_SHELL_NO_CONTENT"`` (u2 axis A3 precedence over the + legacy 4-way ladder). + * ``step20_slide_status.json`` ``full_mdx_coverage`` is False + (u1 axis A2 content-rendered coverage split). + * The IMP-#85 original crash marker + (``PAYLOAD_BUILDERS has no such entry``) is absent from both + stdout and stderr — the IMP-#85 crash-marker guard is + preserved on the mdx05 path even though mdx05 itself no + longer exits 0. + """ + run_id = _unique_run_id("mdx05") + cp = _run_pipeline("05.mdx", run_id) + + assert cp.returncode == 1, ( + f"mdx05 expected BLOCKED exit 1, got {cp.returncode}\n" + f"--- stderr tail ---\n{cp.stderr[-1500:]}\n" + f"--- stdout tail ---\n{cp.stdout[-1500:]}" + ) + + combined = cp.stdout + cp.stderr + assert IMP85_OLD_CRASH_MARKER not in combined, ( + "IMP-#85 original crash signature regressed on mdx05 path:\n" + f"--- stderr tail ---\n{cp.stderr[-1500:]}\n" + f"--- stdout tail ---\n{cp.stdout[-1500:]}" + ) + + status_path = RUNS_DIR / run_id / "phase_z2" / "steps" / "step20_slide_status.json" + assert status_path.is_file(), ( + f"mdx05 step20_slide_status.json not found at {status_path}\n" + f"--- stderr tail ---\n{cp.stderr[-1500:]}\n" + f"--- stdout tail ---\n{cp.stdout[-1500:]}" + ) + status_payload = json.loads(status_path.read_text(encoding="utf-8")) + status_data = status_payload.get("data") or {} + assert status_data.get("overall") == "EMPTY_SHELL_NO_CONTENT", ( + f"mdx05 overall expected EMPTY_SHELL_NO_CONTENT, got " + f"{status_data.get('overall')!r}" + ) + assert status_data.get("full_mdx_coverage") is False, ( + f"mdx05 full_mdx_coverage expected False, got " + f"{status_data.get('full_mdx_coverage')!r}" + ) + + def test_mdx04_no_longer_emits_imp85_crash_signature() -> None: """mdx04 must no longer surface the IMP-#85 uncaught crash marker.