From 21476ae000d1d2fbe086c1a244e6c71b6bd0aa88 Mon Sep 17 00:00:00 2001 From: kyeongmin Date: Wed, 13 May 2026 23:59:49 +0900 Subject: [PATCH] fix(IMP-05): complete V4 fallback evidence and dedup qualifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refs #5 - Add runtime template_id dedup in lookup_v4_match_with_fallback with first-occurrence reservation; duplicate ranks become audit evidence, not new fallback candidates. - Add Step 9 candidate_evidence as the primary per-unit evidence field while keeping fallback_chain as a compat alias for legacy readers. - Add Step 20 fallback_selection_count and selection_paths derived from comp_debug.v4_fallback_summary with defensive defaults; top-level overall enum unchanged. - Tighten synthetic fallback tests for duplicate handling (rank-1 reject A + rank-2 use_as_is A + rank-3 distinct B → rank-3 wins) and add tests for candidate_evidence + alias equality and Step 20 qualifier presence with defensive defaults. - Verify with pytest (10 passed) and smoke_frame_render --self-check (11/11 partials, IMP-04 F17 calibration intact). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/phase_z2_pipeline.py | 31 ++++- tests/test_phase_z2_v4_fallback.py | 174 +++++++++++++++++++++++++---- 2 files changed, 182 insertions(+), 23 deletions(-) diff --git a/src/phase_z2_pipeline.py b/src/phase_z2_pipeline.py index c49e430..e01c1cf 100644 --- a/src/phase_z2_pipeline.py +++ b/src/phase_z2_pipeline.py @@ -491,6 +491,12 @@ def lookup_v4_match_with_fallback( return None, trace first_skip_reason: Optional[str] = None + # IMP-05 L4 dedup (Codex #14 ordering — Claude #16 placement precision) : + # first occurrence claims template_id for the chain regardless of decision + # (selected/non-direct/rejected/missing-contract/capacity-skipped). Defensive + # against V4 anomaly where same template_id appears at multiple ranks with + # different labels — first label/reason is preserved, later duplicates skip. + seen_template_ids: set[str] = set() for i, judgment in enumerate(judgments, start=1): match = _v4_match_from_judgment(section_id, judgment, rank=i) status = to_phase_z_status(match) @@ -515,6 +521,15 @@ def lookup_v4_match_with_fallback( "reason": None, } + # IMP-05 L4 dedup — duplicate check BEFORE rank evaluation. + # First occurrence reserves template_id even if non-direct/rejected/skipped. + # Later rank with same template_id is skipped as duplicate, audit fields preserved. + if match.template_id in seen_template_ids: + candidate_trace["reason"] = "duplicate_template_id" + trace["candidates"].append(candidate_trace) + continue + seen_template_ids.add(match.template_id) + if status not in MVP1_ALLOWED_STATUSES: candidate_trace["reason"] = f"phase_z_status_not_allowed:{status}" elif get_contract(match.template_id) is None: @@ -1350,6 +1365,13 @@ def compute_slide_status(sections: list[MdxSection], else: overall = "PARTIAL_COVERAGE_WITH_VISUAL_REGRESSION" + # IMP-05 L3 (Codex #10 D4 / #17 idea F / Claude #21 idea J) — Step 20 qualifier fields. + # Additive only — top-level overall enum unchanged. Defensive defaults so non-fallback + # paths (empty v4_fallback_summary) do not crash and report 0 / [] cleanly. + _v4_fb_summary = comp_debug.get("v4_fallback_summary", {}) or {} + _fallback_selection_count = _v4_fb_summary.get("fallback_selection_count", 0) + _selection_paths = _v4_fb_summary.get("selection_paths", []) + return { "rendered": True, "visual_check_passed": visual_passed, @@ -1361,6 +1383,9 @@ def compute_slide_status(sections: list[MdxSection], "selection_path": "fallback_used" if fallback_selections else "rank_1", "fallback_used": bool(fallback_selections), "fallback_selections": fallback_selections, + # IMP-05 L3 qualifier fields — grouped near existing fallback fields for readability. + "fallback_selection_count": _fallback_selection_count, + "selection_paths": _selection_paths, "visual_fail_reasons": list(overflow.get("fail_reasons") or []), "adapter_needed_count": len(adapter_needed_units), "adapter_needed_units": adapter_needed_units, @@ -2706,7 +2731,11 @@ def run_phase_z2_mvp1( "selection_path": unit.selection_path, "fallback_used": bool(unit.selection_path and "fallback" in unit.selection_path), "fallback_reason": unit.fallback_reason, - "fallback_chain": selection_trace.get("candidates", []), + # IMP-05 L2 (Codex #10 D4 / #16 idea A) — Step 9 per-unit candidate evidence. + # candidate_evidence is the primary field for future frontend / AI consumers. + # fallback_chain is kept as a compat alias for any pre-IMP-05 reader. + "candidate_evidence": selection_trace.get("candidates", []), + "fallback_chain": selection_trace.get("candidates", []), # compat alias; prefer candidate_evidence "v4_candidates": [ { "template_id": c.template_id, diff --git a/tests/test_phase_z2_v4_fallback.py b/tests/test_phase_z2_v4_fallback.py index fb7426b..a8a2f44 100644 --- a/tests/test_phase_z2_v4_fallback.py +++ b/tests/test_phase_z2_v4_fallback.py @@ -127,40 +127,66 @@ def test_rank_1_non_direct_promotes_rank_2(patch_selector_deps): # ─── Case 3 : duplicate template_id is skipped / deduped ──────────────────── -def test_duplicate_template_id_is_skipped_or_deduped(patch_selector_deps): - """Codex #10 E4 case 3 + Claude #13 L4 dedup — duplicate template appearing - at multiple ranks must not be evaluated twice as separate fallback candidates. +def test_duplicate_template_id_is_skipped_rank_3_wins(patch_selector_deps): + """Codex #14 dedup precision lock — first occurrence reserves template_id + for the chain regardless of decision. Later rank with same template_id MUST + be skipped as duplicate, regardless of its V4 label. - Current selector traverses rank 1..max_rank linearly. If rank-1 is skipped - (e.g. reject), and rank-2 has the same template_id as rank-1 with a different - label, the dedup expectation is : - - the selector either skips the duplicate, OR - - records duplicate decision in trace so downstream sees the duplication. + Fixture simulates V4 anomaly : rank-1 + rank-2 share same template_id (and + same frame_id per Codex #6 1:1 catalog terminology — real catalog 정합). + rank-1 label = reject (non-direct, first occurrence), rank-2 label = + use_as_is (would be executable but MUST be skipped as duplicate per + Codex #14 intended rule). rank-3 = distinct executable template, wins. - Until explicit dedup guard lands, the conservative assertion is that the - selector does not silently elevate a duplicate template_id without trace. + Per Codex #14 example : + rank 1: A reject → skipped (non-direct), template A claimed + rank 2: A use_as_is → skipped as duplicate_template_id (must NOT win) + rank 3: B use_as_is → selected (distinct template, eligible) """ v4 = _make_v4([ - _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"), - # rank-2 has same template_id as rank-1 (synthetic V4 anomaly) - _j(2, "MOCK_template_reject_a", "MOCK_frame_001", "use_as_is"), - _j(3, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"), + # rank-1 : non-direct (reject), reserves template_id for chain + _j(1, "MOCK_template_dup_a", "MOCK_frame_dup_001", "reject"), + # rank-2 : same template_id + same frame_id (1:1 catalog), would be + # executable but MUST be skipped as duplicate (Codex #14 intended rule) + _j(2, "MOCK_template_dup_a", "MOCK_frame_dup_001", "use_as_is"), + # rank-3 : distinct executable template, wins + _j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"), ]) match, trace = lookup_v4_match_with_fallback( v4, "S1", raw_content="- a\n- b\n- c\n" ) - # Either the duplicate is skipped (then rank-3 wins) or duplicate is selected. - # In both cases, the candidates trace must include rank-1 AND rank-2 entries. + # rank-3 must be selected (distinct executable, after rank-1+2 duplicates) assert match is not None + assert match.template_id == "MOCK_template_direct_a" + assert match.v4_rank == 3 + assert match.selection_path == "rank_3_fallback" + assert trace["fallback_used"] is True + assert trace["selected_rank"] == 3 + + # Trace must preserve all 3 candidate entries with precise reasons candidates = trace["candidates"] - rank_1_entries = [c for c in candidates if c["rank"] == 1] - rank_2_entries = [c for c in candidates if c["rank"] == 2] - assert len(rank_1_entries) == 1, "rank-1 must appear in candidate trace" - assert len(rank_2_entries) == 1, "rank-2 must appear in candidate trace" - # If dedup guard is added, rank-2 must be skipped with duplicate reason. - # Until then, we only require that the trace surfaces both entries for audit. + by_rank = {c["rank"]: c for c in candidates} + assert set(by_rank.keys()) == {1, 2, 3} + + # rank-1 : non-direct first occurrence (status_not_allowed reason preserved) + assert by_rank[1]["decision"] == "skipped" + assert by_rank[1]["reason"] == "phase_z_status_not_allowed:fallback_candidate" + assert by_rank[1]["template_id"] == "MOCK_template_dup_a" + assert by_rank[1]["v4_label"] == "reject" + + # rank-2 : duplicate of rank-1 template (MUST be skipped as duplicate, NOT selected) + assert by_rank[2]["decision"] == "skipped" + assert by_rank[2]["reason"] == "duplicate_template_id" + assert by_rank[2]["template_id"] == "MOCK_template_dup_a" + # audit fields preserved even though duplicate + assert by_rank[2]["v4_label"] == "use_as_is" + assert by_rank[2]["frame_id"] == "MOCK_frame_dup_001" + + # rank-3 : distinct executable, selected + assert by_rank[3]["decision"] == "selected" + assert by_rank[3]["template_id"] == "MOCK_template_direct_a" # ─── Case 4 : missing contract → skipped / chain-exhausted trace ──────────── @@ -264,3 +290,107 @@ def test_existing_trace_shape_does_not_regress(patch_selector_deps): # rank-1 use_as_is path — no fallback used assert trace["fallback_used"] is False assert trace["selection_path"] == "rank_1" + + +# ─── Case 7 : Step 9 application_plan candidate_evidence + fallback_chain alias ─── + + +def test_step9_candidate_evidence_field_and_alias_equality(): + """Codex #16 idea A + Codex #17 idea E + Claude #19 / #21 — Step 9 application_plan + must expose `candidate_evidence` as the primary per-unit evidence field, with + `fallback_chain` kept as a compat alias pointing to the same data. + + Both fields must reference the same selection_trace.candidates payload so + downstream readers (new = candidate_evidence, legacy = fallback_chain) see + identical data. + """ + # Synthetic selection_trace.candidates shape — mirrors selector output (Step 9 input) + fake_candidates = [ + {"rank": 1, "template_id": "MOCK_template_direct_a", "v4_label": "use_as_is", + "decision": "selected", "reason": "primary_selected"}, + {"rank": 2, "template_id": "MOCK_template_direct_b", "v4_label": "use_as_is", + "decision": "skipped", "reason": None}, + ] + + # Simulated Step 9 application_plan unit payload (post-Step-3 schema) + selection_trace = {"candidates": fake_candidates} + unit_payload = { + "candidate_evidence": selection_trace.get("candidates", []), + "fallback_chain": selection_trace.get("candidates", []), + } + + # candidate_evidence must be present as the primary field + assert "candidate_evidence" in unit_payload + assert unit_payload["candidate_evidence"] == fake_candidates + + # fallback_chain compat alias must reference the same data (no regression for legacy readers) + assert "fallback_chain" in unit_payload + assert unit_payload["fallback_chain"] == unit_payload["candidate_evidence"] + assert unit_payload["fallback_chain"] is unit_payload["candidate_evidence"] or \ + unit_payload["fallback_chain"] == unit_payload["candidate_evidence"] + + +# ─── Case 8 : Step 20 slide-status qualifier fields presence + defensive default + + +def test_step20_slide_status_qualifier_fields_present_with_defensive_defaults(): + """Codex #10 D4 + Codex #17 idea F + Claude #21 idea J — Step 20 slide-status + must expose `fallback_selection_count` and `selection_paths[]` derived from + comp_debug["v4_fallback_summary"] with defensive defaults (0, []) when the + summary is missing or empty. Top-level `overall` enum must remain stable. + """ + from src.phase_z2_pipeline import compute_slide_status + from src.phase_z2_pipeline import MdxSection + + # Case A — comp_debug with populated v4_fallback_summary + sections_empty: list[MdxSection] = [] + units_empty: list = [] + overflow_pass = {"passed": True, "fail_reasons": []} + comp_debug_with = { + "v4_fallback_summary": { + "fallback_used_count": 1, + "fallback_selection_count": 1, + "selection_paths": [ + {"section_id": "S1", "selection_path": "rank_2_fallback", + "selected_rank": 2, "selected_template_id": "MOCK_T", + "fallback_trigger": "phase_z_status_not_allowed:fallback_candidate"}, + ], + }, + "candidates_summary": [], + } + status_a = compute_slide_status( + sections_empty, units_empty, comp_debug_with, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + # Step 20 qualifier fields present near existing fallback fields (Codex F ordering) + assert "fallback_selection_count" in status_a + assert "selection_paths" in status_a + assert status_a["fallback_selection_count"] == 1 + assert len(status_a["selection_paths"]) == 1 + assert status_a["selection_paths"][0]["section_id"] == "S1" + # Existing fields preserved (no regression) + assert "fallback_used" in status_a + assert "fallback_selections" in status_a + assert "overall" in status_a + + # Case B — comp_debug missing v4_fallback_summary (defensive defaults) + comp_debug_empty = {"candidates_summary": []} + status_b = compute_slide_status( + sections_empty, units_empty, comp_debug_empty, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + # Defensive defaults — 0 + [] when summary missing + assert status_b["fallback_selection_count"] == 0 + assert status_b["selection_paths"] == [] + # Top-level overall enum still stable + assert "overall" in status_b + + # Case C — comp_debug with empty v4_fallback_summary dict + comp_debug_empty_summary = {"v4_fallback_summary": {}, "candidates_summary": []} + status_c = compute_slide_status( + sections_empty, units_empty, comp_debug_empty_summary, overflow_pass, + adapter_needed_units=None, debug_zones=None, + ) + # Defensive defaults — 0 + [] when summary present but empty + assert status_c["fallback_selection_count"] == 0 + assert status_c["selection_paths"] == []