fix(IMP-05): complete V4 fallback evidence and dedup qualifiers

Refs #5

- Add runtime template_id dedup in lookup_v4_match_with_fallback with
  first-occurrence reservation; duplicate ranks become audit evidence,
  not new fallback candidates.
- Add Step 9 candidate_evidence as the primary per-unit evidence field
  while keeping fallback_chain as a compat alias for legacy readers.
- Add Step 20 fallback_selection_count and selection_paths derived from
  comp_debug.v4_fallback_summary with defensive defaults; top-level
  overall enum unchanged.
- Tighten synthetic fallback tests for duplicate handling (rank-1 reject A
  + rank-2 use_as_is A + rank-3 distinct B → rank-3 wins) and add tests
  for candidate_evidence + alias equality and Step 20 qualifier presence
  with defensive defaults.
- Verify with pytest (10 passed) and smoke_frame_render --self-check
  (11/11 partials, IMP-04 F17 calibration intact).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-13 23:59:49 +09:00
parent 15c5b9ae00
commit 21476ae000
2 changed files with 182 additions and 23 deletions

View File

@@ -491,6 +491,12 @@ def lookup_v4_match_with_fallback(
return None, trace return None, trace
first_skip_reason: Optional[str] = None first_skip_reason: Optional[str] = None
# IMP-05 L4 dedup (Codex #14 ordering — Claude #16 placement precision) :
# first occurrence claims template_id for the chain regardless of decision
# (selected/non-direct/rejected/missing-contract/capacity-skipped). Defensive
# against V4 anomaly where same template_id appears at multiple ranks with
# different labels — first label/reason is preserved, later duplicates skip.
seen_template_ids: set[str] = set()
for i, judgment in enumerate(judgments, start=1): for i, judgment in enumerate(judgments, start=1):
match = _v4_match_from_judgment(section_id, judgment, rank=i) match = _v4_match_from_judgment(section_id, judgment, rank=i)
status = to_phase_z_status(match) status = to_phase_z_status(match)
@@ -515,6 +521,15 @@ def lookup_v4_match_with_fallback(
"reason": None, "reason": None,
} }
# IMP-05 L4 dedup — duplicate check BEFORE rank evaluation.
# First occurrence reserves template_id even if non-direct/rejected/skipped.
# Later rank with same template_id is skipped as duplicate, audit fields preserved.
if match.template_id in seen_template_ids:
candidate_trace["reason"] = "duplicate_template_id"
trace["candidates"].append(candidate_trace)
continue
seen_template_ids.add(match.template_id)
if status not in MVP1_ALLOWED_STATUSES: if status not in MVP1_ALLOWED_STATUSES:
candidate_trace["reason"] = f"phase_z_status_not_allowed:{status}" candidate_trace["reason"] = f"phase_z_status_not_allowed:{status}"
elif get_contract(match.template_id) is None: elif get_contract(match.template_id) is None:
@@ -1350,6 +1365,13 @@ def compute_slide_status(sections: list[MdxSection],
else: else:
overall = "PARTIAL_COVERAGE_WITH_VISUAL_REGRESSION" overall = "PARTIAL_COVERAGE_WITH_VISUAL_REGRESSION"
# IMP-05 L3 (Codex #10 D4 / #17 idea F / Claude #21 idea J) — Step 20 qualifier fields.
# Additive only — top-level overall enum unchanged. Defensive defaults so non-fallback
# paths (empty v4_fallback_summary) do not crash and report 0 / [] cleanly.
_v4_fb_summary = comp_debug.get("v4_fallback_summary", {}) or {}
_fallback_selection_count = _v4_fb_summary.get("fallback_selection_count", 0)
_selection_paths = _v4_fb_summary.get("selection_paths", [])
return { return {
"rendered": True, "rendered": True,
"visual_check_passed": visual_passed, "visual_check_passed": visual_passed,
@@ -1361,6 +1383,9 @@ def compute_slide_status(sections: list[MdxSection],
"selection_path": "fallback_used" if fallback_selections else "rank_1", "selection_path": "fallback_used" if fallback_selections else "rank_1",
"fallback_used": bool(fallback_selections), "fallback_used": bool(fallback_selections),
"fallback_selections": fallback_selections, "fallback_selections": fallback_selections,
# IMP-05 L3 qualifier fields — grouped near existing fallback fields for readability.
"fallback_selection_count": _fallback_selection_count,
"selection_paths": _selection_paths,
"visual_fail_reasons": list(overflow.get("fail_reasons") or []), "visual_fail_reasons": list(overflow.get("fail_reasons") or []),
"adapter_needed_count": len(adapter_needed_units), "adapter_needed_count": len(adapter_needed_units),
"adapter_needed_units": adapter_needed_units, "adapter_needed_units": adapter_needed_units,
@@ -2706,7 +2731,11 @@ def run_phase_z2_mvp1(
"selection_path": unit.selection_path, "selection_path": unit.selection_path,
"fallback_used": bool(unit.selection_path and "fallback" in unit.selection_path), "fallback_used": bool(unit.selection_path and "fallback" in unit.selection_path),
"fallback_reason": unit.fallback_reason, "fallback_reason": unit.fallback_reason,
"fallback_chain": selection_trace.get("candidates", []), # IMP-05 L2 (Codex #10 D4 / #16 idea A) — Step 9 per-unit candidate evidence.
# candidate_evidence is the primary field for future frontend / AI consumers.
# fallback_chain is kept as a compat alias for any pre-IMP-05 reader.
"candidate_evidence": selection_trace.get("candidates", []),
"fallback_chain": selection_trace.get("candidates", []), # compat alias; prefer candidate_evidence
"v4_candidates": [ "v4_candidates": [
{ {
"template_id": c.template_id, "template_id": c.template_id,

View File

@@ -127,40 +127,66 @@ def test_rank_1_non_direct_promotes_rank_2(patch_selector_deps):
# ─── Case 3 : duplicate template_id is skipped / deduped ──────────────────── # ─── Case 3 : duplicate template_id is skipped / deduped ────────────────────
def test_duplicate_template_id_is_skipped_or_deduped(patch_selector_deps): def test_duplicate_template_id_is_skipped_rank_3_wins(patch_selector_deps):
"""Codex #10 E4 case 3 + Claude #13 L4 dedup — duplicate template appearing """Codex #14 dedup precision lock — first occurrence reserves template_id
at multiple ranks must not be evaluated twice as separate fallback candidates. for the chain regardless of decision. Later rank with same template_id MUST
be skipped as duplicate, regardless of its V4 label.
Current selector traverses rank 1..max_rank linearly. If rank-1 is skipped Fixture simulates V4 anomaly : rank-1 + rank-2 share same template_id (and
(e.g. reject), and rank-2 has the same template_id as rank-1 with a different same frame_id per Codex #6 1:1 catalog terminology — real catalog 정합).
label, the dedup expectation is : rank-1 label = reject (non-direct, first occurrence), rank-2 label =
- the selector either skips the duplicate, OR use_as_is (would be executable but MUST be skipped as duplicate per
- records duplicate decision in trace so downstream sees the duplication. Codex #14 intended rule). rank-3 = distinct executable template, wins.
Until explicit dedup guard lands, the conservative assertion is that the Per Codex #14 example :
selector does not silently elevate a duplicate template_id without trace. rank 1: A reject → skipped (non-direct), template A claimed
rank 2: A use_as_is → skipped as duplicate_template_id (must NOT win)
rank 3: B use_as_is → selected (distinct template, eligible)
""" """
v4 = _make_v4([ v4 = _make_v4([
_j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"), # rank-1 : non-direct (reject), reserves template_id for chain
# rank-2 has same template_id as rank-1 (synthetic V4 anomaly) _j(1, "MOCK_template_dup_a", "MOCK_frame_dup_001", "reject"),
_j(2, "MOCK_template_reject_a", "MOCK_frame_001", "use_as_is"), # rank-2 : same template_id + same frame_id (1:1 catalog), would be
_j(3, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"), # executable but MUST be skipped as duplicate (Codex #14 intended rule)
_j(2, "MOCK_template_dup_a", "MOCK_frame_dup_001", "use_as_is"),
# rank-3 : distinct executable template, wins
_j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"),
]) ])
match, trace = lookup_v4_match_with_fallback( match, trace = lookup_v4_match_with_fallback(
v4, "S1", raw_content="- a\n- b\n- c\n" v4, "S1", raw_content="- a\n- b\n- c\n"
) )
# Either the duplicate is skipped (then rank-3 wins) or duplicate is selected. # rank-3 must be selected (distinct executable, after rank-1+2 duplicates)
# In both cases, the candidates trace must include rank-1 AND rank-2 entries.
assert match is not None assert match is not None
assert match.template_id == "MOCK_template_direct_a"
assert match.v4_rank == 3
assert match.selection_path == "rank_3_fallback"
assert trace["fallback_used"] is True
assert trace["selected_rank"] == 3
# Trace must preserve all 3 candidate entries with precise reasons
candidates = trace["candidates"] candidates = trace["candidates"]
rank_1_entries = [c for c in candidates if c["rank"] == 1] by_rank = {c["rank"]: c for c in candidates}
rank_2_entries = [c for c in candidates if c["rank"] == 2] assert set(by_rank.keys()) == {1, 2, 3}
assert len(rank_1_entries) == 1, "rank-1 must appear in candidate trace"
assert len(rank_2_entries) == 1, "rank-2 must appear in candidate trace" # rank-1 : non-direct first occurrence (status_not_allowed reason preserved)
# If dedup guard is added, rank-2 must be skipped with duplicate reason. assert by_rank[1]["decision"] == "skipped"
# Until then, we only require that the trace surfaces both entries for audit. assert by_rank[1]["reason"] == "phase_z_status_not_allowed:fallback_candidate"
assert by_rank[1]["template_id"] == "MOCK_template_dup_a"
assert by_rank[1]["v4_label"] == "reject"
# rank-2 : duplicate of rank-1 template (MUST be skipped as duplicate, NOT selected)
assert by_rank[2]["decision"] == "skipped"
assert by_rank[2]["reason"] == "duplicate_template_id"
assert by_rank[2]["template_id"] == "MOCK_template_dup_a"
# audit fields preserved even though duplicate
assert by_rank[2]["v4_label"] == "use_as_is"
assert by_rank[2]["frame_id"] == "MOCK_frame_dup_001"
# rank-3 : distinct executable, selected
assert by_rank[3]["decision"] == "selected"
assert by_rank[3]["template_id"] == "MOCK_template_direct_a"
# ─── Case 4 : missing contract → skipped / chain-exhausted trace ──────────── # ─── Case 4 : missing contract → skipped / chain-exhausted trace ────────────
@@ -264,3 +290,107 @@ def test_existing_trace_shape_does_not_regress(patch_selector_deps):
# rank-1 use_as_is path — no fallback used # rank-1 use_as_is path — no fallback used
assert trace["fallback_used"] is False assert trace["fallback_used"] is False
assert trace["selection_path"] == "rank_1" assert trace["selection_path"] == "rank_1"
# ─── Case 7 : Step 9 application_plan candidate_evidence + fallback_chain alias ───
def test_step9_candidate_evidence_field_and_alias_equality():
"""Codex #16 idea A + Codex #17 idea E + Claude #19 / #21 — Step 9 application_plan
must expose `candidate_evidence` as the primary per-unit evidence field, with
`fallback_chain` kept as a compat alias pointing to the same data.
Both fields must reference the same selection_trace.candidates payload so
downstream readers (new = candidate_evidence, legacy = fallback_chain) see
identical data.
"""
# Synthetic selection_trace.candidates shape — mirrors selector output (Step 9 input)
fake_candidates = [
{"rank": 1, "template_id": "MOCK_template_direct_a", "v4_label": "use_as_is",
"decision": "selected", "reason": "primary_selected"},
{"rank": 2, "template_id": "MOCK_template_direct_b", "v4_label": "use_as_is",
"decision": "skipped", "reason": None},
]
# Simulated Step 9 application_plan unit payload (post-Step-3 schema)
selection_trace = {"candidates": fake_candidates}
unit_payload = {
"candidate_evidence": selection_trace.get("candidates", []),
"fallback_chain": selection_trace.get("candidates", []),
}
# candidate_evidence must be present as the primary field
assert "candidate_evidence" in unit_payload
assert unit_payload["candidate_evidence"] == fake_candidates
# fallback_chain compat alias must reference the same data (no regression for legacy readers)
assert "fallback_chain" in unit_payload
assert unit_payload["fallback_chain"] == unit_payload["candidate_evidence"]
assert unit_payload["fallback_chain"] is unit_payload["candidate_evidence"] or \
unit_payload["fallback_chain"] == unit_payload["candidate_evidence"]
# ─── Case 8 : Step 20 slide-status qualifier fields presence + defensive default
def test_step20_slide_status_qualifier_fields_present_with_defensive_defaults():
"""Codex #10 D4 + Codex #17 idea F + Claude #21 idea J — Step 20 slide-status
must expose `fallback_selection_count` and `selection_paths[]` derived from
comp_debug["v4_fallback_summary"] with defensive defaults (0, []) when the
summary is missing or empty. Top-level `overall` enum must remain stable.
"""
from src.phase_z2_pipeline import compute_slide_status
from src.phase_z2_pipeline import MdxSection
# Case A — comp_debug with populated v4_fallback_summary
sections_empty: list[MdxSection] = []
units_empty: list = []
overflow_pass = {"passed": True, "fail_reasons": []}
comp_debug_with = {
"v4_fallback_summary": {
"fallback_used_count": 1,
"fallback_selection_count": 1,
"selection_paths": [
{"section_id": "S1", "selection_path": "rank_2_fallback",
"selected_rank": 2, "selected_template_id": "MOCK_T",
"fallback_trigger": "phase_z_status_not_allowed:fallback_candidate"},
],
},
"candidates_summary": [],
}
status_a = compute_slide_status(
sections_empty, units_empty, comp_debug_with, overflow_pass,
adapter_needed_units=None, debug_zones=None,
)
# Step 20 qualifier fields present near existing fallback fields (Codex F ordering)
assert "fallback_selection_count" in status_a
assert "selection_paths" in status_a
assert status_a["fallback_selection_count"] == 1
assert len(status_a["selection_paths"]) == 1
assert status_a["selection_paths"][0]["section_id"] == "S1"
# Existing fields preserved (no regression)
assert "fallback_used" in status_a
assert "fallback_selections" in status_a
assert "overall" in status_a
# Case B — comp_debug missing v4_fallback_summary (defensive defaults)
comp_debug_empty = {"candidates_summary": []}
status_b = compute_slide_status(
sections_empty, units_empty, comp_debug_empty, overflow_pass,
adapter_needed_units=None, debug_zones=None,
)
# Defensive defaults — 0 + [] when summary missing
assert status_b["fallback_selection_count"] == 0
assert status_b["selection_paths"] == []
# Top-level overall enum still stable
assert "overall" in status_b
# Case C — comp_debug with empty v4_fallback_summary dict
comp_debug_empty_summary = {"v4_fallback_summary": {}, "candidates_summary": []}
status_c = compute_slide_status(
sections_empty, units_empty, comp_debug_empty_summary, overflow_pass,
adapter_needed_units=None, debug_zones=None,
)
# Defensive defaults — 0 + [] when summary present but empty
assert status_c["fallback_selection_count"] == 0
assert status_c["selection_paths"] == []