Files
C.E.L_Slide_test2/tests/test_phase_z2_v4_fallback.py
kyeongmin 23d1b25144 test(IMP-05): tighten Step 9 candidate evidence guard
Refs #5

Replace the hand-built Case 7 payload assertion with a temporary
production-source guard. The test now fails if Step 9 stops emitting
candidate_evidence, breaks the fallback_chain compat alias, or removes
the alias intent comment.

This is intentionally temporary because Step 9 application-plan unit
assembly is inline. Follow-up IMP-32 should extract a helper and replace
this source-string guard with a direct helper test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 00:24:42 +09:00

383 lines
15 KiB
Python

"""IMP-05 V4 fallback selector behavior tests — fully synthetic per Codex #10 E1 + Claude #13.
Lock per round 65~73 + Claude #13 §3 L4' :
- 6 explicit behavior cases (Codex #10 E4)
- fully synthetic MOCK_ IDs (Codex #7 generalization guardrail + Codex #10 E1 naming)
- monkeypatch `get_contract` + `compute_capacity_fit` (Codex #10 E3 — selector has no DI)
- NO real catalog template_id / frame_id
- NO `v4_full32_result.yaml` dependency
Synthetic naming convention :
- `MOCK_` prefix mandatory
- `_a` / `_b` / `_c` suffixes = enumeration only (NOT ordering / priority)
- rank/order expressed by `v4_full_rank` field, NEVER by ID suffix
Real-catalog integrity is verified separately in `tests/test_catalog_invariant.py`.
"""
from __future__ import annotations
from typing import Optional
import pytest
import inspect
from src import phase_z2_pipeline
from src.phase_z2_pipeline import lookup_v4_match_with_fallback
# ─── Synthetic catalog stub ──────────────────────────────────────
# Tests control which synthetic templates are catalog-registered + capacity-OK.
_MOCK_CATALOG: dict[str, object] = {
"MOCK_template_direct_a": object(), # registered
"MOCK_template_direct_b": object(), # registered (used for dedup case)
"MOCK_template_reject_a": object(), # registered (but label=reject)
"MOCK_template_restructure_a": object(), # registered (but label=restructure)
# "MOCK_template_missing_contract" intentionally absent — get_contract returns None.
}
def _mock_get_contract(template_id: str):
    """Stand-in for the pipeline's contract lookup.

    Returns the sentinel registered under ``template_id`` in
    ``_MOCK_CATALOG``, or ``None`` when that id was never registered.
    """
    try:
        return _MOCK_CATALOG[template_id]
    except KeyError:
        # Unregistered ids behave like a missing catalog contract.
        return None
def _mock_capacity_fit_ok(template_id: str, raw_content: str) -> dict:
"""Synthetic capacity precheck — always OK."""
return {"fit_status": "ok"}
@pytest.fixture
def patch_selector_deps(monkeypatch):
    """Swap the selector's module-level collaborators for synthetic stubs.

    `lookup_v4_match_with_fallback` has no dependency-injection hook
    (Codex #10 E3 + Claude #12 verification), so `get_contract` and
    `compute_capacity_fit` are replaced on the pipeline module itself.
    pytest's `monkeypatch` restores the originals when the test finishes.
    """
    stubbed_targets = {
        "src.phase_z2_pipeline.get_contract": _mock_get_contract,
        "src.phase_z2_pipeline.compute_capacity_fit": _mock_capacity_fit_ok,
    }
    for dotted_path, stub in stubbed_targets.items():
        monkeypatch.setattr(dotted_path, stub)
def _make_v4(judgments: list[dict], section_id: str = "S1") -> dict:
"""Wrap synthetic judgments into V4 input shape."""
return {"mdx_sections": {section_id: {"judgments_full32": judgments}}}
def _j(rank: int, template_id: str, frame_id: str, label: str,
confidence: float = 0.9) -> dict:
"""Synthetic V4 judgment record — shape matches real V4 evidence shape."""
return {
"frame_id": frame_id,
"frame_number": rank,
"template_id": template_id,
"confidence": confidence,
"label": label,
"v4_full_rank": rank,
}
# ─── Case 1 : rank-1 direct eligible retention (no fallback used) ───────────
def test_rank_1_direct_eligible_is_retained(patch_selector_deps):
    """Case 1 (Codex #10 E4) — an eligible rank-1 candidate is kept.

    Rank-1 is `use_as_is` and catalog-registered, so the selector must
    return it and record that no fallback happened.
    """
    judgments = [
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
        _j(2, "MOCK_template_direct_b", "MOCK_frame_002", "use_as_is"),
    ]
    selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # The returned match is the rank-1 entry itself.
    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"
    assert selected.v4_rank == 1
    assert selected.selection_path == "rank_1"

    # The trace agrees with the match and reports no fallback.
    assert selection_trace["fallback_used"] is False
    for trace_key, expected in (("selection_path", "rank_1"), ("selected_rank", 1)):
        assert selection_trace[trace_key] == expected
# ─── Case 2 : rank-1 non-direct → rank-2/3 direct selected (fallback used) ───
def test_rank_1_non_direct_promotes_rank_2(patch_selector_deps):
    """Case 2 (Codex #10 E4) — a rejected rank-1 hands selection to rank-2.

    Rank-1 carries label `reject`; rank-2 is `use_as_is` and registered, so
    rank-2 must be selected through the fallback path.
    """
    judgments = [
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"),
    ]
    selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"
    assert selected.v4_rank == 2
    assert selected.selection_path == "rank_2_fallback"

    assert selection_trace["fallback_used"] is True
    assert selection_trace["selected_rank"] == 2
    # The recorded reason must name why rank-1 was passed over.
    recorded_reason = selection_trace["fallback_reason"]
    assert "phase_z_status_not_allowed" in recorded_reason
# ─── Case 3 : duplicate template_id is skipped / deduped ────────────────────
def test_duplicate_template_id_is_skipped_rank_3_wins(patch_selector_deps):
    """Case 3 (Codex #14 dedup precision lock) — repeated template ids are skipped.

    The first occurrence of a template_id reserves it for the whole chain,
    whatever decision that occurrence received. Any later rank carrying the
    same template_id must be skipped as a duplicate, regardless of its label.

    The fixture simulates a V4 anomaly: rank-1 and rank-2 share one
    template_id (and one frame_id, consistent with the 1:1 catalog
    terminology of Codex #6).

        rank 1: A reject     -> skipped (non-direct); template A is claimed
        rank 2: A use_as_is  -> skipped as duplicate_template_id (must NOT win)
        rank 3: B use_as_is  -> selected (distinct template, eligible)
    """
    judgments = [
        # rank-1: non-direct (reject); first occurrence reserves the id.
        _j(1, "MOCK_template_dup_a", "MOCK_frame_dup_001", "reject"),
        # rank-2: same template_id and frame_id; executable by label, but it
        # must be skipped as a duplicate.
        _j(2, "MOCK_template_dup_a", "MOCK_frame_dup_001", "use_as_is"),
        # rank-3: distinct executable template; expected winner.
        _j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"),
    ]
    selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # Rank-3 wins once both occurrences of the shared id are ruled out.
    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"
    assert selected.v4_rank == 3
    assert selected.selection_path == "rank_3_fallback"
    assert selection_trace["fallback_used"] is True
    assert selection_trace["selected_rank"] == 3

    # Every candidate must survive in the trace, indexed here by rank.
    entries_by_rank = {}
    for entry in selection_trace["candidates"]:
        entries_by_rank[entry["rank"]] = entry
    assert set(entries_by_rank) == {1, 2, 3}
    first, second, third = (entries_by_rank[rank] for rank in (1, 2, 3))

    # rank-1: non-direct first occurrence keeps its status-based reason.
    assert first["decision"] == "skipped"
    assert first["reason"] == "phase_z_status_not_allowed:fallback_candidate"
    assert first["template_id"] == "MOCK_template_dup_a"
    assert first["v4_label"] == "reject"

    # rank-2: skipped as a duplicate of rank-1's template, never selected.
    assert second["decision"] == "skipped"
    assert second["reason"] == "duplicate_template_id"
    assert second["template_id"] == "MOCK_template_dup_a"
    # Audit fields stay populated even on a duplicate entry.
    assert second["v4_label"] == "use_as_is"
    assert second["frame_id"] == "MOCK_frame_dup_001"

    # rank-3: the distinct executable template is the selected one.
    assert third["decision"] == "selected"
    assert third["template_id"] == "MOCK_template_direct_a"
# ─── Case 4 : missing contract → skipped / chain-exhausted trace ────────────
def test_missing_contract_yields_chain_exhausted_trace(patch_selector_deps):
    """Case 4 (Codex #10 E4) — no catalog contract anywhere exhausts the chain.

    The only candidate uses a template id absent from the mock catalog, so
    no match may be returned and the trace must say why.
    """
    judgments = [
        _j(1, "MOCK_template_missing_contract", "MOCK_frame_001", "use_as_is"),
    ]
    selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    assert selected is None
    assert selection_trace["selection_path"] == "chain_exhausted"
    # At least one candidate must carry the missing-contract reason.
    recorded_reasons = [
        entry.get("reason") for entry in selection_trace["candidates"]
    ]
    assert "skipped_no_contract" in recorded_reasons
# ─── Case 5 : restructure / reject preserved as non-direct candidate evidence
def test_restructure_reject_preserved_as_non_direct_evidence(patch_selector_deps):
    """Case 5 (Codex #10 E4, Codex #2, Claude #11 L5) — non-direct candidates stay visible.

    `reject` and `restructure` candidates must remain in the trace with
    their route hints instead of being dropped once a direct candidate wins.
    """
    judgments = [
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_restructure_a", "MOCK_frame_002", "restructure"),
        _j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"),
    ]
    selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"

    # All three ranks must be present with the informative (L2) fields.
    entries_by_rank = {
        entry["rank"]: entry for entry in selection_trace["candidates"]
    }
    assert set(entries_by_rank) == {1, 2, 3}

    # rank -> (v4_label, filtered_for_direct_execution, route_hint)
    expectations = {
        1: ("reject", True, "design_reference_only"),
        2: ("restructure", True, "ai_adaptation_required"),
        3: ("use_as_is", False, "direct_render"),
    }
    for rank, (label, filtered, hint) in expectations.items():
        entry = entries_by_rank[rank]
        assert entry["v4_label"] == label
        # Identity check: the flag must be a real bool, not merely truthy.
        assert entry["filtered_for_direct_execution"] is filtered
        assert entry["route_hint"] == hint
# ─── Case 6 : additive fields do not regress existing trace shape ───────────
def test_existing_trace_shape_does_not_regress(patch_selector_deps):
    """Case 6 (Codex #10 E4, Claude #11 L9) — additive fields keep the old shape.

    The pre-existing top-level and per-candidate trace keys must still be
    present next to the additive L2 fields, so existing trace consumers keep
    working.
    """
    judgments = [
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
    ]
    # Only the trace is inspected here; the match itself is covered by Case 1.
    _selected, selection_trace = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # Pre-existing top-level trace keys.
    legacy_top_keys = {
        "section_id", "max_rank", "selection_path", "selected_rank",
        "selected_template_id", "selected_frame_id", "selected_label",
        "fallback_used", "fallback_reason", "candidates",
    }
    assert not (legacy_top_keys - set(selection_trace.keys()))

    # Pre-existing per-candidate keys.
    first_candidate = selection_trace["candidates"][0]
    legacy_candidate_keys = {
        "rank", "template_id", "frame_id", "frame_number", "confidence",
        "label", "phase_z_status", "catalog_registered", "decision", "reason",
    }
    assert not (legacy_candidate_keys - set(first_candidate.keys()))

    # Additive L2 fields: v4_label mirrors label; the other two must exist.
    assert first_candidate["v4_label"] == first_candidate["label"]
    for additive_key in ("filtered_for_direct_execution", "route_hint"):
        assert additive_key in first_candidate

    # A rank-1 use_as_is selection involves no fallback.
    assert selection_trace["fallback_used"] is False
    assert selection_trace["selection_path"] == "rank_1"
# ─── Case 7 : Step 9 production-source guard (Codex #20 blocker fix) ───
def test_step9_production_emits_candidate_evidence_and_alias():
    """Case 7 — temporary source-text guard for the Step 9 evidence fields.

    Step 9 application-plan unit assembly is currently inline, so this test
    searches the pipeline module's source for the exact assignments. It
    requires the `candidate_evidence` assignment, the `fallback_chain` compat
    alias after it, and the alias intent comment. Replace this with a direct
    helper-call test once IMP-32 extracts a helper.
    """
    pipeline_source = inspect.getsource(phase_z2_pipeline)
    evidence_assignment = '"candidate_evidence": selection_trace.get("candidates", [])'
    alias_assignment = '"fallback_chain": selection_trace.get("candidates", [])'

    # str.find returns -1 when the text is absent, else its first offset.
    evidence_offset = pipeline_source.find(evidence_assignment)
    alias_offset = pipeline_source.find(alias_assignment)

    assert evidence_offset != -1
    assert alias_offset != -1
    # The canonical field must be emitted before its compat alias.
    assert evidence_offset < alias_offset
    assert "compat alias; prefer candidate_evidence" in pipeline_source
# ─── Case 8 : Step 20 slide-status qualifier fields presence + defensive default
def test_step20_slide_status_qualifier_fields_present_with_defensive_defaults():
    """Case 8 (Codex #10 D4, Codex #17 idea F, Claude #21 idea J) — Step 20 qualifiers.

    `compute_slide_status` must expose `fallback_selection_count` and
    `selection_paths`, matching the values supplied in
    `comp_debug["v4_fallback_summary"]`. When that summary is missing or
    empty they must default to 0 and []. The `overall` key must stay present.
    """
    from src.phase_z2_pipeline import compute_slide_status
    from src.phase_z2_pipeline import MdxSection

    # Inputs shared by all three scenarios.
    no_sections: list[MdxSection] = []
    no_units: list = []
    overflow_ok = {"passed": True, "fail_reasons": []}

    def status_for(comp_debug: dict):
        """Run compute_slide_status on the shared inputs with one comp_debug."""
        return compute_slide_status(
            no_sections, no_units, comp_debug, overflow_ok,
            adapter_needed_units=None, debug_zones=None,
        )

    # Scenario A: the summary is populated.
    populated_debug = {
        "v4_fallback_summary": {
            "fallback_used_count": 1,
            "fallback_selection_count": 1,
            "selection_paths": [
                {
                    "section_id": "S1",
                    "selection_path": "rank_2_fallback",
                    "selected_rank": 2,
                    "selected_template_id": "MOCK_T",
                    "fallback_trigger": "phase_z_status_not_allowed:fallback_candidate",
                },
            ],
        },
        "candidates_summary": [],
    }
    status_populated = status_for(populated_debug)
    # Qualifier fields exist and carry the summary's values.
    for qualifier_key in ("fallback_selection_count", "selection_paths"):
        assert qualifier_key in status_populated
    assert status_populated["fallback_selection_count"] == 1
    reported_paths = status_populated["selection_paths"]
    assert len(reported_paths) == 1
    assert reported_paths[0]["section_id"] == "S1"
    # Pre-existing fields are still there (no regression).
    for legacy_key in ("fallback_used", "fallback_selections", "overall"):
        assert legacy_key in status_populated

    # Scenario B: the summary key is absent.
    status_missing = status_for({"candidates_summary": []})
    assert status_missing["fallback_selection_count"] == 0
    assert status_missing["selection_paths"] == []
    # The top-level overall key is still present.
    assert "overall" in status_missing

    # Scenario C: the summary key is present but empty.
    status_blank = status_for({"v4_fallback_summary": {}, "candidates_summary": []})
    assert status_blank["fallback_selection_count"] == 0
    assert status_blank["selection_paths"] == []