Files
C.E.L_Slide_test2/tests/test_phase_z2_v4_fallback.py
kyeongmin 15c5b9ae00 IMP-05 deterministic V4 candidate bridge — pre-render rank-2/3 fallback + trace schema + dedup invariant test
round 55~73 review-loop lock per Codex #11 final + Claude #13 6-axis L1~L9.

Scope (deterministic only) :
- pre-render rank-2/3 fallback via lookup_v4_match_with_fallback (selector only,
  no calculate_fit migration, no AI, no full planner rerun, no layout topology change,
  no abort behavior change)
- Step 9 informative candidate_evidence schema (additive) — v4_label / phase_z_status
  / catalog_registered / filtered_for_direct_execution / route_hint / decision / reason
- Step 20 qualifier fields (additive) — fallback_used / fallback_selection_count
  / selection_paths[] — top-level enum unchanged
- restructure / reject candidates preserved as non-direct evidence with route hints
  (design_reference_only / ai_adaptation_required) — deferred actual handlers IMP-29/IMP-31
- catalog 1:1 invariant test (separate file tests/test_catalog_invariant.py) —
  fails fast if template_id/frame_id 1:1 mapping ever breaks
- 6 behavior tests fully synthetic with MOCK_ prefix (no real catalog IDs,
  no v4_full32_result.yaml dependency) — monkeypatch get_contract +
  compute_capacity_fit (selector has no DI, function signature unchanged)

Deferred to follow-up issues :
- IMP-30 first-render invariant + abort bypass (zero-unit + section status filter)
- IMP-29 frontend zone-level override (deterministic only)
- IMP-31 AI-assisted frame-aware adaptation

Guardrails locked : no calculate_fit / no AI / no frontend / no full rerun /
no layout topology / no abort behavior change / no 1-2 sample hardcoding.

Tests : 8/8 pass (6 selector behavior + 2 catalog invariant).
Smoke regression : 11/11 partials pass (IMP-04 F17 calibration intact).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 23:06:39 +09:00

267 lines
10 KiB
Python

"""IMP-05 V4 fallback selector behavior tests — fully synthetic per Codex #10 E1 + Claude #13.
Lock per round 65~73 + Claude #13 §3 L4' :
- 6 explicit behavior cases (Codex #10 E4)
- fully synthetic MOCK_ IDs (Codex #7 generalization guardrail + Codex #10 E1 naming)
- monkeypatch `get_contract` + `compute_capacity_fit` (Codex #10 E3 — selector has no DI)
- NO real catalog template_id / frame_id
- NO `v4_full32_result.yaml` dependency
Synthetic naming convention :
- `MOCK_` prefix mandatory
- `_a` / `_b` / `_c` suffixes = enumeration only (NOT ordering / priority)
- rank/order expressed by `v4_full_rank` field, NEVER by ID suffix
Real-catalog integrity is verified separately in `tests/test_catalog_invariant.py`.
"""
from __future__ import annotations
from typing import Optional
import pytest
from src.phase_z2_pipeline import lookup_v4_match_with_fallback
# ─── Synthetic catalog stub ──────────────────────────────────────
# Tests control which synthetic templates are catalog-registered + capacity-OK.
_MOCK_CATALOG: dict[str, object] = {
"MOCK_template_direct_a": object(), # registered
"MOCK_template_direct_b": object(), # registered (used for dedup case)
"MOCK_template_reject_a": object(), # registered (but label=reject)
"MOCK_template_restructure_a": object(), # registered (but label=restructure)
# "MOCK_template_missing_contract" intentionally absent — get_contract returns None.
}
def _mock_get_contract(template_id: str):
    """Stub contract lookup backed by the synthetic catalog.

    Returns the catalog entry stored under ``template_id``, or ``None`` when
    the id is not present in ``_MOCK_CATALOG``.
    """
    try:
        return _MOCK_CATALOG[template_id]
    except KeyError:
        # Unregistered template: signal "no contract" with None.
        return None
def _mock_capacity_fit_ok(template_id: str, raw_content: str) -> dict:
"""Synthetic capacity precheck — always OK."""
return {"fit_status": "ok"}
@pytest.fixture
def patch_selector_deps(monkeypatch):
    """Swap the selector's module-level collaborators for synthetic stubs.

    Codex #10 E3 + Claude #12 verification — `lookup_v4_match_with_fallback`
    has no dependency injection, so the `get_contract` and
    `compute_capacity_fit` names in `src.phase_z2_pipeline` are replaced via
    monkeypatch for the duration of the requesting test.
    """
    replacements = (
        ("src.phase_z2_pipeline.get_contract", _mock_get_contract),
        ("src.phase_z2_pipeline.compute_capacity_fit", _mock_capacity_fit_ok),
    )
    for dotted_target, stub in replacements:
        monkeypatch.setattr(dotted_target, stub)
def _make_v4(judgments: list[dict], section_id: str = "S1") -> dict:
"""Wrap synthetic judgments into V4 input shape."""
return {"mdx_sections": {section_id: {"judgments_full32": judgments}}}
def _j(rank: int, template_id: str, frame_id: str, label: str,
confidence: float = 0.9) -> dict:
"""Synthetic V4 judgment record — shape matches real V4 evidence shape."""
return {
"frame_id": frame_id,
"frame_number": rank,
"template_id": template_id,
"confidence": confidence,
"label": label,
"v4_full_rank": rank,
}
# ─── Case 1 : rank-1 direct eligible retention (no fallback used) ───────────
def test_rank_1_direct_eligible_is_retained(patch_selector_deps):
    """Codex #10 E4 case 1 — registered rank-1 `use_as_is` is kept, no fallback.

    Rank-2 is also a registered `use_as_is` candidate, so the test pins that
    the selector stays on rank-1 rather than moving down the chain.
    """
    judgments = [
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
        _j(2, "MOCK_template_direct_b", "MOCK_frame_002", "use_as_is"),
    ]
    selected, audit = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # The returned match is the rank-1 candidate.
    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"
    assert selected.v4_rank == 1
    assert selected.selection_path == "rank_1"

    # The trace reports the same outcome and no fallback.
    assert audit["fallback_used"] is False
    assert audit["selection_path"] == "rank_1"
    assert audit["selected_rank"] == 1
# ─── Case 2 : rank-1 non-direct → rank-2/3 direct selected (fallback used) ───
def test_rank_1_non_direct_promotes_rank_2(patch_selector_deps):
    """Codex #10 E4 case 2 — rank-1 `reject` is passed over for rank-2 `use_as_is`.

    The match must be the rank-2 candidate, and the trace must flag the
    fallback together with the reason rank-1 was not used.
    """
    judgments = [
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"),
    ]
    selected, audit = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # The returned match is the promoted rank-2 candidate.
    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"
    assert selected.v4_rank == 2
    assert selected.selection_path == "rank_2_fallback"

    # The trace records that a fallback happened and why.
    assert audit["fallback_used"] is True
    assert audit["selected_rank"] == 2
    assert "phase_z_status_not_allowed" in audit["fallback_reason"]
# ─── Case 3 : duplicate template_id is skipped / deduped ────────────────────
def test_duplicate_template_id_is_skipped_or_deduped(patch_selector_deps):
    """Codex #10 E4 case 3 + Claude #13 L4 dedup — duplicate template_id across ranks.

    Synthetic V4 anomaly: rank-1 and rank-2 carry the same template_id and
    frame_id but different labels (`reject` vs `use_as_is`); rank-3 is a
    distinct registered `use_as_is` candidate.

    Until an explicit dedup guard lands, two selector outcomes are accepted :
    - the rank-2 duplicate is skipped and rank-3 is selected, OR
    - the rank-2 duplicate is selected.

    What is NOT accepted, and what this test pins :
    - rank-1 (`reject`) being returned as the match,
    - a match whose rank / template disagrees with the input or the trace,
    - rank-1 or rank-2 missing from the candidate trace (no silent handling).
    """
    v4 = _make_v4([
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        # rank-2 has same template_id as rank-1 (synthetic V4 anomaly)
        _j(2, "MOCK_template_reject_a", "MOCK_frame_001", "use_as_is"),
        _j(3, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"),
    ])
    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    # Either the duplicate is skipped (then rank-3 wins) or duplicate is selected.
    assert match is not None

    # rank-1 is `reject`, so the winner must come from the fallback chain and
    # must be the template that the input actually placed at that rank.
    template_by_rank = {
        2: "MOCK_template_reject_a",
        3: "MOCK_template_direct_a",
    }
    assert match.v4_rank in template_by_rank
    assert match.template_id == template_by_rank[match.v4_rank]

    # The trace must agree with the returned match and flag the fallback.
    assert trace["fallback_used"] is True
    assert trace["selected_rank"] == match.v4_rank

    # In both accepted outcomes the candidate trace must surface rank-1 AND
    # rank-2 exactly once each, so the duplication stays auditable downstream.
    candidates = trace["candidates"]
    rank_1_entries = [c for c in candidates if c["rank"] == 1]
    rank_2_entries = [c for c in candidates if c["rank"] == 2]
    assert len(rank_1_entries) == 1, "rank-1 must appear in candidate trace"
    assert len(rank_2_entries) == 1, "rank-2 must appear in candidate trace"
    # If dedup guard is added, rank-2 must be skipped with duplicate reason;
    # tighten the rank check above to `== 3` at that point.
# ─── Case 4 : missing contract → skipped / chain-exhausted trace ────────────
def test_missing_contract_yields_chain_exhausted_trace(patch_selector_deps):
    """Codex #10 E4 case 4 — no candidate has a catalog contract → chain exhausted.

    The only judgment points at a template id absent from the synthetic
    catalog, so no match may be returned and the trace must say why.
    """
    judgments = [
        _j(1, "MOCK_template_missing_contract", "MOCK_frame_001", "use_as_is"),
    ]
    selected, audit = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    assert selected is None
    assert audit["selection_path"] == "chain_exhausted"

    # At least one candidate entry must carry the missing-contract skip reason.
    recorded_reasons = [entry.get("reason") for entry in audit["candidates"]]
    assert "skipped_no_contract" in recorded_reasons
# ─── Case 5 : restructure / reject preserved as non-direct candidate evidence
def test_restructure_reject_preserved_as_non_direct_evidence(patch_selector_deps):
    """Codex #10 E4 case 5 + Codex #2 conceptual + Claude #11 L5 — non-direct
    candidates stay visible in the candidate trace.

    `reject` and `restructure` candidates must be recorded with their route
    hints instead of being dropped, while the rank-3 `use_as_is` candidate is
    the one actually selected.
    """
    judgments = [
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_restructure_a", "MOCK_frame_002", "restructure"),
        _j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"),
    ]
    selected, audit = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    assert selected is not None
    assert selected.template_id == "MOCK_template_direct_a"

    # rank -> (v4_label, filtered_for_direct_execution, route_hint)
    expected_by_rank = {
        1: ("reject", True, "design_reference_only"),
        2: ("restructure", True, "ai_adaptation_required"),
        3: ("use_as_is", False, "direct_render"),
    }

    # All 3 must appear with informative schema (L2 fields).
    by_rank = {entry["rank"]: entry for entry in audit["candidates"]}
    assert set(by_rank) == set(expected_by_rank)

    for rank, (v4_label, filtered, route_hint) in expected_by_rank.items():
        entry = by_rank[rank]
        assert entry["v4_label"] == v4_label
        assert entry["filtered_for_direct_execution"] is filtered
        assert entry["route_hint"] == route_hint
# ─── Case 6 : additive fields do not regress existing trace shape ───────────
def test_existing_trace_shape_does_not_regress(patch_selector_deps):
    """Codex #10 E4 case 6 + Claude #11 L9 — additive fields keep the old shape.

    The pre-existing top-level and per-candidate trace keys must still be
    present next to the additive L2 fields (`v4_label`,
    `filtered_for_direct_execution`, `route_hint`).
    """
    judgments = [
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
    ]
    # Only the trace is inspected here; the returned match is not needed.
    _, audit = lookup_v4_match_with_fallback(
        _make_v4(judgments), "S1", raw_content="- a\n- b\n- c\n"
    )

    # Existing top-level trace fields preserved.
    required_top_fields = {
        "section_id", "max_rank", "selection_path", "selected_rank",
        "selected_template_id", "selected_frame_id", "selected_label",
        "fallback_used", "fallback_reason", "candidates",
    }
    missing_top_fields = required_top_fields - set(audit.keys())
    assert not missing_top_fields

    # Existing candidate fields preserved.
    first_candidate = audit["candidates"][0]
    required_candidate_fields = {
        "rank", "template_id", "frame_id", "frame_number", "confidence",
        "label", "phase_z_status", "catalog_registered", "decision", "reason",
    }
    missing_candidate_fields = required_candidate_fields - set(first_candidate.keys())
    assert not missing_candidate_fields

    # New L2 additive fields present; `v4_label` must equal `label`.
    assert first_candidate["v4_label"] == first_candidate["label"]
    assert "filtered_for_direct_execution" in first_candidate
    assert "route_hint" in first_candidate

    # rank-1 use_as_is path — no fallback used.
    assert audit["fallback_used"] is False
    assert audit["selection_path"] == "rank_1"