round 55~73 review-loop lock per Codex #11 final + Claude #13 6-axis L1~L9. Scope (deterministic only) : - pre-render rank-2/3 fallback via lookup_v4_match_with_fallback (selector only, no calculate_fit migration, no AI, no full planner rerun, no layout topology change, no abort behavior change) - Step 9 informative candidate_evidence schema (additive) — v4_label / phase_z_status / catalog_registered / filtered_for_direct_execution / route_hint / decision / reason - Step 20 qualifier fields (additive) — fallback_used / fallback_selection_count / selection_paths[] — top-level enum unchanged - restructure / reject candidates preserved as non-direct evidence with route hints (design_reference_only / ai_adaptation_required) — deferred actual handlers IMP-29/IMP-31 - catalog 1:1 invariant test (separate file tests/test_catalog_invariant.py) — fails fast if template_id/frame_id 1:1 mapping ever breaks - 6 behavior tests fully synthetic with MOCK_ prefix (no real catalog IDs, no v4_full32_result.yaml dependency) — monkeypatch get_contract + compute_capacity_fit (selector has no DI, function signature unchanged) Deferred to follow-up issues : - IMP-30 first-render invariant + abort bypass (zero-unit + section status filter) - IMP-29 frontend zone-level override (deterministic only) - IMP-31 AI-assisted frame-aware adaptation Guardrails locked : no calculate_fit / no AI / no frontend / no full rerun / no layout topology / no abort behavior change / no 1-2 sample hardcoding. Tests : 8/8 pass (6 selector behavior + 2 catalog invariant). Smoke regression : 11/11 partials pass (IMP-04 F17 calibration intact). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
267 lines
10 KiB
Python
267 lines
10 KiB
Python
"""IMP-05 V4 fallback selector behavior tests — fully synthetic per Codex #10 E1 + Claude #13.
|
|
|
|
Lock per round 65~73 + Claude #13 §3 L4' :
|
|
- 6 explicit behavior cases (Codex #10 E4)
|
|
- fully synthetic MOCK_ IDs (Codex #7 generalization guardrail + Codex #10 E1 naming)
|
|
- monkeypatch `get_contract` + `compute_capacity_fit` (Codex #10 E3 — selector has no DI)
|
|
- NO real catalog template_id / frame_id
|
|
- NO `v4_full32_result.yaml` dependency
|
|
|
|
Synthetic naming convention :
|
|
- `MOCK_` prefix mandatory
|
|
- `_a` / `_b` / `_c` suffixes = enumeration only (NOT ordering / priority)
|
|
- rank/order expressed by `v4_full_rank` field, NEVER by ID suffix
|
|
|
|
Real-catalog integrity is verified separately in `tests/test_catalog_invariant.py`.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Optional
|
|
|
|
import pytest
|
|
|
|
from src.phase_z2_pipeline import lookup_v4_match_with_fallback
|
|
|
|
|
|
# ─── Synthetic catalog stub ──────────────────────────────────────
|
|
# Tests control which synthetic templates are catalog-registered + capacity-OK.
|
|
|
|
# Synthetic templates that the stubbed contract lookup reports as registered.
# Values are opaque placeholders: the stub only hands them back, so their
# content is irrelevant to the tests.
_MOCK_CATALOG: dict[str, object] = {
    "MOCK_template_direct_a": object(),  # registered
    "MOCK_template_direct_b": object(),  # registered (rank-2 candidate in case 1)
    "MOCK_template_reject_a": object(),  # registered (but label=reject)
    "MOCK_template_restructure_a": object(),  # registered (but label=restructure)
    # "MOCK_template_missing_contract" intentionally absent — get_contract returns None.
}
|
|
|
|
|
|
def _mock_get_contract(template_id: str) -> Optional[object]:
    """Synthetic contract lookup.

    Returns the placeholder stored in `_MOCK_CATALOG` for `template_id`, or
    `None` when the template is not listed there.
    """
    return _MOCK_CATALOG.get(template_id)
|
|
|
|
|
|
def _mock_capacity_fit_ok(template_id: str, raw_content: str) -> dict:
    """Synthetic capacity precheck — always OK.

    Both arguments are accepted only so the stub is call-compatible with the
    function it replaces; neither is read. A fresh dict is returned on every
    call.
    """
    return {"fit_status": "ok"}
|
|
|
|
|
|
@pytest.fixture
def patch_selector_deps(monkeypatch):
    """Monkeypatch module-level dependencies of `lookup_v4_match_with_fallback`.

    Codex #10 E3 + Claude #12 verification — selector has no DI; module-level
    `get_contract` / `compute_capacity_fit` must be monkeypatched.

    The attributes are replaced on `src.phase_z2_pipeline` (addressed by dotted
    path) with `_mock_get_contract` and `_mock_capacity_fit_ok` respectively.
    The fixture yields no value; tests request it purely for the side effect.
    """
    monkeypatch.setattr(
        "src.phase_z2_pipeline.get_contract", _mock_get_contract
    )
    monkeypatch.setattr(
        "src.phase_z2_pipeline.compute_capacity_fit", _mock_capacity_fit_ok
    )
|
|
|
|
|
|
def _make_v4(judgments: list[dict], section_id: str = "S1") -> dict:
    """Wrap synthetic judgments into V4 input shape.

    The result nests `judgments` (by reference, not copied) under
    `mdx_sections -> <section_id> -> judgments_full32`.
    """
    return {"mdx_sections": {section_id: {"judgments_full32": judgments}}}
|
|
|
|
|
|
def _j(rank: int, template_id: str, frame_id: str, label: str,
       confidence: float = 0.9) -> dict:
    """Synthetic V4 judgment record — shape matches real V4 evidence shape.

    `rank` is written to both `frame_number` and `v4_full_rank`, so a
    synthetic record's frame number always equals its rank.
    """
    return {
        "frame_id": frame_id,
        "frame_number": rank,
        "template_id": template_id,
        "confidence": confidence,
        "label": label,
        "v4_full_rank": rank,
    }
|
|
|
|
|
|
# ─── Case 1 : rank-1 direct eligible retention (no fallback used) ───────────
|
|
|
|
|
|
def test_rank_1_direct_eligible_is_retained(patch_selector_deps):
    """Codex #10 E4 case 1 — rank-1 use_as_is + registered → keep rank-1, no fallback."""
    v4 = _make_v4([
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
        # A second eligible candidate is present so that retaining rank-1 is a
        # real choice rather than the only option.
        _j(2, "MOCK_template_direct_b", "MOCK_frame_002", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    # Returned match describes rank-1.
    assert match is not None
    assert match.template_id == "MOCK_template_direct_a"
    assert match.v4_rank == 1
    assert match.selection_path == "rank_1"

    # Trace agrees with the match and reports that no fallback happened.
    assert trace["fallback_used"] is False
    assert trace["selection_path"] == "rank_1"
    assert trace["selected_rank"] == 1
|
|
|
|
|
|
# ─── Case 2 : rank-1 non-direct → rank-2/3 direct selected (fallback used) ───
|
|
|
|
|
|
def test_rank_1_non_direct_promotes_rank_2(patch_selector_deps):
    """Codex #10 E4 case 2 — rank-1 reject + rank-2 use_as_is → promote rank-2."""
    v4 = _make_v4([
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    # Returned match describes the promoted rank-2 candidate.
    assert match is not None
    assert match.template_id == "MOCK_template_direct_a"
    assert match.v4_rank == 2
    assert match.selection_path == "rank_2_fallback"

    # Trace records that a fallback happened and why rank-1 was passed over.
    assert trace["fallback_used"] is True
    assert trace["selected_rank"] == 2
    assert "phase_z_status_not_allowed" in trace["fallback_reason"]
|
|
|
|
|
|
# ─── Case 3 : duplicate template_id is skipped / deduped ────────────────────
|
|
|
|
|
|
def test_duplicate_template_id_is_skipped_or_deduped(patch_selector_deps):
    """Codex #10 E4 case 3 + Claude #13 L4 dedup — duplicate template appearing
    at multiple ranks must not be evaluated twice as separate fallback candidates.

    Current selector traverses rank 1..max_rank linearly. If rank-1 is skipped
    (e.g. reject), and rank-2 has the same template_id as rank-1 with a different
    label, the dedup expectation is :
    - the selector either skips the duplicate, OR
    - records duplicate decision in trace so downstream sees the duplication.

    Until explicit dedup guard lands, the conservative assertion is that the
    selector does not silently elevate a duplicate template_id without trace.
    Concretely: both rank-1 and rank-2 must be present in the candidate trace,
    and whichever candidate is returned must be one of the two acceptable
    outcomes and must agree with what the trace reports as selected.
    """
    v4 = _make_v4([
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        # rank-2 has same template_id as rank-1 (synthetic V4 anomaly)
        _j(2, "MOCK_template_reject_a", "MOCK_frame_001", "use_as_is"),
        _j(3, "MOCK_template_direct_a", "MOCK_frame_002", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    # Either the duplicate is skipped (then rank-3 wins) or duplicate is selected.
    # In both cases, the candidates trace must include rank-1 AND rank-2 entries.
    assert match is not None
    candidates = trace["candidates"]
    rank_1_entries = [c for c in candidates if c["rank"] == 1]
    rank_2_entries = [c for c in candidates if c["rank"] == 2]
    assert len(rank_1_entries) == 1, "rank-1 must appear in candidate trace"
    assert len(rank_2_entries) == 1, "rank-2 must appear in candidate trace"

    # The selection itself must be one of the two documented outcomes and must
    # not diverge from the trace — otherwise a duplicate could be elevated
    # without the trace accounting for it.
    acceptable_outcomes = {
        2: "MOCK_template_reject_a",  # duplicate selected (no dedup guard)
        3: "MOCK_template_direct_a",  # duplicate skipped (dedup guard present)
    }
    assert match.v4_rank in acceptable_outcomes, "rank-1 reject must never be selected"
    assert match.template_id == acceptable_outcomes[match.v4_rank]
    assert trace["selected_rank"] == match.v4_rank
    assert trace["fallback_used"] is True

    # If dedup guard is added, rank-2 must be skipped with duplicate reason.
    # Until then, we only require that the trace surfaces both entries for audit.
|
|
|
|
|
|
# ─── Case 4 : missing contract → skipped / chain-exhausted trace ────────────
|
|
|
|
|
|
def test_missing_contract_yields_chain_exhausted_trace(patch_selector_deps):
    """Codex #10 E4 case 4 — the only candidate has no catalog contract → chain exhausted.

    `MOCK_template_missing_contract` is deliberately absent from `_MOCK_CATALOG`,
    so the stubbed `get_contract` returns `None` for it.
    """
    v4 = _make_v4([
        _j(1, "MOCK_template_missing_contract", "MOCK_frame_001", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    assert match is None
    assert trace["selection_path"] == "chain_exhausted"

    # The skipped candidate must still be visible in the trace with its reason.
    candidates = trace["candidates"]
    assert any(c.get("reason") == "skipped_no_contract" for c in candidates)
|
|
|
|
|
|
# ─── Case 5 : restructure / reject preserved as non-direct candidate evidence
|
|
|
|
|
|
def test_restructure_reject_preserved_as_non_direct_evidence(patch_selector_deps):
    """Codex #10 E4 case 5 + Codex #2 conceptual + Claude #11 L5 — restructure / reject
    candidates must remain visible in candidate_evidence with route hints,
    not silently discarded.
    """
    v4 = _make_v4([
        _j(1, "MOCK_template_reject_a", "MOCK_frame_001", "reject"),
        _j(2, "MOCK_template_restructure_a", "MOCK_frame_002", "restructure"),
        _j(3, "MOCK_template_direct_a", "MOCK_frame_003", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    assert match is not None
    assert match.template_id == "MOCK_template_direct_a"

    candidates = trace["candidates"]
    # All 3 must appear with informative schema (L2 fields)
    by_rank = {c["rank"]: c for c in candidates}
    assert set(by_rank.keys()) == {1, 2, 3}

    # rank -> (v4_label, filtered_for_direct_execution, route_hint)
    expected_evidence = {
        # rank-1 reject — non-direct, design_reference_only
        1: ("reject", True, "design_reference_only"),
        # rank-2 restructure — non-direct, ai_adaptation_required
        2: ("restructure", True, "ai_adaptation_required"),
        # rank-3 use_as_is — direct, direct_render
        3: ("use_as_is", False, "direct_render"),
    }
    for rank, (v4_label, filtered, route_hint) in expected_evidence.items():
        entry = by_rank[rank]
        assert entry["v4_label"] == v4_label, f"rank-{rank} v4_label"
        assert entry["filtered_for_direct_execution"] is filtered, (
            f"rank-{rank} filtered_for_direct_execution"
        )
        assert entry["route_hint"] == route_hint, f"rank-{rank} route_hint"
|
|
|
|
|
|
# ─── Case 6 : additive fields do not regress existing trace shape ───────────
|
|
|
|
|
|
def test_existing_trace_shape_does_not_regress(patch_selector_deps):
    """Codex #10 E4 case 6 + Claude #11 L9 — additive L2/L3 fields must not break
    existing trace consumers. Existing fields (`label`, `fallback_used`,
    `selection_path`, `selected_rank`, etc.) must remain present and unchanged.
    """
    v4 = _make_v4([
        _j(1, "MOCK_template_direct_a", "MOCK_frame_001", "use_as_is"),
    ])

    match, trace = lookup_v4_match_with_fallback(
        v4, "S1", raw_content="- a\n- b\n- c\n"
    )

    # The single registered use_as_is candidate must actually be selected;
    # otherwise the shape checks below would be inspecting a failure trace.
    assert match is not None

    # Existing top-level trace fields preserved
    expected_top_fields = {
        "section_id", "max_rank", "selection_path", "selected_rank",
        "selected_template_id", "selected_frame_id", "selected_label",
        "fallback_used", "fallback_reason", "candidates",
    }
    assert expected_top_fields.issubset(trace.keys())

    # Existing candidate fields preserved
    candidate = trace["candidates"][0]
    expected_candidate_fields = {
        "rank", "template_id", "frame_id", "frame_number", "confidence",
        "label", "phase_z_status", "catalog_registered", "decision", "reason",
    }
    assert expected_candidate_fields.issubset(candidate.keys())

    # New L2 additive fields present (v4_label / filtered_for_direct_execution / route_hint)
    assert candidate["v4_label"] == candidate["label"]  # alias of label
    assert "filtered_for_direct_execution" in candidate
    assert "route_hint" in candidate

    # rank-1 use_as_is path — no fallback used
    assert trace["fallback_used"] is False
    assert trace["selection_path"] == "rank_1"
|