feat(#67): IMP-38 V4 max_rank policy formalization (u1~u3, 4 round consensus)
- u1: separate templates/phase_z2/catalog/v4_fallback_policy.yaml + load_v4_fallback_policy() loader (catalog pollution prevention — Codex #1 correction)
- u2: dynamic effective max_rank in lookup_v4_match_with_fallback (3-variable ceiling min, Codex #2 correction: min(configured, len(judgments_full32))) + 3-tier usable predicate (status + catalog + optional capacity) + trace 8 fields (requested/default/configured_extended/judgments_count/effective_extended_ceiling/effective_max_rank/usable_count/policy_applied)
- u3: 2 production call site cleanup (max_rank=3 removed, HEAD baseline) + tracked Front/vite.config.ts PHASE_Z_MAX_RANK env retired + 4 regression scenarios

verified: 32 passed (IMP-38 focused scope) — IMP-05 L4 dedup / L2 schema preserved, IMP-30 allow_provisional byte-identical, caller_override backward compat (tests)

Stage cycle (#67, 7 round Claude + 5 round Codex):
- Stage 1: Claude #1 -> Codex #1 YES + 5 corrections
- Stage 2 r1+r2: Claude #2-#4 -> Codex #2 Q2 -> Codex #3 YES (4 round consensus LOCK 23195)
- Stage 3 U1+U2+U3: Claude #5-#9 -> Codex #6 NO 4to3 correction -> Codex #7 YES -> Codex #8 YES
- Stage 4: Claude #11 -> Codex #9 (anchor attribution nuance) -> Codex #10 readiness -> Codex #11

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
159
tests/test_dynamic_max_rank.py
Normal file
159
tests/test_dynamic_max_rank.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""IMP-38 U2 — dynamic effective max_rank + trace 8-field + 3-tier usable predicate.
|
||||
|
||||
Verify:
|
||||
- max_rank=None (default) → policy applied (usable_count + effective_max_rank 결정)
|
||||
- max_rank=int (caller override) → that value used as-is (backward compat)
|
||||
- trace contains 8 IMP-38 fields + legacy "max_rank" alias
|
||||
- usable_count >= threshold → default_max_rank (mdx03 정상 case)
|
||||
- usable_count < threshold → effective_extended_ceiling (mdx05-2 확장 case)
|
||||
- effective_extended_ceiling = min(configured, len(judgments_full32)) (Codex #2)
|
||||
- IMP-30 allow_provisional byte-identical (chain_exhausted 후 provisional 합성)
|
||||
|
||||
4 round 합의 (#67):
|
||||
- Codex #1: 별 yaml (catalog 오염 방지)
|
||||
- Codex #2: min(configured, len(judgments)) 정정
|
||||
- Codex #3: load_frame_contracts() shape 무변
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_policy_cache():
    """Clear the mapper's cached V4 fallback policy around every test.

    The module-level ``_V4_FALLBACK_POLICY_CACHE`` is set to ``None`` both
    before the test body runs and again after it finishes, so no test sees a
    policy loaded by another one.
    """
    import src.phase_z2_mapper as mapper_module

    def _clear() -> None:
        mapper_module._V4_FALLBACK_POLICY_CACHE = None

    _clear()
    yield
    _clear()
|
||||
|
||||
|
||||
def _make_v4_section(judgments: list[dict]) -> dict:
    """Build a minimal V4 payload whose single section ``sec-1`` holds *judgments*.

    The list is stored as-is (not copied) under
    ``mdx_sections["sec-1"]["judgments_full32"]``.
    """
    section = {"judgments_full32": judgments}
    return {"mdx_sections": {"sec-1": section}}
|
||||
|
||||
|
||||
def _judgment(template_id: str, label: str, confidence: float = 0.5, frame_id: int = 0) -> dict:
    """Build one synthetic V4 judgment entry for use in fixtures.

    Args:
        template_id: Value stored under ``"template_id"``.
        label: Value stored under ``"label"`` (e.g. ``"reject"``).
        confidence: Value stored under ``"confidence"``.
        frame_id: Explicit frame id. A falsy value (the default ``0``) means
            "derive an id in ``range(10000)`` from *template_id*".

    Returns:
        A dict with the keys ``template_id``, ``frame_id``, ``frame_number``
        (always ``0``), ``confidence`` and ``label``.
    """
    import zlib

    # The built-in hash() of a str is salted per process (PYTHONHASHSEED), so it
    # would hand out a different frame_id on every test run. CRC32 is stable
    # across runs and platforms, which keeps the fixtures reproducible.
    derived_frame_id = zlib.crc32(template_id.encode("utf-8")) % 10000
    return {
        "template_id": template_id,
        "frame_id": frame_id or derived_frame_id,
        "frame_number": 0,
        "confidence": confidence,
        "label": label,
    }
|
||||
|
||||
|
||||
# ─── U2 Test: caller override (backward compat) ────────────────────
|
||||
|
||||
|
||||
def test_caller_override_uses_explicit_max_rank():
    """Explicit ``max_rank=3`` yields effective_max_rank 3 and policy ``caller_override``."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    rejects = [_judgment(f"t{idx}", "reject") for idx in range(5)]
    payload = _make_v4_section(rejects)

    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1", max_rank=3)

    assert trace["policy_applied"] == "caller_override"
    assert trace["effective_max_rank"] == 3
    # "max_rank" is the legacy alias and must carry the same value.
    assert trace["max_rank"] == 3
|
||||
|
||||
|
||||
def test_caller_override_max_rank_5_used_directly():
    """Explicit ``max_rank=5`` is used as-is; the policy is bypassed."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    rejects = [_judgment(f"t{idx}", "reject") for idx in range(10)]
    payload = _make_v4_section(rejects)

    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1", max_rank=5)

    assert trace["policy_applied"] == "caller_override"
    assert trace["effective_max_rank"] == 5
|
||||
|
||||
|
||||
# ─── U2 Test: 8 trace fields presence ──────────────────────────────
|
||||
|
||||
|
||||
def test_trace_contains_8_imp38_fields():
    """The trace must expose all eight IMP-38 fields plus the legacy ``max_rank`` alias."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    payload = _make_v4_section([_judgment(f"t{idx}", "reject") for idx in range(3)])
    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1")

    required_keys = (
        "requested_max_rank",
        "default_max_rank",
        "configured_extended_max_rank",
        "judgments_count",
        "effective_extended_ceiling",
        "effective_max_rank",
        "usable_count",
        "policy_applied",
        "max_rank",  # legacy alias
    )
    missing = {key for key in required_keys if key not in trace}
    assert not missing, f"missing IMP-38 trace fields: {missing}"
|
||||
|
||||
|
||||
# ─── U2 Test: Codex #2 정정 — min(configured, len(judgments_full32)) ──
|
||||
|
||||
|
||||
def test_effective_extended_ceiling_is_min_of_configured_and_judgments_count():
    """Codex #2 LOCK: with fewer judgments than the configured max, the ceiling is the judgment count."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    # Only five judgments, which is below the configured extended max
    # (32 per the original author's comment).
    five_rejects = [_judgment(f"t{idx}", "reject") for idx in range(5)]
    payload = _make_v4_section(five_rejects)

    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1")

    assert trace["judgments_count"] == 5
    assert trace["effective_extended_ceiling"] == 5  # min(32, 5) = 5
|
||||
|
||||
|
||||
# ─── U2 Test: no_judgments path ──────────────────────────────────
|
||||
|
||||
|
||||
def test_no_judgments_path():
    """An empty judgment list gives policy ``no_judgments`` and the default max rank."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    empty_payload = _make_v4_section([])
    _match, trace = lookup_v4_match_with_fallback(empty_payload, "sec-1")

    assert trace["policy_applied"] == "no_judgments"
    assert trace["judgments_count"] == 0
    assert trace["effective_max_rank"] == trace["default_max_rank"]
    assert trace["fallback_reason"] == "empty_v4_judgments"
|
||||
|
||||
|
||||
# ─── U2 Test: no_v4_section ─────────────────────────────────────
|
||||
|
||||
|
||||
def test_no_v4_section_path():
    """An unknown section id reports ``no_v4_section`` and the trace still carries IMP-38 fields."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    payload_without_sections = {"mdx_sections": {}}
    _match, trace = lookup_v4_match_with_fallback(payload_without_sections, "unknown-sec")

    assert trace["fallback_reason"] == "no_v4_section"
    # Spot-check two of the IMP-38 fields: they must exist even though no
    # section was found.
    for key in ("policy_applied", "effective_max_rank"):
        assert key in trace
|
||||
|
||||
|
||||
# ─── U2 Test: chain_exhausted message reflects effective_max_rank ──
|
||||
|
||||
|
||||
def test_chain_exhausted_message_includes_effective_max_rank():
    """On an exhausted chain, ``fallback_reason`` must carry a recognised reason.

    The fixture is three ``reject`` judgments whose template ids are assumed
    not to be registered in the catalog, looked up with ``max_rank=3``.

    NOTE(review): despite the test name, no assertion here ties the message to
    the numeric effective_max_rank; only the reason markers are checked.
    """
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    judgments = [_judgment(f"unregistered_t{i}", "reject") for i in range(3)]
    v4 = _make_v4_section(judgments)
    _match, trace = lookup_v4_match_with_fallback(v4, "sec-1", max_rank=3)

    # Previously the assertion was wrapped in `if selection_path == ...`, so the
    # test passed silently whenever the chain was NOT exhausted. Report that
    # situation as a skip so the lost coverage is visible in the test report.
    if trace["selection_path"] != "chain_exhausted":
        pytest.skip(
            f"selection_path={trace['selection_path']!r}; "
            "chain_exhausted precondition not met"
        )

    # A first_skip_reason takes priority when present; otherwise the default
    # message is used. Either way one of these markers must appear.
    reason = trace["fallback_reason"]
    assert reason is not None
    assert any(
        marker in reason
        for marker in ("no_auto_renderable", "phase_z_status", "no_contract")
    )
|
||||
137
tests/test_phase_z2_max_rank_regression.py
Normal file
137
tests/test_phase_z2_max_rank_regression.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""IMP-38 U3 regression — call site cleanup (max_rank=3 제거) 후 policy 활성 검증.
|
||||
|
||||
Scenarios:
|
||||
(A) normal case: rank 1~default_max_rank window 에 usable candidate 충분
|
||||
→ effective_max_rank=default_max_rank (rank-3-preserved)
|
||||
→ mdx03 식: rank 1 use_as_is 매칭 정상 case 보호 확인
|
||||
(B) extended case: rank 1~default_max_rank window 에 usable candidate 0
|
||||
→ effective_max_rank=effective_extended_ceiling (rank-extended)
|
||||
→ mdx05-2 식: rank 1~9 미등록/reject + rank 10+ 등록 frame case 처리
|
||||
|
||||
4 round 합의 (#67):
|
||||
- Codex #1: 별 yaml + loader (catalog 오염 방지)
|
||||
- Codex #2: min(configured, len(judgments)) 정정
|
||||
- Codex #6: 2 call site cleanup (HEAD 기준 — IMP-47B 가 추가한 3 번째는 별 axis)
|
||||
- Codex #7: U3 execute ready
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_policy_cache():
    """Clear the mapper's cached V4 fallback policy around every test.

    ``_V4_FALLBACK_POLICY_CACHE`` is set to ``None`` before the test body runs
    and once more after it returns, keeping tests independent of each other.
    """
    import src.phase_z2_mapper as mapper_module

    def _clear() -> None:
        mapper_module._V4_FALLBACK_POLICY_CACHE = None

    _clear()
    yield
    _clear()
|
||||
|
||||
|
||||
def _make_v4_section(judgments: list[dict]) -> dict:
    """Wrap *judgments* in a minimal V4 payload under section ``sec-1``."""
    section = {"judgments_full32": judgments}
    sections = {"sec-1": section}
    return {"mdx_sections": sections}
|
||||
|
||||
|
||||
def _judgment(template_id: str, label: str, confidence: float = 0.5, frame_id: int = 0) -> dict:
    """Build one synthetic V4 judgment entry for use in fixtures.

    Args:
        template_id: Value stored under ``"template_id"``.
        label: Value stored under ``"label"`` (e.g. ``"reject"``).
        confidence: Value stored under ``"confidence"``.
        frame_id: Explicit frame id. A falsy value (the default ``0``) means
            "derive an id in ``range(10000)`` from *template_id*".

    Returns:
        A dict with the keys ``template_id``, ``frame_id``, ``frame_number``
        (always ``0``), ``confidence`` and ``label``.
    """
    import zlib

    # str hashes from the built-in hash() are salted per process
    # (PYTHONHASHSEED), which made the derived frame_id change between runs.
    # CRC32 of the UTF-8 bytes gives the same id every time.
    derived_frame_id = zlib.crc32(template_id.encode("utf-8")) % 10000
    return {
        "template_id": template_id,
        "frame_id": frame_id or derived_frame_id,
        "frame_number": 0,
        "confidence": confidence,
        "label": label,
    }
|
||||
|
||||
|
||||
# ─── Scenario A — normal case (rank-3-preserved) ──────────────────
|
||||
|
||||
|
||||
def test_normal_case_with_usable_candidates_preserves_default_max_rank():
    """With enough usable candidates in the default window, effective_max_rank stays at the default."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback
    from src.phase_z2_mapper import load_frame_contracts

    # mdx03-style fixture: rank 1 is a template id taken from the real catalog
    # (whichever ids the yaml registers), labelled use_as_is.
    catalog = load_frame_contracts()
    registered_template_ids = [
        template_id for template_id, entry in catalog.items() if isinstance(entry, dict)
    ]
    assert len(registered_template_ids) >= 1, "catalog 등록 frame 1+ 필요 (mdx03 식 fixture)"

    # Rank 1: registered frame + use_as_is. Ranks 2-3: rejects, whose catalog
    # registration does not matter.
    rank_one_template = registered_template_ids[0]
    payload = _make_v4_section(
        [
            _judgment(rank_one_template, "use_as_is", 0.95),
            _judgment("dummy_rank2", "reject", 0.3),
            _judgment("dummy_rank3", "reject", 0.2),
        ]
    )

    # No explicit max_rank, so the policy decides.
    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1")

    assert trace["policy_applied"] == "default_max_rank", (
        f"normal case 에서 default 유지 기대, got {trace['policy_applied']}"
    )
    assert trace["effective_max_rank"] == trace["default_max_rank"]
    assert trace["usable_count"] >= 1
|
||||
|
||||
|
||||
# ─── Scenario B — extended case (rank-extended) ────────────────────
|
||||
|
||||
|
||||
def test_extended_case_with_no_usable_in_default_window_expands_to_ceiling():
    """With zero usable candidates in the default window, effective_max_rank grows to the extended ceiling."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    # mdx05-2-style fixture: every rank is a reject with a template id that is
    # not in the catalog (later ranks are left unregistered too, to keep the
    # fixture simple). Ten judgments, so the ceiling becomes min(extended, 10).
    ten_rejects = [
        _judgment(f"unregistered_t{i}", "reject", 0.1 + i * 0.01) for i in range(10)
    ]
    payload = _make_v4_section(ten_rejects)

    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1")

    assert trace["policy_applied"] == "extended_max_rank", (
        f"extended case 기대, got {trace['policy_applied']}"
    )
    assert trace["usable_count"] == 0
    assert trace["judgments_count"] == 10

    # Codex #2 correction: the ceiling is min(configured, judgments_count),
    # i.e. 10 when configured is 32, or 5 when configured is 5.
    expected_ceiling = min(trace["configured_extended_max_rank"], 10)
    assert trace["effective_extended_ceiling"] == expected_ceiling
    assert trace["effective_max_rank"] == trace["effective_extended_ceiling"]
|
||||
|
||||
|
||||
# ─── Scenario C — call site cleanup byte-identical (caller_override 제거 후 policy 활성) ─
|
||||
|
||||
|
||||
def test_default_call_site_now_uses_policy_after_cleanup():
    """After the U3 cleanup, a call without ``max_rank`` goes through the policy path.

    Before: callers passed ``max_rank=3``, so policy_applied was
    ``caller_override``.
    After: callers pass nothing, so policy_applied is ``default_max_rank``
    (when usable >= 1) or ``extended_max_rank``.
    """
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    payload = _make_v4_section(
        [_judgment(f"unregistered_t{i}", "reject") for i in range(5)]
    )

    # No max_rank argument: this mirrors the production callers after cleanup.
    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1")

    policy_paths = {"default_max_rank", "extended_max_rank"}
    assert trace["policy_applied"] in policy_paths
    assert trace["policy_applied"] != "caller_override", (
        "U3 cleanup 후 production caller = no explicit, policy path 활성 기대"
    )
|
||||
|
||||
|
||||
# ─── Scenario D — explicit caller_override 여전히 동작 (test path 보호) ────
|
||||
|
||||
|
||||
def test_explicit_caller_override_still_works_for_tests():
    """Passing ``max_rank=N`` explicitly still takes the ``caller_override`` path (backward compat)."""
    from src.phase_z2_pipeline import lookup_v4_match_with_fallback

    payload = _make_v4_section(
        [_judgment(f"unregistered_t{i}", "reject") for i in range(10)]
    )

    _match, trace = lookup_v4_match_with_fallback(payload, "sec-1", max_rank=5)

    assert trace["policy_applied"] == "caller_override"
    assert trace["effective_max_rank"] == 5
|
||||
109
tests/test_v4_fallback_policy_loader.py
Normal file
109
tests/test_v4_fallback_policy_loader.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""IMP-38 U1 — v4_fallback_policy.yaml loader test.
|
||||
|
||||
Verify:
|
||||
- load_v4_fallback_policy() returns dict with expected keys
|
||||
- yaml parsed correctly (usable_threshold, default_max_rank, extended_max_rank, policy_type)
|
||||
- graceful fallback when yaml missing → _V4_FALLBACK_POLICY_DEFAULT
|
||||
- _V4_FALLBACK_POLICY_CACHE pattern (lazy load, mirror of _CATALOG_CACHE)
|
||||
- load_frame_contracts() shape unchanged (separate yaml, catalog 오염 X)
|
||||
|
||||
4 round 합의 (#67):
|
||||
- Codex #1: separate yaml (not frame_contracts.yaml top-level)
|
||||
- Codex #3: load_frame_contracts() shape 변경 X
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Directory two levels up from this file (this file sits in tests/).
PROJECT_ROOT = Path(__file__).parent.parent

# The policy yaml and the frame-contract catalog sit side by side in one directory.
V4_POLICY_PATH = PROJECT_ROOT.joinpath(
    "templates", "phase_z2", "catalog", "v4_fallback_policy.yaml"
)
CATALOG_PATH = PROJECT_ROOT.joinpath(
    "templates", "phase_z2", "catalog", "frame_contracts.yaml"
)
|
||||
|
||||
|
||||
def _reset_caches():
    """Set both mapper module-level caches back to ``None`` for test isolation."""
    import src.phase_z2_mapper as mapper_module

    # Same order as before: policy cache first, then catalog cache.
    for cache_attr in ("_V4_FALLBACK_POLICY_CACHE", "_CATALOG_CACHE"):
        setattr(mapper_module, cache_attr, None)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def clean_caches():
    """Reset the mapper caches before and after every test in this module."""
    _reset_caches()  # start from a cold cache
    yield
    _reset_caches()  # leave nothing behind for the next test
|
||||
|
||||
|
||||
def test_v4_fallback_policy_yaml_exists():
    """IMP-38 U1: the dedicated policy yaml must be present on disk."""
    yaml_present = V4_POLICY_PATH.exists()
    assert yaml_present, (
        f"v4_fallback_policy.yaml not found at {V4_POLICY_PATH}. "
        "IMP-38 U1 expects separate yaml (Codex #1 corr — not frame_contracts.yaml top-level)."
    )
|
||||
|
||||
|
||||
def test_load_v4_fallback_policy_returns_dict_with_expected_keys():
    """``load_v4_fallback_policy()`` returns a dict that has every policy key."""
    from src.phase_z2_mapper import load_v4_fallback_policy

    policy = load_v4_fallback_policy()
    assert isinstance(policy, dict)

    required = ("policy_type", "usable_threshold", "default_max_rank", "extended_max_rank")
    missing = {key for key in required if key not in policy}
    assert not missing, f"missing keys in v4_fallback_policy: {missing}"
|
||||
|
||||
|
||||
def test_load_v4_fallback_policy_values_match_yaml():
    """Loaded values must equal those committed in v4_fallback_policy.yaml (initial commit)."""
    from src.phase_z2_mapper import load_v4_fallback_policy

    policy = load_v4_fallback_policy()

    expected_values = {
        "policy_type": "dynamic_usable_count_based",
        "usable_threshold": 1,
        "default_max_rank": 3,
        "extended_max_rank": 32,
    }
    for key, expected in expected_values.items():
        assert policy[key] == expected
|
||||
|
||||
|
||||
def test_load_v4_fallback_policy_cache_pattern():
    """Two consecutive loads must hand back the very same dict object (lazy-load cache)."""
    from src.phase_z2_mapper import load_v4_fallback_policy

    first_load = load_v4_fallback_policy()
    second_load = load_v4_fallback_policy()

    # Identity, not equality: a fresh-but-equal dict would mean no caching.
    assert first_load is second_load, "cache pattern violated (should return same dict instance)"
|
||||
|
||||
|
||||
def test_load_v4_fallback_policy_graceful_when_yaml_missing():
    """With the yaml missing, the loader falls back to defaults where extended == default == 3.

    That default keeps behaviour byte-identical to pre-IMP-38.
    """
    import src.phase_z2_mapper as mapper

    missing_yaml = PROJECT_ROOT / "tests" / "__nonexistent_policy.yaml"
    with patch.object(mapper, "V4_FALLBACK_POLICY_PATH", missing_yaml):
        # Drop any cached policy so the loader has to consult the patched path.
        mapper._V4_FALLBACK_POLICY_CACHE = None
        policy = mapper.load_v4_fallback_policy()

    assert policy["default_max_rank"] == 3
    assert policy["extended_max_rank"] == 3, (
        "graceful fallback must keep extended==default (byte-identical pre-IMP-38)"
    )
|
||||
|
||||
|
||||
def test_load_frame_contracts_shape_unchanged():
    """Codex #3 LOCK: ``load_frame_contracts()`` still maps template_id -> entry dict."""
    from src.phase_z2_mapper import load_frame_contracts, load_v4_fallback_policy

    catalog = load_frame_contracts()
    # The policy is loaded alongside the catalog; its contents are not inspected.
    policy = load_v4_fallback_policy()

    # Every catalog value must be a frame entry, i.e. a dict carrying template_id.
    for key, entry in catalog.items():
        assert isinstance(entry, dict), f"catalog entry {key} should be dict"
        assert "template_id" in entry, f"catalog entry {key} missing template_id (policy bleed?)"

    # None of the policy keys may show up as a top-level catalog key.
    policy_keys = {"policy_type", "usable_threshold", "default_max_rank", "extended_max_rank"}
    bleed = policy_keys.intersection(catalog.keys())
    assert not bleed, (
        f"policy keys leaked into frame_contracts.yaml: {bleed}. "
        "Codex #1 corr violated — policy must stay in separate v4_fallback_policy.yaml."
    )
|
||||
Reference in New Issue
Block a user