diff --git a/Front/client/src/services/designAgentApi.ts b/Front/client/src/services/designAgentApi.ts index 272a0a3..104b2c8 100644 --- a/Front/client/src/services/designAgentApi.ts +++ b/Front/client/src/services/designAgentApi.ts @@ -565,6 +565,13 @@ export async function loadRun(runId: string): Promise { // sort 우선순위 = label (use_as_is > light_edit > restructure > reject) + confidence desc. // 모두 reject 인 경우 confidence desc 만 적용 (사용자 명시). const TOP_N_FRAMES = 6; + // IMP-39 u4 (issue #68) — local LABEL_PRIORITY is now a documentation + // mirror of templates/phase_z2/catalog/ranking_sort_policy.yaml (u1). + // Primary ordering arrives pre-sorted from the backend selector + // (src/phase_z2_pipeline.py lookup_v4_match_with_fallback :1186-1196 + + // _build_application_plan_unit u3 payload fields). This constant is read + // ONLY on the warn-fallback path below (legacy fixtures pre-u3 / payload + // missing). Kept verbatim so the fallback ordering matches u1/u2 contract. const LABEL_PRIORITY: Record = { use_as_is: 0, light_edit: 1, @@ -576,9 +583,6 @@ export async function loadRun(runId: string): Promise { // 2) unit.v4_all_judgments (pre-IMP-05 audit array) // 3) unit.v4_candidates (legacy minimal) // fallback_chain alias is intentionally NOT read (Stage 2 guardrail). - const candidateEvidence = Array.isArray(unit.candidate_evidence) - ? unit.candidate_evidence - : []; const candidateMap = new Map(); const pushCandidate = (c: any) => { if (!c) return; @@ -586,15 +590,64 @@ export async function loadRun(runId: string): Promise { if (!key) return; if (!candidateMap.has(key)) candidateMap.set(key, c); }; - candidateEvidence.forEach(pushCandidate); - (unit.v4_all_judgments ?? []).forEach(pushCandidate); - (unit.v4_candidates ?? []).forEach(pushCandidate); - const rawSource = Array.from(candidateMap.values()); - const v4Source = [...rawSource].sort((a: any, b: any) => { - const lp = (LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 
99); - if (lp !== 0) return lp; - return (b.confidence ?? 0) - (a.confidence ?? 0); - }); + + // IMP-39 u4 (issue #68) — primary path: consume the backend Step 9 + // payload as the single source of ordering truth. + // • ``unit.sorted_candidate_evidence`` = policy-sorted selector trace + // (src/phase_z2_pipeline.py :4163, alias of selection_trace[ + // "candidates"] sorted by u2 at :1186-1196). Same IMP-05 L2 schema + // consumed below (template_id, label, confidence, frame_number, + // frame_id, rank, catalog_registered, capacity_fit, route_hint, ...). + // • ``unit.ranking_sort_policy`` = full single-source policy dict + // (policy_type / label_priority / unknown_label_priority / + // tie_break_axes) forwarded for telemetry + fallback parity check. + // When both are present we feed sorted_candidate_evidence through the + // existing dedup map (first occurrence wins, mirrors backend + // ``seen_template_ids`` semantics at :1204-1236) and SKIP the local + // re-sort — backend "rank 1" then equals frontend frame_candidates[0] + // by construction (Stage 1 root-cause fix). + const sortedCandidateEvidence: any[] | null = Array.isArray( + unit.sorted_candidate_evidence, + ) + ? unit.sorted_candidate_evidence + : null; + const rankingSortPolicy = unit.ranking_sort_policy ?? null; + const backendPolicyPayloadPresent = + sortedCandidateEvidence !== null && + sortedCandidateEvidence.length > 0 && + rankingSortPolicy !== null; + + let v4Source: any[]; + if (backendPolicyPayloadPresent) { + sortedCandidateEvidence!.forEach(pushCandidate); + v4Source = Array.from(candidateMap.values()); + } else { + // IMP-39 u4 — warn-fallback path. Legacy fixtures predating u3 (or + // any code path that strips the payload) lack the backend-sorted + // evidence; ordering then derives from local LABEL_PRIORITY mirror. + // Warning surfaces drift in dev console without hard-failing the UI + // (graceful: production sample audit deck remains renderable). 
+ if (typeof console !== "undefined" && typeof console.warn === "function") { + console.warn( + `[IMP-39 u4] unit ${unit.unit_id ?? ""}: backend payload ` + + "missing ranking_sort_policy / sorted_candidate_evidence — " + + "falling back to local LABEL_PRIORITY (legacy fixture path).", + ); + } + const candidateEvidence = Array.isArray(unit.candidate_evidence) + ? unit.candidate_evidence + : []; + candidateEvidence.forEach(pushCandidate); + (unit.v4_all_judgments ?? []).forEach(pushCandidate); + (unit.v4_candidates ?? []).forEach(pushCandidate); + const rawSource = Array.from(candidateMap.values()); + v4Source = [...rawSource].sort((a: any, b: any) => { + const lp = + (LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 99); + if (lp !== 0) return lp; + return (b.confidence ?? 0) - (a.confidence ?? 0); + }); + } // ─── IMP-41 u4 — application_candidates enrichment (issue #70) ─────────── // Backend Step 9 emits `unit.application_candidates[]` (src/phase_z2_pipeline.py // _application_candidates_for_unit, :3071-3092) one entry per v4 candidate with diff --git a/src/phase_z2_pipeline.py b/src/phase_z2_pipeline.py index 9495c6f..029ac1f 100644 --- a/src/phase_z2_pipeline.py +++ b/src/phase_z2_pipeline.py @@ -108,6 +108,12 @@ ASSETS_SOURCE_BASE = PROJECT_ROOT / "figma_to_html_agent" / "blocks" V4_RESULT_PATH = PROJECT_ROOT / "tests" / "matching" / "v4_full32_result.yaml" RUNS_DIR = PROJECT_ROOT / "data" / "runs" +# IMP-39 (#68) u1 — single-source ranking sort policy yaml. +# Loader + apply_ranking_sort helper below `to_phase_z_status`. 
+RANKING_SORT_POLICY_PATH = ( + PROJECT_ROOT / "templates" / "phase_z2" / "catalog" / "ranking_sort_policy.yaml" +) + # V4 label → Phase Z status (§ 7.4 매트릭스) V4_LABEL_TO_PHASE_Z_STATUS = { "use_as_is": "matched_zone", @@ -210,6 +216,106 @@ def to_phase_z_status(match: V4Match) -> str: return V4_LABEL_TO_PHASE_Z_STATUS.get(match.label, "unknown") +# ─── IMP-39 (#68) u1 — single-source ranking sort policy ────────── +# +# Single source of (label_priority, tie-break) ordering shared by: +# - backend `lookup_v4_match_with_fallback` selector loop (wired in u2) +# - Step 9 `_build_application_plan_unit` payload (wired in u3) +# - frontend `designAgentApi.ts` candidate builder (wired in u4) +# +# u1 scope = additive only (yaml + loader + helper). No selector wiring, +# no behavior change. Default-fallback matches yaml so missing-file boot +# keeps deterministic ordering identical to the file-loaded policy. + +_RANKING_SORT_POLICY_DEFAULT: dict = { + "policy_type": "deterministic_label_priority_then_confidence", + "label_priority": { + "use_as_is": 0, + "light_edit": 1, + "restructure": 2, + "reject": 3, + }, + "unknown_label_priority": 99, + "tie_break_axes": ["confidence_desc", "v4_rank_asc"], +} + +_RANKING_SORT_POLICY_CACHE: Optional[dict] = None + + +def load_ranking_sort_policy() -> dict: + """IMP-39 u1 — ranking sort policy loader (separate yaml, additive). + + Returns dict with keys: policy_type, label_priority (dict), + unknown_label_priority (int), tie_break_axes (list[str]). + + Graceful fallback: yaml 파일 없을 시 _RANKING_SORT_POLICY_DEFAULT + (위 dict) 그대로 — backward-compat boot-safe. + + Cache: module-level, mirrors `load_v4_fallback_policy` pattern. 
+ """ + global _RANKING_SORT_POLICY_CACHE + if _RANKING_SORT_POLICY_CACHE is None: + if RANKING_SORT_POLICY_PATH.exists(): + loaded = ( + yaml.safe_load(RANKING_SORT_POLICY_PATH.read_text(encoding="utf-8")) + or {} + ) + # merge with default so partial yaml falls through cleanly + merged = dict(_RANKING_SORT_POLICY_DEFAULT) + for k, v in loaded.items(): + merged[k] = v + _RANKING_SORT_POLICY_CACHE = merged + else: + _RANKING_SORT_POLICY_CACHE = dict(_RANKING_SORT_POLICY_DEFAULT) + return _RANKING_SORT_POLICY_CACHE + + +def apply_ranking_sort( + records: list, + *, + policy: Optional[dict] = None, + label_key: str = "label", + confidence_key: str = "confidence", + v4_rank_key: str = "v4_rank", +) -> list: + """IMP-39 u1 — stable sort by (label_priority asc, confidence desc, v4_rank asc). + + Shared ordering primitive — backend selector / Step 9 payload / frontend + mirror invariant. Sample-agnostic; no hardcoded sample IDs. + + Args: + records: list of dicts (selector loop, trace candidates) OR V4Match + objects. Field access falls through getitem → getattr. + policy: optional explicit policy dict; defaults to `load_ranking_sort_policy()`. + label_key / confidence_key / v4_rank_key: per-record field names. + + Returns: + NEW list — input is not mutated. Records lacking a key get the + unknown-label priority / confidence=0.0 / v4_rank=inf so they sink + to the bottom in a deterministic way. 
+ """ + pol = policy if policy is not None else load_ranking_sort_policy() + priority_map: dict = pol.get("label_priority", {}) or {} + unknown_priority: int = int(pol.get("unknown_label_priority", 99)) + + def _get(rec, key): + if isinstance(rec, dict): + return rec.get(key) + return getattr(rec, key, None) + + def _key(rec): + label = _get(rec, label_key) + conf = _get(rec, confidence_key) + v4_rank = _get(rec, v4_rank_key) + label_pri = priority_map.get(label, unknown_priority) + conf_val = float(conf) if conf is not None else 0.0 + # confidence desc → negate for asc sort key + rank_val = int(v4_rank) if v4_rank is not None else 10**9 + return (label_pri, -conf_val, rank_val) + + return sorted(records, key=_key) + + def _b4_mapper_source_enabled() -> bool: """IMP-89 89-a u1 — PHASE_Z_B4_MAPPER_SOURCE env flag reader (default OFF). @@ -1065,6 +1171,30 @@ def lookup_v4_match_with_fallback( trace["fallback_reason"] = "empty_v4_judgments" return None, trace + # IMP-39 (#68) u2 — apply single-source ranking sort policy to the selected + # window AFTER IMP-38 raw-window calc (default_window / usable_count above + # remain RAW all_judgments-based — no silent interaction with fallback + # expansion). Selection order now follows + # (label_priority asc, confidence desc, v4_rank asc) + # so backend selected rank-1 matches frontend frame_candidates[0] + # (designAgentApi.ts:578-597 LABEL_PRIORITY + confidence-desc mirror). + # `v4_rank_key="v4_full_rank"` reads the RAW V4 confidence-rank from each + # judgment dict for tie-break (yaml: tie_break_axes=[confidence_desc, + # v4_rank_asc]). Input list is NOT mutated (apply_ranking_sort returns a + # new list). Trace fields (sorted_candidate_evidence / ranking_sort_policy) + # are forwarded through Step 9 payload in u3. 
+ ranking_sort_policy = load_ranking_sort_policy() + judgments = apply_ranking_sort( + judgments, + policy=ranking_sort_policy, + label_key="label", + confidence_key="confidence", + v4_rank_key="v4_full_rank", + ) + trace["ranking_sort_policy_applied"] = ranking_sort_policy.get( + "policy_type", "deterministic_label_priority_then_confidence" + ) + first_skip_reason: Optional[str] = None # IMP-05 L4 dedup (Codex #14 ordering — Claude #16 placement precision) : # first occurrence claims template_id for the chain regardless of decision @@ -3937,6 +4067,18 @@ def _build_application_plan_unit( - IMP-06 additive plan fields (position / assignment_source / section_ assignment_override / replaced_auto_unit / skipped_collided_auto_units / skipped_reason) — None / False / [] when no override CLI used. + + IMP-39 u3 (issue #68) additive fields : + - ``ranking_sort_policy`` : full policy dict from + ``load_ranking_sort_policy()`` (cached). Forwards the single-source + ordering contract (label_priority map + tie_break_axes) to the Step 9 + payload so the frontend (``designAgentApi.ts``) can mirror the backend + sort without re-implementing the policy locally. u4 wires consumption. + - ``sorted_candidate_evidence`` : explicit alias of the policy-sorted + ``selection_trace["candidates"]`` list. Identical contents to + ``candidate_evidence`` (u2 sorted the underlying ``judgments`` window + before the selector loop appended ``trace["candidates"]``), but the + explicit name documents the post-u2 contract for the frontend. """ unit_id = "+".join(unit.source_section_ids) @@ -3945,6 +4087,14 @@ def _build_application_plan_unit( application_status = "ok" if has_v4 else "no_v4_candidate" current_default = unit.frame_template_id if has_v4 else None + # IMP-39 u3 (issue #68) — forward the single-source ranking policy to the + # Step 9 per-unit payload. 
``load_ranking_sort_policy()`` is module-cached + # (``_RANKING_SORT_POLICY_CACHE``), so the per-unit call is O(1) after + # first invocation. The full policy dict (not just ``policy_type``) is + # forwarded so the frontend can mirror label_priority + tie_break_axes + # without re-declaring the contract locally. + ranking_sort_policy = load_ranking_sort_policy() + # IMP-06 blocker-fix (Codex #13 Blocker 3 / #16) — plan-aware additive # fields. additive = pre-IMP-06 readers (no override CLI used) see # position=None / assignment_source=None / section_assignment_override @@ -4006,6 +4156,12 @@ def _build_application_plan_unit( "replaced_auto_unit": plan_replaced_auto, "skipped_collided_auto_units": plan_skipped_collided, "skipped_reason": plan_skipped_reason, + # IMP-39 u3 (issue #68) — single-source ranking policy forwarded to + # frontend so backend selector "rank 1" and frontend + # ``frame_candidates[0]`` share one ordering contract. Additive only; + # pre-u3 readers ignore both keys. + "ranking_sort_policy": ranking_sort_policy, + "sorted_candidate_evidence": selection_trace.get("candidates", []), } diff --git a/templates/phase_z2/catalog/ranking_sort_policy.yaml b/templates/phase_z2/catalog/ranking_sort_policy.yaml new file mode 100644 index 0000000..a6ebfe9 --- /dev/null +++ b/templates/phase_z2/catalog/ranking_sort_policy.yaml @@ -0,0 +1,50 @@ +# IMP-39 single-source ranking sort policy — backend ↔ frontend mirror. +# +# 도입 배경 (issue #68): +# Backend `lookup_v4_match_with_fallback` 는 V4 raw confidence-desc 순서로 +# first-eligible 선택 (label_priority 무시). Frontend `designAgentApi.ts` 는 +# 동일 source 를 (label_priority asc, confidence desc) 로 재정렬 후 slice. +# 결과: 낮은-confidence 높은-priority label 이 raw 상 뒤에 있을 때 +# backend "rank 1 selected" ≠ frontend `frame_candidates[0]` divergence. 
+# +# 정책 결정 (Stage 1~2 LOCK, 4 round 합의): +# - 단일 source 위치 = 본 yaml (catalog hot-reload + frontend mirror 가능) +# - frame_contracts.yaml / v4_fallback_policy.yaml 오염 회피 (분리 파일) +# - 정렬 axes = (label_priority asc, confidence desc, v4_rank asc) +# - tie-break = 원본 v4_rank 보존 (frontend LABEL_PRIORITY 와 1:1) +# +# 적용 path: +# - backend: src/phase_z2_pipeline.py `apply_ranking_sort` (helper, u1) +# + `lookup_v4_match_with_fallback` selector loop (u2) +# + `_build_application_plan_unit` Step 9 payload (u3) +# - frontend: Front/client/src/services/designAgentApi.ts (u4) +# → unit.ranking_sort_policy + unit.sorted_candidate_evidence 우선 read +# → local LABEL_PRIORITY 는 warn-fallback only + +policy_type: deterministic_label_priority_then_confidence + +# label_priority: +# lower value = higher priority (use_as_is 가 첫 후보) +# sort key = (label_priority asc, confidence desc, v4_rank asc) +label_priority: + use_as_is: 0 + light_edit: 1 + restructure: 2 + reject: 3 + +# unknown_label_priority: +# label 이 위 매트릭스에 없을 시 부여되는 우선순위 (최하위 push). +# frontend `LABEL_PRIORITY[label] ?? 99` 와 1:1. +unknown_label_priority: 99 + +# tie_break_axes: +# 동일 label_priority 시 적용 순서 — frontend mirror 와 1:1. +# confidence_desc: 큰 confidence 가 앞 +# v4_rank_asc: 동일 confidence 시 raw v4 rank (1, 2, 3 ...) 작은 게 앞 +tie_break_axes: + - confidence_desc + - v4_rank_asc + +# graceful fallback (yaml 없을 시): +# loader 가 default policy_type=deterministic_label_priority_then_confidence +# + 위 label_priority 매트릭스 로 fall through (backward compat / boot-safe). diff --git a/tests/phase_z2/fixtures/ranking_sort_policy/synthetic_divergence.yaml b/tests/phase_z2/fixtures/ranking_sort_policy/synthetic_divergence.yaml new file mode 100644 index 0000000..e46a4aa --- /dev/null +++ b/tests/phase_z2/fixtures/ranking_sort_policy/synthetic_divergence.yaml @@ -0,0 +1,56 @@ +fixture_id: synthetic_divergence +purpose: | + Backend - frontend "rank 1" divergence regression - IMP-39 (#68). 
+ Captures the Stage 1 root-cause scenario where the legacy backend + (raw V4 confidence-desc order) selects a high-confidence + lower-priority label, while the frontend (LABEL_PRIORITY asc + + confidence desc) selects the lower-confidence higher-priority + label. The single-source ranking policy + (templates/phase_z2/catalog/ranking_sort_policy.yaml, u1) resolves + the divergence so that both sides agree on "rank 1". + +source: synthetic +sample_agnostic: true +notes: + - No real frame_id / template_id / MDX section is referenced. + - Only the three sort keys matter: label, confidence, v4_full_rank. + - The `tag` field is a fixture-local identifier for assertions. + - Field name `v4_full_rank` mirrors v4_full32_result.yaml shape so + fixture and corpus audit (u8) share the same key contract. + +raw_judgments: + # confidence is strictly descending so v4_full_rank == raw V4 + # confidence-desc rank (same axis as v4_full32_result.yaml). + - tag: synth_restructure_high + label: restructure + confidence: 0.92 + v4_full_rank: 1 + - tag: synth_light_edit_mid + label: light_edit + confidence: 0.70 + v4_full_rank: 2 + - tag: synth_use_as_is_low + label: use_as_is + confidence: 0.41 + v4_full_rank: 3 + - tag: synth_reject_low + label: reject + confidence: 0.30 + v4_full_rank: 4 + +expected_legacy_raw_order: + - synth_restructure_high + - synth_light_edit_mid + - synth_use_as_is_low + - synth_reject_low + +expected_policy_sorted_order: + - synth_use_as_is_low + - synth_light_edit_mid + - synth_restructure_high + - synth_reject_low + +divergence_axis: + pre_policy_rank_1_tag: synth_restructure_high + post_policy_rank_1_tag: synth_use_as_is_low + frontend_candidate_0_tag: synth_use_as_is_low diff --git a/tests/phase_z2/test_imp39_corpus_audit.py b/tests/phase_z2/test_imp39_corpus_audit.py new file mode 100644 index 0000000..6240b64 --- /dev/null +++ b/tests/phase_z2/test_imp39_corpus_audit.py @@ -0,0 +1,437 @@ +"""IMP-39 u8 (issue #68) - corpus audit over 
tests/matching/v4_full32_result.yaml. + +Mirror-invariance regression on the REAL V4 full-32 judgments corpus +(``tests/matching/v4_full32_result.yaml``). For every MDX section in the +corpus, asserts that: + + 1. The backend ranking helper ``apply_ranking_sort`` (single-source + policy via ``templates/phase_z2/catalog/ranking_sort_policy.yaml``) + yields the same ordering as a Python mirror of the frontend + candidate sort (``Front/client/src/services/designAgentApi.ts`` + warn-fallback path, lines 644-649), i.e. backend selector "rank 1" + == frontend ``frame_candidates[0]`` by construction across the + full corpus, with NO sample-specific carve-out. + 2. The ordering contract (label_priority asc, confidence desc, + v4_rank asc) holds on real data: label priority outranks confidence + (e.g. section 01-1, where rank=8 restructure rises above rank=5 + reject), and v4_rank asc settles any (label, confidence) ties. + 3. Real-data DIVERGENCE between raw V4 confidence-desc order and + policy-sorted order EXISTS in the corpus (audit honesty: proves + the policy is non-trivial on real samples, not just synthetic + u6 fixture). + +Sample-agnostic axis (RULE 0 / RULE 7): + - The test iterates ``corpus['mdx_sections']`` keys dynamically; no + section ID (``01-2``, ``03-1``, ``04-2.1``, ...) is hardcoded as + an assertion target. The corpus inventory is treated as a + parametrize source, not a contract. + - The test does NOT assert any specific ``frame_id`` / + ``template_id`` / ``frame_number``. Only the ordering contract + is asserted. + - The test does NOT depend on MDX 03/04/05 outcome / answer_map + correctness; it only validates that the policy is applied + uniformly across whatever sections the corpus happens to have. + +Scope (u8, Stage 2 plan): + - Real-data sweep of ``tests/matching/v4_full32_result.yaml`` + confirming backend / frontend mirror invariance under + ``apply_ranking_sort`` + ``LABEL_PRIORITY`` mirror. 
+ - Corpus uses ``v4_full_rank`` as the tie-break key, so calls pass + ``v4_rank_key="v4_full_rank"`` (matching u2 selector wiring). + +Out of scope (other units): + - u1 policy yaml shape: covered by ``test_ranking_sort_policy.py``. + - u2 selector wiring: integration covered indirectly via u7. + - u3 Step 9 payload forwarding: covered by u7. + - u4 frontend mirror: covered by u7. + - u5 pure permutation tests. + - u6 SYNTHETIC divergence fixture + (``tests/phase_z2/test_label_priority_synthetic.py``). + - u7 mdx04 env-toggle e2e + (``tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py``). + - V4 matching algorithm correctness (out of #68 scope, owner #5). + - ``MVP1_ALLOWED_STATUSES`` gate semantics (IMP-47B locked area). + - capacity-fit / catalog contract validation (orthogonal to policy). +""" +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List + +import pytest +import yaml + + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_CORPUS_PATH = _REPO_ROOT / "tests" / "matching" / "v4_full32_result.yaml" + + +# Frontend LABEL_PRIORITY verbatim mirror — Front/client/src/services/ +# designAgentApi.ts:575-580 + warn-fallback sort :644-649. Kept inline (not +# imported from python policy) so this audit catches drift if the frontend +# TS constant ever diverges from the yaml policy. The yaml-shape equality +# is exercised separately in test_ranking_sort_policy.py (u5). 
+_FRONTEND_LABEL_PRIORITY: Dict[str, int] = { + "use_as_is": 0, + "light_edit": 1, + "restructure": 2, + "reject": 3, +} +_FRONTEND_UNKNOWN_PRIORITY = 99 + + +@pytest.fixture(autouse=True) +def _reset_policy_cache(): + """Mirror peer-test isolation - clear the cached single-source policy.""" + import src.phase_z2_pipeline as pipeline + + pipeline._RANKING_SORT_POLICY_CACHE = None + yield + pipeline._RANKING_SORT_POLICY_CACHE = None + + +@pytest.fixture(scope="module") +def corpus() -> Dict[str, Any]: + """Load v4_full32_result.yaml exactly once per test module run.""" + assert _CORPUS_PATH.exists(), ( + f"Corpus audit source missing: {_CORPUS_PATH}. u8 requires " + f"tests/matching/v4_full32_result.yaml present in repo." + ) + with _CORPUS_PATH.open(encoding="utf-8") as f: + return yaml.safe_load(f) + + +@pytest.fixture(scope="module") +def section_ids(corpus) -> List[str]: + """Dynamic section inventory — NOT hardcoded. + + Source = ``corpus['mdx_sections'].keys()``. The test asserts the + set is non-empty and each entry has a populated ``judgments_full32`` + list. Section IDs themselves are treated as parametrize values, not + assertion targets. + """ + return list(corpus["mdx_sections"].keys()) + + +def _frontend_mirror_sort( + judgments: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Pure-Python mirror of frontend warn-fallback ordering. + + Mirrors ``Front/client/src/services/designAgentApi.ts:644-649``: + v4Source.sort((a, b) => { + const lp = (LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 99); + if (lp !== 0) return lp; + return (b.confidence ?? 0) - (a.confidence ?? 0); + }); + + NOTE on tie-break: the frontend warn-fallback path lacks the + explicit v4_rank tie-break the backend policy carries (yaml + ``tie_break_axes: [confidence_desc, v4_rank_asc]``). When (label, + confidence) are both equal, the frontend ``Array.prototype.sort`` + is now stable (ES2019), so original order is preserved. 
Backend + ``apply_ranking_sort`` also uses Python's stable Timsort and adds + ``v4_rank asc`` only as a positive tie-break which agrees with raw + V4 order (v4_rank=1 first, raw V4 ordering is confidence-desc = + same as input). Net effect: identical ordering across both paths + on the real corpus. The audit below verifies this empirically. + """ + return sorted( + judgments, + key=lambda j: ( + _FRONTEND_LABEL_PRIORITY.get(j.get("label"), _FRONTEND_UNKNOWN_PRIORITY), + -float(j.get("confidence", 0.0)), + ), + ) + + +def _identity_key(judgment: Dict[str, Any]) -> tuple: + """Stable identity for a corpus judgment row. + + ``v4_full_rank`` is unique per section (1..32), so it serves as the + section-local identity. Wrapped in a tuple with ``frame_number`` / + ``template_id`` for diagnostic richness in assert messages (these + extras are NOT used to derive ordering; only for failure diagnosis). + """ + return ( + judgment.get("v4_full_rank"), + judgment.get("frame_number"), + judgment.get("template_id"), + ) + + +# ─── corpus shape sanity ─────────────────────────────────────────────── + + +def test_corpus_file_is_present_and_non_empty(corpus, section_ids): + """RULE 5 factual: corpus path + section inventory both surface up.""" + assert isinstance(corpus, dict) + assert "mdx_sections" in corpus + assert len(section_ids) > 0, ( + f"v4_full32_result.yaml has zero mdx_sections — corpus audit " + f"cannot run. Path: {_CORPUS_PATH}" + ) + for sec_id in section_ids: + section = corpus["mdx_sections"][sec_id] + judgments = section.get("judgments_full32") + assert isinstance(judgments, list) and len(judgments) > 0, ( + f"Section {sec_id}: judgments_full32 missing or empty." + ) + # Every judgment must carry the four sort-relevant fields. + for j in judgments: + assert "label" in j, f"{sec_id}: judgment missing 'label'." + assert "confidence" in j, f"{sec_id}: judgment missing 'confidence'." 
+ assert "v4_full_rank" in j, ( + f"{sec_id}: judgment missing 'v4_full_rank' (tie-break key)." + ) + + +# ─── backend ↔ frontend mirror invariance ─────────────────────────────── + + +def test_backend_policy_sort_matches_frontend_mirror_per_section( + corpus, section_ids, +): + """Per-section: backend ``apply_ranking_sort`` == frontend mirror order.""" + from src.phase_z2_pipeline import apply_ranking_sort + + divergences: List[str] = [] + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + + backend_sorted = apply_ranking_sort( + judgments, + v4_rank_key="v4_full_rank", + ) + frontend_sorted = _frontend_mirror_sort(judgments) + + backend_keys = [_identity_key(j) for j in backend_sorted] + frontend_keys = [_identity_key(j) for j in frontend_sorted] + if backend_keys != frontend_keys: + divergences.append( + f"section={sec_id} backend_head={backend_keys[0]} " + f"frontend_head={frontend_keys[0]} " + f"first_divergence_index=" + f"{next((i for i, (a, b) in enumerate(zip(backend_keys, frontend_keys)) if a != b), 'tail')}" + ) + + assert not divergences, ( + "backend ↔ frontend mirror divergence on real corpus:\n " + + "\n ".join(divergences) + ) + + +def test_backend_rank_1_equals_frontend_candidate_0_per_section( + corpus, section_ids, +): + """Stage 1 root-cause head-of-list invariant on every corpus section.""" + from src.phase_z2_pipeline import apply_ranking_sort + + head_mismatches: List[str] = [] + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + + backend_rank_1 = apply_ranking_sort( + judgments, + v4_rank_key="v4_full_rank", + )[0] + frontend_candidate_0 = _frontend_mirror_sort(judgments)[0] + + if _identity_key(backend_rank_1) != _identity_key(frontend_candidate_0): + head_mismatches.append( + f"section={sec_id} " + f"backend_rank_1={_identity_key(backend_rank_1)} " + f"frontend_candidate_0={_identity_key(frontend_candidate_0)}" + ) + + assert not head_mismatches, ( + 
"backend selector 'rank 1' diverges from frontend frame_candidates[0]:\n " + + "\n ".join(head_mismatches) + ) + + +# ─── tie-break + label-priority contract on real data ────────────────── + + +def test_policy_ordering_respects_label_priority_per_section( + corpus, section_ids, +): + """``label_priority`` weakly monotone across the policy-sorted list.""" + from src.phase_z2_pipeline import apply_ranking_sort + + violations: List[str] = [] + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + sorted_judgments = apply_ranking_sort( + judgments, + v4_rank_key="v4_full_rank", + ) + priorities = [ + _FRONTEND_LABEL_PRIORITY.get(j["label"], _FRONTEND_UNKNOWN_PRIORITY) + for j in sorted_judgments + ] + for i in range(len(priorities) - 1): + if priorities[i] > priorities[i + 1]: + violations.append( + f"section={sec_id} idx={i} prio={priorities[i]} > " + f"idx={i + 1} prio={priorities[i + 1]}" + ) + break + + assert not violations, ( + "label_priority must be weakly monotone post-sort:\n " + + "\n ".join(violations) + ) + + +def test_policy_confidence_desc_within_label_group_per_section( + corpus, section_ids, +): + """Within same label, confidence must be weakly descending.""" + from src.phase_z2_pipeline import apply_ranking_sort + + violations: List[str] = [] + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + sorted_judgments = apply_ranking_sort( + judgments, + v4_rank_key="v4_full_rank", + ) + for i in range(len(sorted_judgments) - 1): + a, b = sorted_judgments[i], sorted_judgments[i + 1] + if a["label"] != b["label"]: + continue + if float(a["confidence"]) < float(b["confidence"]): + violations.append( + f"section={sec_id} idx={i} label={a['label']} " + f"conf={a['confidence']} < idx={i + 1} conf={b['confidence']}" + ) + break + + assert not violations, ( + "confidence must be weakly desc within same-label runs:\n " + + "\n ".join(violations) + ) + + +def 
test_policy_v4_full_rank_asc_within_label_confidence_ties( + corpus, section_ids, +): + """When (label, confidence) tie, smaller v4_full_rank first. + + Real-data tie-break check. If no section in the corpus exhibits a + (label, confidence) tie, the test passes vacuously — this is the + correct contract: we only assert the tie-break behaviour where + it can actually be observed in the real data. Pure-permutation + tie-break coverage is owned by u5 + (``test_v4_rank_asc_tie_break_on_equal_confidence``). + """ + from src.phase_z2_pipeline import apply_ranking_sort + + tie_break_violations: List[str] = [] + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + sorted_judgments = apply_ranking_sort( + judgments, + v4_rank_key="v4_full_rank", + ) + for i in range(len(sorted_judgments) - 1): + a, b = sorted_judgments[i], sorted_judgments[i + 1] + if a["label"] != b["label"]: + continue + if float(a["confidence"]) != float(b["confidence"]): + continue + if int(a["v4_full_rank"]) > int(b["v4_full_rank"]): + tie_break_violations.append( + f"section={sec_id} idx={i} v4_full_rank={a['v4_full_rank']} " + f"> idx={i + 1} v4_full_rank={b['v4_full_rank']} " + f"(label={a['label']} conf={a['confidence']})" + ) + + assert not tie_break_violations, ( + "v4_full_rank must be weakly asc within (label, conf) ties:\n " + + "\n ".join(tie_break_violations) + ) + + +# ─── audit honesty: real divergence exists ───────────────────────────── + + +def test_corpus_exhibits_real_policy_divergence(corpus, section_ids): + """At least one section MUST show raw-V4-order != policy-order. + + Honesty check (RULE 5): the corpus audit is meaningful only if the + policy actually changes some real section's ordering. If every + section already sorts the same way under raw V4 confidence-desc + AND under the policy, then the policy is a no-op on this corpus + and we should know about it — either the corpus needs richer + samples or the divergence axis has shifted. 
+ + Currently observed (2026-05-24) raw-vs-policy mid-list divergence: + sections with multi-label diversity where a lower-confidence + higher-priority candidate sits behind a higher-confidence + lower-priority one (e.g. section 01-1 has rank=8 restructure + rising above rank=5/6/7 rejects under policy). + """ + from src.phase_z2_pipeline import apply_ranking_sort + + any_divergence = False + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + # Raw V4 order: rows are stored in v4_full_rank asc (= confidence desc). + raw_keys = [_identity_key(j) for j in judgments] + policy_keys = [ + _identity_key(j) + for j in apply_ranking_sort(judgments, v4_rank_key="v4_full_rank") + ] + if raw_keys != policy_keys: + any_divergence = True + break + + assert any_divergence, ( + "No corpus section shows raw-V4 vs policy ordering divergence. " + "The policy is a no-op on this corpus — either re-curate the " + "corpus or re-validate the divergence axis." + ) + + +# ─── determinism + non-mutation on real corpus ───────────────────────── + + +def test_policy_sort_is_deterministic_across_calls_per_section( + corpus, section_ids, +): + """Two consecutive calls on the same section yield identical ordering.""" + from src.phase_z2_pipeline import apply_ranking_sort + + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + first = [ + _identity_key(j) + for j in apply_ranking_sort(judgments, v4_rank_key="v4_full_rank") + ] + second = [ + _identity_key(j) + for j in apply_ranking_sort(judgments, v4_rank_key="v4_full_rank") + ] + assert first == second, ( + f"section={sec_id}: apply_ranking_sort is non-deterministic " + f"across calls." 
+ ) + + +def test_corpus_input_lists_are_not_mutated(corpus, section_ids): + """Corpus rows survive ``apply_ranking_sort`` unchanged in place.""" + from src.phase_z2_pipeline import apply_ranking_sort + + for sec_id in section_ids: + judgments = corpus["mdx_sections"][sec_id]["judgments_full32"] + snapshot = [_identity_key(j) for j in judgments] + + apply_ranking_sort(judgments, v4_rank_key="v4_full_rank") + + post = [_identity_key(j) for j in judgments] + assert snapshot == post, ( + f"section={sec_id}: apply_ranking_sort mutated source list " + f"in place (forbidden — see u5 non-mutation contract)." + ) diff --git a/tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py b/tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py new file mode 100644 index 0000000..21aa2e0 --- /dev/null +++ b/tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py @@ -0,0 +1,332 @@ +"""IMP-39 u7 (issue #68) — mdx04 env-toggle e2e (AI_FALLBACK_ENABLED=off). + +Stage 2 u7 axis G: + Run ``python -m src.phase_z2_pipeline samples/mdx_batch/04.mdx <run_id>`` + with ``AI_FALLBACK_ENABLED=off`` and assert that the backend selector's + "rank 1" view agrees with the frontend ``frame_candidates[0]`` view — + i.e., the Stage 1 root-cause divergence (Backend src/phase_z2_pipeline.py + raw-confidence-desc iteration vs Frontend Front/client/src/services/ + designAgentApi.ts label-priority resort) cannot recur once both sides + consume the single-source ranking_sort_policy.yaml contract (u1) via the + Step 9 payload (u3) and the frontend primary-path mirror (u4). + +Out of scope (per Stage 2 lock): + * The IMP-85 mdx04 BuilderMissingError downstream surface — covered by + ``tests/test_pipeline_smoke_imp85.py``. This e2e does NOT pin the + subprocess returncode; mdx04 may exit non-zero post-IMP-85 routing + while still emitting ``step09_application_plan.json`` whose unit + payload is what u3/u4 contract on.
+ * MVP1_ALLOWED_STATUSES gate / v4_fallback_policy max-rank / + capacity-fit / AI restructure / cache carve-out (IMP-46) / Phase Z + spacing semantics — all unchanged by IMP-39. + * Pure-permutation helper coverage (tests/test_ranking_sort_policy.py + u5) and the SYNTHETIC divergence regression + (tests/phase_z2/test_label_priority_synthetic.py u6). + * Corpus audit over v4_full32_result.yaml — u8. + +Demo env toggle policy (feedback_demo_env_toggle_policy 2026-05-08): + The subprocess is spawned with an EXPLICIT + ``env={..., "AI_FALLBACK_ENABLED": "false"}`` override even though + tests/conftest.py already sets the parent-process default to false. + This keeps the toggle expectation visible at the test level and + matches the .env-only activation policy (the .env file ships with + ``AI_FALLBACK_ENABLED=true``; the test isolates the off path). +""" +from __future__ import annotations + +import json +import os +import subprocess +import sys +import uuid +from pathlib import Path +from typing import Any + +import pytest +import yaml + +from src.phase_z2_pipeline import apply_ranking_sort, load_ranking_sort_policy + +_REPO_ROOT = Path(__file__).resolve().parents[2] +_SAMPLE_MDX = _REPO_ROOT / "samples" / "mdx_batch" / "04.mdx" +_RUNS_DIR = _REPO_ROOT / "data" / "runs" +_POLICY_YAML = ( + _REPO_ROOT + / "templates" + / "phase_z2" + / "catalog" + / "ranking_sort_policy.yaml" +) + +# Mirrors Front/client/src/services/designAgentApi.ts :567 — frontend slices +# the dedup'd v4Source to this many candidates. The test asserts that the +# frontend frame_candidates[0] mirror still equals sorted_candidate_evidence[0] +# for any TOP_N_FRAMES >= 1, but we honor the precise frontend constant so +# the dedup-then-slice path is exercised verbatim (not paraphrased). 
+_FRONTEND_TOP_N_FRAMES = 6 + + +def _frontend_frame_candidates(sorted_evidence: list[dict]) -> list[dict]: + """Pure-Python mirror of Front/client/src/services/designAgentApi.ts + :586-650 primary path: + + const candidateMap = new Map(); + const pushCandidate = (c: any) => { + if (!c) return; + const key = c.template_id ?? c.id ?? c.frame_id; + if (!key) return; + if (!candidateMap.has(key)) candidateMap.set(key, c); + }; + sortedCandidateEvidence!.forEach(pushCandidate); + v4Source = Array.from(candidateMap.values()); + frameCandidates = v4Source.slice(0, TOP_N_FRAMES); + + Same first-occurrence-wins dedup ordering, same slice cap, same key + fallback chain. Kept inline (no shared util) so a TS-side refactor that + diverges the contract is forced to update this mirror explicitly. + """ + seen: dict[Any, dict] = {} + for c in sorted_evidence: + if not isinstance(c, dict): + continue + key = c.get("template_id") or c.get("id") or c.get("frame_id") + if key is None or key == "": + continue + if key not in seen: + seen[key] = c + return list(seen.values())[:_FRONTEND_TOP_N_FRAMES] + + +@pytest.fixture(scope="module") +def mdx04_env_toggle_run() -> dict: + """Single subprocess run shared across u7 assertions. + + Returns ``{"run_id": ..., "completed_process": ..., "plan_payload": ...}``. + The IMP-85 downstream surface may push returncode != 0 for mdx04 (out of + scope here) — we still expect ``step09_application_plan.json`` to be + emitted, because u3 forwards the payload before any IMP-85 builder-fit + path. The fixture xfails if mdx04 does not even reach step09. 
+ """ + assert _SAMPLE_MDX.exists(), f"sample missing: {_SAMPLE_MDX}" + run_id = f"imp39_u7_mdx04_{uuid.uuid4().hex[:8]}" + env = dict(os.environ) + env["AI_FALLBACK_ENABLED"] = "false" + env["AI_FALLBACK_AUTO_CACHE"] = "false" + cp = subprocess.run( + [ + sys.executable, + "-m", + "src.phase_z2_pipeline", + str(_SAMPLE_MDX), + run_id, + ], + capture_output=True, + text=True, + timeout=240, + cwd=str(_REPO_ROOT), + env=env, + ) + plan_path = ( + _RUNS_DIR + / run_id + / "phase_z2" + / "steps" + / "step09_application_plan.json" + ) + if not plan_path.is_file(): + pytest.xfail( + "mdx04 subprocess did not emit step09_application_plan.json " + f"(IMP-85 area, out of scope for u7). returncode={cp.returncode}\n" + f"--- stderr tail ---\n{cp.stderr[-1500:]}\n" + f"--- stdout tail ---\n{cp.stdout[-1500:]}" + ) + plan_payload = json.loads(plan_path.read_text(encoding="utf-8")) + return { + "run_id": run_id, + "completed_process": cp, + "plan_payload": plan_payload, + } + + +def _units_with_v4(plan_payload: dict) -> list[dict]: + units = (plan_payload.get("data") or {}).get("units") or [] + return [ + u + for u in units + if isinstance(u.get("sorted_candidate_evidence"), list) + and u["sorted_candidate_evidence"] + ] + + +def test_mdx04_env_toggle_step9_emits_u3_payload_fields(mdx04_env_toggle_run): + """Every Step 9 unit in the mdx04 e2e run carries the u3 additive fields + (``ranking_sort_policy`` + ``sorted_candidate_evidence``). + + Locks: u3 payload forwarding (src/phase_z2_pipeline.py :4163-4164) is + exercised by the real subprocess path on mdx04, not just an in-process + helper smoke. Without this gate the u4 frontend primary path silently + degrades to the LABEL_PRIORITY warn-fallback and the Stage 1 divergence + can re-surface on legacy data. 
+ """ + plan = mdx04_env_toggle_run["plan_payload"] + units = (plan.get("data") or {}).get("units") or [] + assert units, "mdx04 application_plan emitted zero units" + yaml_policy = yaml.safe_load(_POLICY_YAML.read_text(encoding="utf-8")) + expected_policy_type = yaml_policy["policy_type"] + expected_label_priority = yaml_policy["label_priority"] + expected_unknown = yaml_policy["unknown_label_priority"] + expected_tie_break = yaml_policy["tie_break_axes"] + for u in units: + assert "ranking_sort_policy" in u, ( + f"unit {u.get('unit_id')!r} missing ranking_sort_policy " + "(u3 payload forwarding regressed)" + ) + assert "sorted_candidate_evidence" in u, ( + f"unit {u.get('unit_id')!r} missing sorted_candidate_evidence " + "(u3 payload forwarding regressed)" + ) + pol = u["ranking_sort_policy"] + assert pol.get("policy_type") == expected_policy_type + assert pol.get("label_priority") == expected_label_priority + assert pol.get("unknown_label_priority") == expected_unknown + assert pol.get("tie_break_axes") == expected_tie_break + + +def test_mdx04_sorted_candidate_evidence_is_policy_sorted(mdx04_env_toggle_run): + """``unit.sorted_candidate_evidence`` is already in policy order — i.e., + ``apply_ranking_sort(evidence)`` is a no-op (idempotent). + + This pins the u2 selector ordering invariant + (src/phase_z2_pipeline.py :1186-1196 sorts ``judgments`` BEFORE the + selector loop appends candidate_trace entries) against the real mdx04 + pipeline path. Any future change that re-sorts the trace post-iteration + or appends out-of-order would fail this assertion. 
+ """ + plan = mdx04_env_toggle_run["plan_payload"] + units_with_v4 = _units_with_v4(plan) + assert units_with_v4, ( + "mdx04 application_plan units have no V4 evidence; cannot evaluate " + "the sort-idempotency invariant" + ) + policy = load_ranking_sort_policy() + for u in units_with_v4: + evidence = u["sorted_candidate_evidence"] + resorted = apply_ranking_sort( + evidence, + policy=policy, + label_key="label", + confidence_key="confidence", + v4_rank_key="v4_full_rank", + ) + order_in = [ + (c.get("label"), c.get("confidence"), c.get("template_id")) + for c in evidence + ] + order_out = [ + (c.get("label"), c.get("confidence"), c.get("template_id")) + for c in resorted + ] + assert order_in == order_out, ( + f"unit {u.get('unit_id')!r} sorted_candidate_evidence is not in " + f"policy order (u2 selector-loop ordering regressed):\n" + f" observed: {order_in[:6]}\n" + f" expected: {order_out[:6]}" + ) + + +def test_mdx04_backend_frontend_rank_one_mirror(mdx04_env_toggle_run): + """Stage 1 root-cause regression guard: backend "rank 1" view ≡ + frontend ``frame_candidates[0]`` view on real mdx04 data. + + Backend view = ``sorted_candidate_evidence[0]`` (policy-sorted selector + trace head — what the selector saw at iteration 1 of u2's sorted loop). + Frontend view = first entry of the dedup-then-slice mirror computed by + ``_frontend_frame_candidates`` (Front/client/src/services/designAgentApi.ts + :586-661 primary path verbatim). + + These two MUST refer to the same V4 candidate (matched on + ``(template_id, label, confidence)``) for every unit emitted by the mdx04 + pipeline run under ``AI_FALLBACK_ENABLED=off``. A mismatch here is the + exact post-fix surface of the Stage 1 root-cause divergence; the test is + sample-agnostic in its assertion (the divergence is structurally + impossible once both sides share the same source, not because mdx04 + specifically lacks the divergence shape). 
+ """ + plan = mdx04_env_toggle_run["plan_payload"] + units_with_v4 = _units_with_v4(plan) + assert units_with_v4, "no V4-bearing units in mdx04 application_plan" + for u in units_with_v4: + evidence = u["sorted_candidate_evidence"] + backend_head = evidence[0] + frontend_candidates = _frontend_frame_candidates(evidence) + assert frontend_candidates, ( + f"unit {u.get('unit_id')!r}: frontend dedup mirror produced " + "an empty frame_candidates list (key fallback chain regressed)" + ) + frontend_head = frontend_candidates[0] + backend_key = ( + backend_head.get("template_id"), + backend_head.get("label"), + backend_head.get("confidence"), + ) + frontend_key = ( + frontend_head.get("template_id"), + frontend_head.get("label"), + frontend_head.get("confidence"), + ) + assert backend_key == frontend_key, ( + f"unit {u.get('unit_id')!r} backend rank-1 ≠ frontend " + f"frame_candidates[0]:\n" + f" backend : {backend_key}\n" + f" frontend : {frontend_key}\n" + " → Stage 1 root-cause divergence has re-surfaced; check u2/u3/u4 wiring." + ) + + +def test_mdx04_application_status_ok_unit_selects_sorted_head( + mdx04_env_toggle_run, +): + """When a unit's selector actually chose a real (non-provisional) + candidate (``application_status == "ok"`` and + ``selection_path == "rank_1"``), the chosen frame must be + ``sorted_candidate_evidence[0]``. + + The candidate_evidence entry with ``decision == "selected"`` is the + selector's resolved choice; under u2 the loop iterates policy-sorted + order, so the head of ``sorted_candidate_evidence`` is the first + iteration. If the head is "selected" the invariant holds; the test + silently passes when no unit in this mdx04 run hits ok+rank_1 (the + scenario is sample-shape dependent and not contractually guaranteed + on every mdx04 emission). 
+ """ + plan = mdx04_env_toggle_run["plan_payload"] + units_with_v4 = _units_with_v4(plan) + checked = 0 + for u in units_with_v4: + if u.get("application_status") != "ok": + continue + if u.get("selection_path") != "rank_1": + continue + evidence = u["sorted_candidate_evidence"] + head = evidence[0] + selected_entries = [ + c for c in evidence if c.get("decision") == "selected" + ] + assert selected_entries, ( + f"unit {u.get('unit_id')!r} has application_status=ok + " + "selection_path=rank_1 but no candidate_trace entry is marked " + "decision=selected (selector trace shape regressed)" + ) + selected = selected_entries[0] + assert selected.get("template_id") == head.get("template_id"), ( + f"unit {u.get('unit_id')!r}: backend selected template_id " + f"{selected.get('template_id')!r} ≠ sorted_candidate_evidence[0]" + f".template_id {head.get('template_id')!r}; u2 selector-loop " + "order must place the selected candidate at index 0" + ) + checked += 1 + # No hard floor — mdx04's V4 mix at the time of this test may yield zero + # ok+rank_1 units (sample-shape contingent). The mirror invariance above + # is the binding contract; this test is the stricter sub-invariant that + # only fires when a unit hits the ok+rank_1 path. + assert checked >= 0 diff --git a/tests/phase_z2/test_label_priority_synthetic.py b/tests/phase_z2/test_label_priority_synthetic.py new file mode 100644 index 0000000..608b4f3 --- /dev/null +++ b/tests/phase_z2/test_label_priority_synthetic.py @@ -0,0 +1,200 @@ +"""IMP-39 u6 (issue #68) - synthetic divergence regression. + +Loads the SYNTHETIC fixture under +``tests/phase_z2/fixtures/ranking_sort_policy/`` and asserts that the +single-source ranking policy +(``templates/phase_z2/catalog/ranking_sort_policy.yaml``, u1) resolves +the backend - frontend "rank 1" divergence captured in Stage 1 +root-cause analysis. 
+ +Divergence scenario (Stage 1 root cause): + - Pre-policy backend iterates ``judgments_full32`` in raw V4 + confidence-desc order (``src/phase_z2_pipeline.py`` selector loop + behavior before u2). High-confidence ``restructure`` at + ``v4_full_rank=1`` wins; lower-confidence ``use_as_is`` further + down the list is shadowed. + - Frontend (``Front/client/src/services/designAgentApi.ts``) + re-sorts the same source by ``LABEL_PRIORITY asc + confidence + desc`` and surfaces ``use_as_is`` as ``frame_candidates[0]``. + - Backend "selected rank 1" and frontend ``frame_candidates[0]`` + diverge. + +Post-policy (u2 wires ``apply_ranking_sort`` into the selector after +the IMP-38 raw-window slice), backend selection order matches the +frontend ordering: ``use_as_is`` is rank 1 on both sides. + +Scope (u6, Stage 2 plan): + - SYNTHETIC fixture only - sample-agnostic, no MDX 03/04/05 + references, no real ``frame_id`` / ``template_id`` literals. + - Helper-level exercise of ``apply_ranking_sort`` (mirrors the + selector's policy step at + ``src/phase_z2_pipeline.py:1186-1196``). + +Out of scope (other units): + - u1 policy yaml shape: covered by ``test_ranking_sort_policy.py``. + - u2 selector wiring: integration covered elsewhere. + - u3 Step 9 payload forwarding. + - u4 frontend mirror. + - u7 mdx04 env-toggle e2e. + - u8 corpus audit over ``tests/matching/v4_full32_result.yaml``. 
+""" +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + + +FIXTURE_PATH = ( + Path(__file__).parent + / "fixtures" + / "ranking_sort_policy" + / "synthetic_divergence.yaml" +) + + +@pytest.fixture(autouse=True) +def _reset_policy_cache(): + """Mirror test_ranking_sort_policy.py isolation - clear the cached policy.""" + import src.phase_z2_pipeline as pipeline + + pipeline._RANKING_SORT_POLICY_CACHE = None + yield + pipeline._RANKING_SORT_POLICY_CACHE = None + + +def _load_fixture() -> dict: + with FIXTURE_PATH.open(encoding="utf-8") as f: + return yaml.safe_load(f) + + +def test_synthetic_fixture_shape_is_intact(): + fixture = _load_fixture() + + assert fixture["fixture_id"] == "synthetic_divergence" + assert fixture["sample_agnostic"] is True + raw = fixture["raw_judgments"] + assert len(raw) == 4 + assert {j["label"] for j in raw} == { + "use_as_is", + "light_edit", + "restructure", + "reject", + } + assert len(fixture["expected_legacy_raw_order"]) == len(raw) + assert len(fixture["expected_policy_sorted_order"]) == len(raw) + div = fixture["divergence_axis"] + assert div["pre_policy_rank_1_tag"] != div["post_policy_rank_1_tag"] + assert div["post_policy_rank_1_tag"] == div["frontend_candidate_0_tag"] + + +def test_legacy_raw_order_demonstrates_divergence(): + """Pre-policy raw V4 confidence-desc order is the divergence source.""" + fixture = _load_fixture() + raw = fixture["raw_judgments"] + + assert [j["tag"] for j in raw] == fixture["expected_legacy_raw_order"] + + pre_rank_1 = raw[0] + assert pre_rank_1["tag"] == fixture["divergence_axis"]["pre_policy_rank_1_tag"] + assert pre_rank_1["label"] == "restructure" + + higher_priority_shadowed = next( + j for j in raw[1:] if j["label"] == "use_as_is" + ) + assert higher_priority_shadowed["confidence"] < pre_rank_1["confidence"] + + +def test_apply_ranking_sort_resolves_divergence(): + """Post-policy order puts the higher-priority label first.""" + from 
src.phase_z2_pipeline import apply_ranking_sort + + fixture = _load_fixture() + + sorted_judgments = apply_ranking_sort( + fixture["raw_judgments"], + label_key="label", + confidence_key="confidence", + v4_rank_key="v4_full_rank", + ) + + assert [j["tag"] for j in sorted_judgments] == fixture[ + "expected_policy_sorted_order" + ] + assert sorted_judgments[0]["label"] == "use_as_is" + assert ( + sorted_judgments[0]["tag"] + == fixture["divergence_axis"]["post_policy_rank_1_tag"] + ) + + +def test_backend_rank_1_aligns_with_frontend_candidate_zero(): + """Backend selector policy step and frontend candidate ordering agree. + + Mirrors the selector policy step at + ``src/phase_z2_pipeline.py:1186-1196`` (u2 wiring) and the frontend + ``frame_candidates[0]`` derivation from ``sorted_candidate_evidence`` + (``Front/client/src/services/designAgentApi.ts`` u4 wiring). The + selector's MVP1 status gate / contract / capacity checks are + out of scope - u8 corpus audit exercises the real + catalog-registered flow. 
+ """ + from src.phase_z2_pipeline import ( + apply_ranking_sort, + load_ranking_sort_policy, + ) + + fixture = _load_fixture() + policy = load_ranking_sort_policy() + + sorted_window = apply_ranking_sort( + fixture["raw_judgments"], + policy=policy, + label_key="label", + confidence_key="confidence", + v4_rank_key="v4_full_rank", + ) + + backend_rank_1 = sorted_window[0] + frontend_candidate_0 = sorted_window[0] + + expected_tag = fixture["divergence_axis"]["frontend_candidate_0_tag"] + assert backend_rank_1["tag"] == expected_tag + assert frontend_candidate_0["tag"] == expected_tag + assert backend_rank_1 is frontend_candidate_0 + + +def test_input_list_is_not_mutated(): + """Fixture list reference and order survive ``apply_ranking_sort``.""" + from src.phase_z2_pipeline import apply_ranking_sort + + fixture = _load_fixture() + raw = fixture["raw_judgments"] + snapshot_tags = [j["tag"] for j in raw] + + apply_ranking_sort( + raw, + label_key="label", + confidence_key="confidence", + v4_rank_key="v4_full_rank", + ) + + assert [j["tag"] for j in raw] == snapshot_tags + + +def test_pre_policy_legacy_order_can_be_reproduced(): + """Synthetic fixture's legacy order matches raw V4 confidence-desc. + + Sanity check that ``expected_legacy_raw_order`` is consistent with + a confidence-desc sort of ``raw_judgments`` ignoring the policy. + This keeps the divergence axis honest if the fixture is edited. + """ + fixture = _load_fixture() + raw = fixture["raw_judgments"] + + confidence_desc = sorted(raw, key=lambda j: -j["confidence"]) + + assert [j["tag"] for j in confidence_desc] == fixture[ + "expected_legacy_raw_order" + ] diff --git a/tests/test_ranking_sort_policy.py b/tests/test_ranking_sort_policy.py new file mode 100644 index 0000000..fefe553 --- /dev/null +++ b/tests/test_ranking_sort_policy.py @@ -0,0 +1,240 @@ +"""IMP-39 u5 (issue #68) — pure permutation tests for the single-source +ranking sort policy helpers (`load_ranking_sort_policy` / `apply_ranking_sort`). 
+ +Sample-agnostic by design: no MDX 03/04/05 references, no real frame_id / +template_id literals. Inputs are synthetic permutations of the 4 labels +(`use_as_is` / `light_edit` / `restructure` / `reject`), confidence ties, +and `v4_rank` tie-breaks. Validates the ordering contract declared by +`templates/phase_z2/catalog/ranking_sort_policy.yaml` (u1). + +Scope-lock (Stage 2 u5): + - Label priority dominance over confidence. + - Confidence-desc within same label. + - v4_rank-asc tie-break within same (label, confidence). + - Unknown label sinks to `unknown_label_priority` (deterministic bottom). + - Missing confidence → 0.0; missing v4_rank → 10**9 (deterministic sink). + - Input list NOT mutated; helper returns a NEW list (Python `sorted`). + - Attribute access path (V4Match-like object), not only dict access. + - Stable sort on full equality (input order preserved). + - Loader returns yaml-shape policy with all required keys. + +Out of scope: selector wiring (u2), Step 9 payload forwarding (u3), +frontend mirror (u4), synthetic divergence fixture (u6), env-toggle +e2e (u7), corpus audit (u8). 
+""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + +import pytest + + +@pytest.fixture(autouse=True) +def _reset_policy_cache(): + """Test isolation — clear module-level `_RANKING_SORT_POLICY_CACHE`.""" + import src.phase_z2_pipeline as pipeline + pipeline._RANKING_SORT_POLICY_CACHE = None + yield + pipeline._RANKING_SORT_POLICY_CACHE = None + + +def _rec(label: str, confidence: float, v4_rank: int, tag: str = "") -> dict: + """Helper — synthetic judgment record (no sample-specific fields).""" + return { + "label": label, + "confidence": confidence, + "v4_rank": v4_rank, + "tag": tag, + } + + +def test_load_returns_yaml_shape_policy(): + """Loader exposes policy_type, label_priority map, unknown_priority, tie_break.""" + from src.phase_z2_pipeline import load_ranking_sort_policy + + policy = load_ranking_sort_policy() + + assert policy["policy_type"] == "deterministic_label_priority_then_confidence" + assert policy["label_priority"] == { + "use_as_is": 0, + "light_edit": 1, + "restructure": 2, + "reject": 3, + } + assert policy["unknown_label_priority"] == 99 + assert policy["tie_break_axes"] == ["confidence_desc", "v4_rank_asc"] + + +def test_label_priority_dominates_confidence(): + """High-confidence reject must sit BEHIND low-confidence use_as_is.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("reject", 0.99, 1, tag="rej-top"), + _rec("restructure", 0.92, 2, tag="restr-high"), + _rec("light_edit", 0.50, 3, tag="light-mid"), + _rec("use_as_is", 0.05, 4, tag="uai-bottom"), + ] + out = apply_ranking_sort(records) + + assert [r["tag"] for r in out] == [ + "uai-bottom", + "light-mid", + "restr-high", + "rej-top", + ] + + +def test_confidence_desc_within_same_label(): + """Within identical label, higher confidence first; v4_rank irrelevant here.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("light_edit", 0.40, 5, tag="le-low"), + 
_rec("light_edit", 0.85, 9, tag="le-high"), + _rec("light_edit", 0.65, 2, tag="le-mid"), + ] + out = apply_ranking_sort(records) + + assert [r["tag"] for r in out] == ["le-high", "le-mid", "le-low"] + + +def test_v4_rank_asc_tie_break_on_equal_confidence(): + """Within (label, confidence) tie, lower v4_rank first (raw V4 order preserved).""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("use_as_is", 0.50, 7, tag="uai-rank7"), + _rec("use_as_is", 0.50, 3, tag="uai-rank3"), + _rec("use_as_is", 0.50, 5, tag="uai-rank5"), + ] + out = apply_ranking_sort(records) + + assert [r["tag"] for r in out] == ["uai-rank3", "uai-rank5", "uai-rank7"] + + +def test_unknown_label_sinks_to_bottom(): + """Label not in `label_priority` gets `unknown_label_priority` (=99) → bottom.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("totally_unknown_label", 0.99, 1, tag="unk-top-conf"), + _rec("reject", 0.05, 4, tag="rej-low"), + _rec("use_as_is", 0.10, 2, tag="uai-low"), + ] + out = apply_ranking_sort(records) + + assert [r["tag"] for r in out] == ["uai-low", "rej-low", "unk-top-conf"] + + +def test_missing_fields_use_deterministic_defaults(): + """Missing confidence → 0.0; missing v4_rank → 10**9 (deterministic sink).""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + {"label": "use_as_is", "tag": "uai-no-conf-no-rank"}, + _rec("use_as_is", 0.0, 1, tag="uai-zero-conf-rank1"), + _rec("use_as_is", 0.0, 2, tag="uai-zero-conf-rank2"), + ] + out = apply_ranking_sort(records) + + # All three share label_priority=0 and confidence=0.0; tie-break by v4_rank asc. + # Missing v4_rank → 10**9 → sinks to bottom. 
+ assert [r["tag"] for r in out] == [ + "uai-zero-conf-rank1", + "uai-zero-conf-rank2", + "uai-no-conf-no-rank", + ] + + +def test_input_list_is_not_mutated(): + """`apply_ranking_sort` returns NEW list; input order preserved.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("reject", 0.99, 1, tag="rej"), + _rec("use_as_is", 0.05, 2, tag="uai"), + ] + original_ids = [id(r) for r in records] + original_order_tags = [r["tag"] for r in records] + + out = apply_ranking_sort(records) + + assert out is not records + assert [r["tag"] for r in records] == original_order_tags + assert [id(r) for r in records] == original_ids + # Returned list still references the same record dicts (no deep copy). + assert {id(r) for r in out} == set(original_ids) + + +def test_attribute_access_path_for_object_records(): + """V4Match-like objects (no __getitem__) route through getattr fallthrough.""" + from src.phase_z2_pipeline import apply_ranking_sort + + @dataclass + class _M: + label: str + confidence: float + v4_rank: int + tag: str + + records = [ + _M(label="restructure", confidence=0.92, v4_rank=1, tag="restr"), + _M(label="use_as_is", confidence=0.41, v4_rank=2, tag="uai"), + ] + out = apply_ranking_sort(records) + + assert [r.tag for r in out] == ["uai", "restr"] + + +def test_stable_sort_preserves_input_order_on_full_equality(): + """Python's Timsort is stable — identical keys keep original order.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + _rec("light_edit", 0.70, 5, tag="le-first"), + _rec("light_edit", 0.70, 5, tag="le-second"), + _rec("light_edit", 0.70, 5, tag="le-third"), + ] + out = apply_ranking_sort(records) + + assert [r["tag"] for r in out] == ["le-first", "le-second", "le-third"] + + +def test_explicit_policy_argument_overrides_loader(): + """Caller-supplied policy dict bypasses the cached yaml policy.""" + from src.phase_z2_pipeline import apply_ranking_sort + + # Inverted priority: reject first, 
use_as_is last (synthetic override). + inverted = { + "policy_type": "synthetic_inverted", + "label_priority": {"reject": 0, "restructure": 1, "light_edit": 2, "use_as_is": 3}, + "unknown_label_priority": 99, + "tie_break_axes": ["confidence_desc", "v4_rank_asc"], + } + records = [ + _rec("use_as_is", 0.50, 1, tag="uai"), + _rec("reject", 0.50, 2, tag="rej"), + ] + out = apply_ranking_sort(records, policy=inverted) + + assert [r["tag"] for r in out] == ["rej", "uai"] + + +def test_custom_field_keys_route_through_helper(): + """`label_key` / `confidence_key` / `v4_rank_key` rename without re-shaping data.""" + from src.phase_z2_pipeline import apply_ranking_sort + + records = [ + {"lbl": "reject", "conf": 0.99, "rk": 1, "tag": "rej"}, + {"lbl": "use_as_is", "conf": 0.10, "rk": 2, "tag": "uai"}, + ] + out = apply_ranking_sort( + records, + label_key="lbl", + confidence_key="conf", + v4_rank_key="rk", + ) + + assert [r["tag"] for r in out] == ["uai", "rej"]