feat(#68): IMP-39 u1~u8 ranking_sort_policy single-source + backend↔frontend label-priority mirror
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 23s
Some checks failed
Multi-MDX Regression (IMP-91) / multi-mdx-regression (push) Failing after 23s
u1: templates/phase_z2/catalog/ranking_sort_policy.yaml — single-source policy
(label_priority asc {use_as_is:0, light_edit:1, restructure:2, reject:3}
+ confidence desc + v4_rank asc tie-break).
u2: src/phase_z2_pipeline.py — apply_ranking_sort helper + lookup_v4_match_with_fallback
applies policy AFTER IMP-38 raw-window selection (raw default_window + usable_count
preserved on RAW all_judgments).
u3: src/phase_z2_pipeline.py — _build_application_plan_unit forwards ranking_sort_policy
+ sorted_candidate_evidence into Step 9 payload.
u4: Front/client/src/services/designAgentApi.ts — frame_candidates builder reads
unit.sorted_candidate_evidence + unit.ranking_sort_policy first; local LABEL_PRIORITY
retained only on warn-fallback path.
u5: tests/test_ranking_sort_policy.py — pure permutation coverage (sample-agnostic).
u6: tests/phase_z2/test_label_priority_synthetic.py + fixtures/ranking_sort_policy/
synthetic_divergence.yaml — low-conf use_as_is behind high-conf restructure.
u7: tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py — samples/mdx_batch/04.mdx with
AI_FALLBACK_ENABLED=off; backend selected_v4_rank == frontend frame_candidates[0].
u8: tests/phase_z2/test_imp39_corpus_audit.py — real corpus sweep over
tests/matching/v4_full32_result.yaml (10 MDX sections); section IDs loaded
dynamically (RULE 0 / RULE 7 sample-agnostic).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -565,6 +565,13 @@ export async function loadRun(runId: string): Promise<LoadRunResult> {
|
|||||||
// sort 우선순위 = label (use_as_is > light_edit > restructure > reject) + confidence desc.
|
// sort 우선순위 = label (use_as_is > light_edit > restructure > reject) + confidence desc.
|
||||||
// 모두 reject 인 경우 confidence desc 만 적용 (사용자 명시).
|
// 모두 reject 인 경우 confidence desc 만 적용 (사용자 명시).
|
||||||
const TOP_N_FRAMES = 6;
|
const TOP_N_FRAMES = 6;
|
||||||
|
// IMP-39 u4 (issue #68) — local LABEL_PRIORITY is now a documentation
|
||||||
|
// mirror of templates/phase_z2/catalog/ranking_sort_policy.yaml (u1).
|
||||||
|
// Primary ordering arrives pre-sorted from the backend selector
|
||||||
|
// (src/phase_z2_pipeline.py lookup_v4_match_with_fallback :1186-1196 +
|
||||||
|
// _build_application_plan_unit u3 payload fields). This constant is read
|
||||||
|
// ONLY on the warn-fallback path below (legacy fixtures pre-u3 / payload
|
||||||
|
// missing). Kept verbatim so the fallback ordering matches u1/u2 contract.
|
||||||
const LABEL_PRIORITY: Record<string, number> = {
|
const LABEL_PRIORITY: Record<string, number> = {
|
||||||
use_as_is: 0,
|
use_as_is: 0,
|
||||||
light_edit: 1,
|
light_edit: 1,
|
||||||
@@ -576,9 +583,6 @@ export async function loadRun(runId: string): Promise<LoadRunResult> {
|
|||||||
// 2) unit.v4_all_judgments (pre-IMP-05 audit array)
|
// 2) unit.v4_all_judgments (pre-IMP-05 audit array)
|
||||||
// 3) unit.v4_candidates (legacy minimal)
|
// 3) unit.v4_candidates (legacy minimal)
|
||||||
// fallback_chain alias is intentionally NOT read (Stage 2 guardrail).
|
// fallback_chain alias is intentionally NOT read (Stage 2 guardrail).
|
||||||
const candidateEvidence = Array.isArray(unit.candidate_evidence)
|
|
||||||
? unit.candidate_evidence
|
|
||||||
: [];
|
|
||||||
const candidateMap = new Map<string, any>();
|
const candidateMap = new Map<string, any>();
|
||||||
const pushCandidate = (c: any) => {
|
const pushCandidate = (c: any) => {
|
||||||
if (!c) return;
|
if (!c) return;
|
||||||
@@ -586,15 +590,64 @@ export async function loadRun(runId: string): Promise<LoadRunResult> {
|
|||||||
if (!key) return;
|
if (!key) return;
|
||||||
if (!candidateMap.has(key)) candidateMap.set(key, c);
|
if (!candidateMap.has(key)) candidateMap.set(key, c);
|
||||||
};
|
};
|
||||||
candidateEvidence.forEach(pushCandidate);
|
|
||||||
(unit.v4_all_judgments ?? []).forEach(pushCandidate);
|
// IMP-39 u4 (issue #68) — primary path: consume the backend Step 9
|
||||||
(unit.v4_candidates ?? []).forEach(pushCandidate);
|
// payload as the single source of ordering truth.
|
||||||
const rawSource = Array.from(candidateMap.values());
|
// • ``unit.sorted_candidate_evidence`` = policy-sorted selector trace
|
||||||
const v4Source = [...rawSource].sort((a: any, b: any) => {
|
// (src/phase_z2_pipeline.py :4163, alias of selection_trace[
|
||||||
const lp = (LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 99);
|
// "candidates"] sorted by u2 at :1186-1196). Same IMP-05 L2 schema
|
||||||
if (lp !== 0) return lp;
|
// consumed below (template_id, label, confidence, frame_number,
|
||||||
return (b.confidence ?? 0) - (a.confidence ?? 0);
|
// frame_id, rank, catalog_registered, capacity_fit, route_hint, ...).
|
||||||
});
|
// • ``unit.ranking_sort_policy`` = full single-source policy dict
|
||||||
|
// (policy_type / label_priority / unknown_label_priority /
|
||||||
|
// tie_break_axes) forwarded for telemetry + fallback parity check.
|
||||||
|
// When both are present we feed sorted_candidate_evidence through the
|
||||||
|
// existing dedup map (first occurrence wins, mirrors backend
|
||||||
|
// ``seen_template_ids`` semantics at :1204-1236) and SKIP the local
|
||||||
|
// re-sort — backend "rank 1" then equals frontend frame_candidates[0]
|
||||||
|
// by construction (Stage 1 root-cause fix).
|
||||||
|
const sortedCandidateEvidence: any[] | null = Array.isArray(
|
||||||
|
unit.sorted_candidate_evidence,
|
||||||
|
)
|
||||||
|
? unit.sorted_candidate_evidence
|
||||||
|
: null;
|
||||||
|
const rankingSortPolicy = unit.ranking_sort_policy ?? null;
|
||||||
|
const backendPolicyPayloadPresent =
|
||||||
|
sortedCandidateEvidence !== null &&
|
||||||
|
sortedCandidateEvidence.length > 0 &&
|
||||||
|
rankingSortPolicy !== null;
|
||||||
|
|
||||||
|
let v4Source: any[];
|
||||||
|
if (backendPolicyPayloadPresent) {
|
||||||
|
sortedCandidateEvidence!.forEach(pushCandidate);
|
||||||
|
v4Source = Array.from(candidateMap.values());
|
||||||
|
} else {
|
||||||
|
// IMP-39 u4 — warn-fallback path. Legacy fixtures predating u3 (or
|
||||||
|
// any code path that strips the payload) lack the backend-sorted
|
||||||
|
// evidence; ordering then derives from local LABEL_PRIORITY mirror.
|
||||||
|
// Warning surfaces drift in dev console without hard-failing the UI
|
||||||
|
// (graceful: production sample audit deck remains renderable).
|
||||||
|
if (typeof console !== "undefined" && typeof console.warn === "function") {
|
||||||
|
console.warn(
|
||||||
|
`[IMP-39 u4] unit ${unit.unit_id ?? "<unknown>"}: backend payload ` +
|
||||||
|
"missing ranking_sort_policy / sorted_candidate_evidence — " +
|
||||||
|
"falling back to local LABEL_PRIORITY (legacy fixture path).",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const candidateEvidence = Array.isArray(unit.candidate_evidence)
|
||||||
|
? unit.candidate_evidence
|
||||||
|
: [];
|
||||||
|
candidateEvidence.forEach(pushCandidate);
|
||||||
|
(unit.v4_all_judgments ?? []).forEach(pushCandidate);
|
||||||
|
(unit.v4_candidates ?? []).forEach(pushCandidate);
|
||||||
|
const rawSource = Array.from(candidateMap.values());
|
||||||
|
v4Source = [...rawSource].sort((a: any, b: any) => {
|
||||||
|
const lp =
|
||||||
|
(LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 99);
|
||||||
|
if (lp !== 0) return lp;
|
||||||
|
return (b.confidence ?? 0) - (a.confidence ?? 0);
|
||||||
|
});
|
||||||
|
}
|
||||||
// ─── IMP-41 u4 — application_candidates enrichment (issue #70) ───────────
|
// ─── IMP-41 u4 — application_candidates enrichment (issue #70) ───────────
|
||||||
// Backend Step 9 emits `unit.application_candidates[]` (src/phase_z2_pipeline.py
|
// Backend Step 9 emits `unit.application_candidates[]` (src/phase_z2_pipeline.py
|
||||||
// _application_candidates_for_unit, :3071-3092) one entry per v4 candidate with
|
// _application_candidates_for_unit, :3071-3092) one entry per v4 candidate with
|
||||||
|
|||||||
@@ -108,6 +108,12 @@ ASSETS_SOURCE_BASE = PROJECT_ROOT / "figma_to_html_agent" / "blocks"
|
|||||||
V4_RESULT_PATH = PROJECT_ROOT / "tests" / "matching" / "v4_full32_result.yaml"
|
V4_RESULT_PATH = PROJECT_ROOT / "tests" / "matching" / "v4_full32_result.yaml"
|
||||||
RUNS_DIR = PROJECT_ROOT / "data" / "runs"
|
RUNS_DIR = PROJECT_ROOT / "data" / "runs"
|
||||||
|
|
||||||
|
# IMP-39 (#68) u1 — single-source ranking sort policy yaml.
|
||||||
|
# Loader + apply_ranking_sort helper below `to_phase_z_status`.
|
||||||
|
RANKING_SORT_POLICY_PATH = (
|
||||||
|
PROJECT_ROOT / "templates" / "phase_z2" / "catalog" / "ranking_sort_policy.yaml"
|
||||||
|
)
|
||||||
|
|
||||||
# V4 label → Phase Z status (§ 7.4 매트릭스)
|
# V4 label → Phase Z status (§ 7.4 매트릭스)
|
||||||
V4_LABEL_TO_PHASE_Z_STATUS = {
|
V4_LABEL_TO_PHASE_Z_STATUS = {
|
||||||
"use_as_is": "matched_zone",
|
"use_as_is": "matched_zone",
|
||||||
@@ -210,6 +216,106 @@ def to_phase_z_status(match: V4Match) -> str:
|
|||||||
return V4_LABEL_TO_PHASE_Z_STATUS.get(match.label, "unknown")
|
return V4_LABEL_TO_PHASE_Z_STATUS.get(match.label, "unknown")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── IMP-39 (#68) u1 — single-source ranking sort policy ──────────
|
||||||
|
#
|
||||||
|
# Single source of (label_priority, tie-break) ordering shared by:
|
||||||
|
# - backend `lookup_v4_match_with_fallback` selector loop (wired in u2)
|
||||||
|
# - Step 9 `_build_application_plan_unit` payload (wired in u3)
|
||||||
|
# - frontend `designAgentApi.ts` candidate builder (wired in u4)
|
||||||
|
#
|
||||||
|
# u1 scope = additive only (yaml + loader + helper). No selector wiring,
|
||||||
|
# no behavior change. Default-fallback matches yaml so missing-file boot
|
||||||
|
# keeps deterministic ordering identical to the file-loaded policy.
|
||||||
|
|
||||||
|
_RANKING_SORT_POLICY_DEFAULT: dict = {
|
||||||
|
"policy_type": "deterministic_label_priority_then_confidence",
|
||||||
|
"label_priority": {
|
||||||
|
"use_as_is": 0,
|
||||||
|
"light_edit": 1,
|
||||||
|
"restructure": 2,
|
||||||
|
"reject": 3,
|
||||||
|
},
|
||||||
|
"unknown_label_priority": 99,
|
||||||
|
"tie_break_axes": ["confidence_desc", "v4_rank_asc"],
|
||||||
|
}
|
||||||
|
|
||||||
|
_RANKING_SORT_POLICY_CACHE: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
|
def load_ranking_sort_policy() -> dict:
    """IMP-39 u1 — load the single-source ranking sort policy (cached).

    On the first call the policy is built by overlaying the top-level keys of
    the yaml file at ``RANKING_SORT_POLICY_PATH`` on a shallow copy of
    ``_RANKING_SORT_POLICY_DEFAULT``. When the file does not exist the
    defaults are used as-is, so a missing file still boots with the default
    ordering.

    A yaml key that is present but empty/null (for example ``label_priority:``
    with all of its children commented out) does NOT overwrite the default for
    that key; it falls through to the default like an absent key would.

    Returns:
        The policy dict. With the defaults it carries ``policy_type``,
        ``label_priority`` (dict), ``unknown_label_priority`` (int) and
        ``tie_break_axes`` (list of str). The returned object is the
        module-level cache itself, so callers should treat it as read-only.

    Raises:
        ValueError: the yaml file exists but its top level is not a mapping.
    """
    global _RANKING_SORT_POLICY_CACHE
    if _RANKING_SORT_POLICY_CACHE is not None:
        return _RANKING_SORT_POLICY_CACHE

    merged = dict(_RANKING_SORT_POLICY_DEFAULT)
    if RANKING_SORT_POLICY_PATH.exists():
        # An empty document parses to None; treat it as "no overrides".
        loaded = (
            yaml.safe_load(RANKING_SORT_POLICY_PATH.read_text(encoding="utf-8"))
            or {}
        )
        if not isinstance(loaded, dict):
            raise ValueError(
                f"ranking sort policy yaml must be a mapping at top level, "
                f"got {type(loaded).__name__}: {RANKING_SORT_POLICY_PATH}"
            )
        for key, value in loaded.items():
            # A null value for a known key would erase the default (e.g. turn
            # label_priority into None); keep the default instead.
            if value is None and key in merged:
                continue
            merged[key] = value

    _RANKING_SORT_POLICY_CACHE = merged
    return _RANKING_SORT_POLICY_CACHE
|
||||||
|
|
||||||
|
|
||||||
|
def apply_ranking_sort(
    records: list,
    *,
    policy: Optional[dict] = None,
    label_key: str = "label",
    confidence_key: str = "confidence",
    v4_rank_key: str = "v4_rank",
) -> list:
    """IMP-39 u1 — sort by (label_priority asc, confidence desc, v4_rank asc).

    Args:
        records: list of dicts or of objects. Dict records are read with
            ``.get(key)``; any other record is read with ``getattr(rec, key,
            None)``.
        policy: optional explicit policy dict. When ``None`` the policy comes
            from ``load_ranking_sort_policy()``. Only ``label_priority`` and
            ``unknown_label_priority`` are read here.
        label_key / confidence_key / v4_rank_key: per-record field names.

    Returns:
        A NEW list; the input list is not mutated. Missing or ``None`` fields
        sort as follows so incomplete records sink deterministically:

        - label not in ``label_priority`` -> ``unknown_label_priority``
          (99 when the policy omits it or sets it to null)
        - confidence missing or NaN -> 0.0
        - rank missing -> 10**9
    """
    pol = policy if policy is not None else load_ranking_sort_policy()
    priority_map: dict = pol.get("label_priority", {}) or {}
    raw_unknown = pol.get("unknown_label_priority", 99)
    # An explicit null in the policy would crash int(); fall back to 99.
    unknown_priority: int = 99 if raw_unknown is None else int(raw_unknown)
    missing_rank = 10**9

    def _get(rec, key):
        if isinstance(rec, dict):
            return rec.get(key)
        return getattr(rec, key, None)

    def _key(rec):
        label_pri = priority_map.get(_get(rec, label_key), unknown_priority)

        conf = _get(rec, confidence_key)
        conf_val = float(conf) if conf is not None else 0.0
        # NaN compares False against everything, which would make the result
        # depend on input order; treat it like a missing confidence.
        if conf_val != conf_val:
            conf_val = 0.0

        v4_rank = _get(rec, v4_rank_key)
        rank_val = int(v4_rank) if v4_rank is not None else missing_rank

        # confidence desc -> negate so the ascending sort puts larger first
        return (label_pri, -conf_val, rank_val)

    return sorted(records, key=_key)
|
||||||
|
|
||||||
|
|
||||||
def _b4_mapper_source_enabled() -> bool:
|
def _b4_mapper_source_enabled() -> bool:
|
||||||
"""IMP-89 89-a u1 — PHASE_Z_B4_MAPPER_SOURCE env flag reader (default OFF).
|
"""IMP-89 89-a u1 — PHASE_Z_B4_MAPPER_SOURCE env flag reader (default OFF).
|
||||||
|
|
||||||
@@ -1065,6 +1171,30 @@ def lookup_v4_match_with_fallback(
|
|||||||
trace["fallback_reason"] = "empty_v4_judgments"
|
trace["fallback_reason"] = "empty_v4_judgments"
|
||||||
return None, trace
|
return None, trace
|
||||||
|
|
||||||
|
# IMP-39 (#68) u2 — apply single-source ranking sort policy to the selected
|
||||||
|
# window AFTER IMP-38 raw-window calc (default_window / usable_count above
|
||||||
|
# remain RAW all_judgments-based — no silent interaction with fallback
|
||||||
|
# expansion). Selection order now follows
|
||||||
|
# (label_priority asc, confidence desc, v4_rank asc)
|
||||||
|
# so backend selected rank-1 matches frontend frame_candidates[0]
|
||||||
|
# (designAgentApi.ts:578-597 LABEL_PRIORITY + confidence-desc mirror).
|
||||||
|
# `v4_rank_key="v4_full_rank"` reads the RAW V4 confidence-rank from each
|
||||||
|
# judgment dict for tie-break (yaml: tie_break_axes=[confidence_desc,
|
||||||
|
# v4_rank_asc]). Input list is NOT mutated (apply_ranking_sort returns a
|
||||||
|
# new list). Trace fields (sorted_candidate_evidence / ranking_sort_policy)
|
||||||
|
# are forwarded through Step 9 payload in u3.
|
||||||
|
ranking_sort_policy = load_ranking_sort_policy()
|
||||||
|
judgments = apply_ranking_sort(
|
||||||
|
judgments,
|
||||||
|
policy=ranking_sort_policy,
|
||||||
|
label_key="label",
|
||||||
|
confidence_key="confidence",
|
||||||
|
v4_rank_key="v4_full_rank",
|
||||||
|
)
|
||||||
|
trace["ranking_sort_policy_applied"] = ranking_sort_policy.get(
|
||||||
|
"policy_type", "deterministic_label_priority_then_confidence"
|
||||||
|
)
|
||||||
|
|
||||||
first_skip_reason: Optional[str] = None
|
first_skip_reason: Optional[str] = None
|
||||||
# IMP-05 L4 dedup (Codex #14 ordering — Claude #16 placement precision) :
|
# IMP-05 L4 dedup (Codex #14 ordering — Claude #16 placement precision) :
|
||||||
# first occurrence claims template_id for the chain regardless of decision
|
# first occurrence claims template_id for the chain regardless of decision
|
||||||
@@ -3937,6 +4067,18 @@ def _build_application_plan_unit(
|
|||||||
- IMP-06 additive plan fields (position / assignment_source / section_
|
- IMP-06 additive plan fields (position / assignment_source / section_
|
||||||
assignment_override / replaced_auto_unit / skipped_collided_auto_units /
|
assignment_override / replaced_auto_unit / skipped_collided_auto_units /
|
||||||
skipped_reason) — None / False / [] when no override CLI used.
|
skipped_reason) — None / False / [] when no override CLI used.
|
||||||
|
|
||||||
|
IMP-39 u3 (issue #68) additive fields :
|
||||||
|
- ``ranking_sort_policy`` : full policy dict from
|
||||||
|
``load_ranking_sort_policy()`` (cached). Forwards the single-source
|
||||||
|
ordering contract (label_priority map + tie_break_axes) to the Step 9
|
||||||
|
payload so the frontend (``designAgentApi.ts``) can mirror the backend
|
||||||
|
sort without re-implementing the policy locally. u4 wires consumption.
|
||||||
|
- ``sorted_candidate_evidence`` : explicit alias of the policy-sorted
|
||||||
|
``selection_trace["candidates"]`` list. Identical contents to
|
||||||
|
``candidate_evidence`` (u2 sorted the underlying ``judgments`` window
|
||||||
|
before the selector loop appended ``trace["candidates"]``), but the
|
||||||
|
explicit name documents the post-u2 contract for the frontend.
|
||||||
"""
|
"""
|
||||||
unit_id = "+".join(unit.source_section_ids)
|
unit_id = "+".join(unit.source_section_ids)
|
||||||
|
|
||||||
@@ -3945,6 +4087,14 @@ def _build_application_plan_unit(
|
|||||||
application_status = "ok" if has_v4 else "no_v4_candidate"
|
application_status = "ok" if has_v4 else "no_v4_candidate"
|
||||||
current_default = unit.frame_template_id if has_v4 else None
|
current_default = unit.frame_template_id if has_v4 else None
|
||||||
|
|
||||||
|
# IMP-39 u3 (issue #68) — forward the single-source ranking policy to the
|
||||||
|
# Step 9 per-unit payload. ``load_ranking_sort_policy()`` is module-cached
|
||||||
|
# (``_RANKING_SORT_POLICY_CACHE``), so the per-unit call is O(1) after
|
||||||
|
# first invocation. The full policy dict (not just ``policy_type``) is
|
||||||
|
# forwarded so the frontend can mirror label_priority + tie_break_axes
|
||||||
|
# without re-declaring the contract locally.
|
||||||
|
ranking_sort_policy = load_ranking_sort_policy()
|
||||||
|
|
||||||
# IMP-06 blocker-fix (Codex #13 Blocker 3 / #16) — plan-aware additive
|
# IMP-06 blocker-fix (Codex #13 Blocker 3 / #16) — plan-aware additive
|
||||||
# fields. additive = pre-IMP-06 readers (no override CLI used) see
|
# fields. additive = pre-IMP-06 readers (no override CLI used) see
|
||||||
# position=None / assignment_source=None / section_assignment_override
|
# position=None / assignment_source=None / section_assignment_override
|
||||||
@@ -4006,6 +4156,12 @@ def _build_application_plan_unit(
|
|||||||
"replaced_auto_unit": plan_replaced_auto,
|
"replaced_auto_unit": plan_replaced_auto,
|
||||||
"skipped_collided_auto_units": plan_skipped_collided,
|
"skipped_collided_auto_units": plan_skipped_collided,
|
||||||
"skipped_reason": plan_skipped_reason,
|
"skipped_reason": plan_skipped_reason,
|
||||||
|
# IMP-39 u3 (issue #68) — single-source ranking policy forwarded to
|
||||||
|
# frontend so backend selector "rank 1" and frontend
|
||||||
|
# ``frame_candidates[0]`` share one ordering contract. Additive only;
|
||||||
|
# pre-u3 readers ignore both keys.
|
||||||
|
"ranking_sort_policy": ranking_sort_policy,
|
||||||
|
"sorted_candidate_evidence": selection_trace.get("candidates", []),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
50
templates/phase_z2/catalog/ranking_sort_policy.yaml
Normal file
50
templates/phase_z2/catalog/ranking_sort_policy.yaml
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
# IMP-39 single-source ranking sort policy — backend ↔ frontend mirror.
|
||||||
|
#
|
||||||
|
# 도입 배경 (issue #68):
|
||||||
|
# Backend `lookup_v4_match_with_fallback` 는 V4 raw confidence-desc 순서로
|
||||||
|
# first-eligible 선택 (label_priority 무시). Frontend `designAgentApi.ts` 는
|
||||||
|
# 동일 source 를 (label_priority asc, confidence desc) 로 재정렬 후 slice.
|
||||||
|
# 결과: 낮은-confidence 높은-priority label 이 raw 상 뒤에 있을 때
|
||||||
|
# backend "rank 1 selected" ≠ frontend `frame_candidates[0]` divergence.
|
||||||
|
#
|
||||||
|
# 정책 결정 (Stage 1~2 LOCK, 4 round 합의):
|
||||||
|
# - 단일 source 위치 = 본 yaml (catalog hot-reload + frontend mirror 가능)
|
||||||
|
# - frame_contracts.yaml / v4_fallback_policy.yaml 오염 회피 (분리 파일)
|
||||||
|
# - 정렬 axes = (label_priority asc, confidence desc, v4_rank asc)
|
||||||
|
# - tie-break = 원본 v4_rank 보존 (frontend LABEL_PRIORITY 와 1:1)
|
||||||
|
#
|
||||||
|
# 적용 path:
|
||||||
|
# - backend: src/phase_z2_pipeline.py `apply_ranking_sort` (helper, u1)
|
||||||
|
# + `lookup_v4_match_with_fallback` selector loop (u2)
|
||||||
|
# + `_build_application_plan_unit` Step 9 payload (u3)
|
||||||
|
# - frontend: Front/client/src/services/designAgentApi.ts (u4)
|
||||||
|
# → unit.ranking_sort_policy + unit.sorted_candidate_evidence 우선 read
|
||||||
|
# → local LABEL_PRIORITY 는 warn-fallback only
|
||||||
|
|
||||||
|
policy_type: deterministic_label_priority_then_confidence
|
||||||
|
|
||||||
|
# label_priority:
|
||||||
|
# lower value = higher priority (use_as_is 가 첫 후보)
|
||||||
|
# sort key = (label_priority asc, confidence desc, v4_rank asc)
|
||||||
|
label_priority:
|
||||||
|
use_as_is: 0
|
||||||
|
light_edit: 1
|
||||||
|
restructure: 2
|
||||||
|
reject: 3
|
||||||
|
|
||||||
|
# unknown_label_priority:
|
||||||
|
# label 이 위 매트릭스에 없을 시 부여되는 우선순위 (최하위 push).
|
||||||
|
# frontend `LABEL_PRIORITY[label] ?? 99` 와 1:1.
|
||||||
|
unknown_label_priority: 99
|
||||||
|
|
||||||
|
# tie_break_axes:
|
||||||
|
# 동일 label_priority 시 적용 순서 — frontend mirror 와 1:1.
|
||||||
|
# confidence_desc: 큰 confidence 가 앞
|
||||||
|
# v4_rank_asc: 동일 confidence 시 raw v4 rank (1, 2, 3 ...) 작은 게 앞
|
||||||
|
tie_break_axes:
|
||||||
|
- confidence_desc
|
||||||
|
- v4_rank_asc
|
||||||
|
|
||||||
|
# graceful fallback (yaml 없을 시):
|
||||||
|
# loader 가 default policy_type=deterministic_label_priority_then_confidence
|
||||||
|
# + 위 label_priority 매트릭스 로 fall through (backward compat / boot-safe).
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
fixture_id: synthetic_divergence
|
||||||
|
purpose: |
|
||||||
|
Backend - frontend "rank 1" divergence regression - IMP-39 (#68).
|
||||||
|
Captures the Stage 1 root-cause scenario where the legacy backend
|
||||||
|
(raw V4 confidence-desc order) selects a high-confidence
|
||||||
|
lower-priority label, while the frontend (LABEL_PRIORITY asc +
|
||||||
|
confidence desc) selects the lower-confidence higher-priority
|
||||||
|
label. The single-source ranking policy
|
||||||
|
(templates/phase_z2/catalog/ranking_sort_policy.yaml, u1) resolves
|
||||||
|
the divergence so that both sides agree on "rank 1".
|
||||||
|
|
||||||
|
source: synthetic
|
||||||
|
sample_agnostic: true
|
||||||
|
notes:
|
||||||
|
- No real frame_id / template_id / MDX section is referenced.
|
||||||
|
- Only the three sort keys matter: label, confidence, v4_full_rank.
|
||||||
|
- The `tag` field is a fixture-local identifier for assertions.
|
||||||
|
- Field name `v4_full_rank` mirrors v4_full32_result.yaml shape so
|
||||||
|
fixture and corpus audit (u8) share the same key contract.
|
||||||
|
|
||||||
|
raw_judgments:
|
||||||
|
# confidence is strictly descending so v4_full_rank == raw V4
|
||||||
|
# confidence-desc rank (same axis as v4_full32_result.yaml).
|
||||||
|
- tag: synth_restructure_high
|
||||||
|
label: restructure
|
||||||
|
confidence: 0.92
|
||||||
|
v4_full_rank: 1
|
||||||
|
- tag: synth_light_edit_mid
|
||||||
|
label: light_edit
|
||||||
|
confidence: 0.70
|
||||||
|
v4_full_rank: 2
|
||||||
|
- tag: synth_use_as_is_low
|
||||||
|
label: use_as_is
|
||||||
|
confidence: 0.41
|
||||||
|
v4_full_rank: 3
|
||||||
|
- tag: synth_reject_low
|
||||||
|
label: reject
|
||||||
|
confidence: 0.30
|
||||||
|
v4_full_rank: 4
|
||||||
|
|
||||||
|
expected_legacy_raw_order:
|
||||||
|
- synth_restructure_high
|
||||||
|
- synth_light_edit_mid
|
||||||
|
- synth_use_as_is_low
|
||||||
|
- synth_reject_low
|
||||||
|
|
||||||
|
expected_policy_sorted_order:
|
||||||
|
- synth_use_as_is_low
|
||||||
|
- synth_light_edit_mid
|
||||||
|
- synth_restructure_high
|
||||||
|
- synth_reject_low
|
||||||
|
|
||||||
|
divergence_axis:
|
||||||
|
pre_policy_rank_1_tag: synth_restructure_high
|
||||||
|
post_policy_rank_1_tag: synth_use_as_is_low
|
||||||
|
frontend_candidate_0_tag: synth_use_as_is_low
|
||||||
437
tests/phase_z2/test_imp39_corpus_audit.py
Normal file
437
tests/phase_z2/test_imp39_corpus_audit.py
Normal file
@@ -0,0 +1,437 @@
|
|||||||
|
"""IMP-39 u8 (issue #68) - corpus audit over tests/matching/v4_full32_result.yaml.
|
||||||
|
|
||||||
|
Mirror-invariance regression on the REAL V4 full-32 judgments corpus
|
||||||
|
(``tests/matching/v4_full32_result.yaml``). For every MDX section in the
|
||||||
|
corpus, asserts that:
|
||||||
|
|
||||||
|
1. The backend ranking helper ``apply_ranking_sort`` (single-source
|
||||||
|
policy via ``templates/phase_z2/catalog/ranking_sort_policy.yaml``)
|
||||||
|
yields the same ordering as a Python mirror of the frontend
|
||||||
|
candidate sort (``Front/client/src/services/designAgentApi.ts``
|
||||||
|
warn-fallback path, lines 644-649). i.e. backend selector "rank 1"
|
||||||
|
== frontend ``frame_candidates[0]`` by construction across the
|
||||||
|
full corpus, with NO sample-specific carve-out.
|
||||||
|
2. The tie-break contract (label_priority asc, confidence desc,
|
||||||
|
v4_rank asc) holds when (label, confidence) ties occur in real
|
||||||
|
data (e.g. multi-restructure sections like 01-1 where rank=8
|
||||||
|
restructure rises above rank=5 reject under policy).
|
||||||
|
3. Real-data DIVERGENCE between raw V4 confidence-desc order and
|
||||||
|
policy-sorted order EXISTS in the corpus (audit honesty: proves
|
||||||
|
the policy is non-trivial on real samples, not just synthetic
|
||||||
|
u6 fixture).
|
||||||
|
|
||||||
|
Sample-agnostic axis (RULE 0 / RULE 7):
|
||||||
|
- The test iterates ``data['mdx_sections']`` keys dynamically; no
|
||||||
|
section ID (``01-2``, ``03-1``, ``04-2.1``, ...) is hardcoded as
|
||||||
|
an assertion target. The corpus inventory is treated as a
|
||||||
|
parametrize source, not a contract.
|
||||||
|
- The test does NOT assert any specific ``frame_id`` /
|
||||||
|
``template_id`` / ``frame_number``. Only the ordering contract
|
||||||
|
is asserted.
|
||||||
|
- The test does NOT depend on MDX 03/04/05 outcome / answer_map
|
||||||
|
correctness; it only validates that the policy is applied
|
||||||
|
uniformly across whatever sections the corpus happens to have.
|
||||||
|
|
||||||
|
Scope (u8, Stage 2 plan):
|
||||||
|
- Real-data sweep of ``tests/matching/v4_full32_result.yaml``
|
||||||
|
confirming backend / frontend mirror invariance under
|
||||||
|
``apply_ranking_sort`` + ``LABEL_PRIORITY`` mirror.
|
||||||
|
- Corpus uses ``v4_full_rank`` as the tie-break key, so calls pass
|
||||||
|
``v4_rank_key="v4_full_rank"`` (matching u2 selector wiring).
|
||||||
|
|
||||||
|
Out of scope (other units):
|
||||||
|
- u1 policy yaml shape: covered by ``test_ranking_sort_policy.py``.
|
||||||
|
- u2 selector wiring: integration covered indirectly via u7.
|
||||||
|
- u3 Step 9 payload forwarding: covered by u7.
|
||||||
|
- u4 frontend mirror: covered by u7.
|
||||||
|
- u5 pure permutation tests.
|
||||||
|
- u6 SYNTHETIC divergence fixture
|
||||||
|
(``tests/phase_z2/test_label_priority_synthetic.py``).
|
||||||
|
- u7 mdx04 env-toggle e2e
|
||||||
|
(``tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py``).
|
||||||
|
- V4 matching algorithm correctness (out of #68 scope, owner #5).
|
||||||
|
- ``MVP1_ALLOWED_STATUSES`` gate semantics (IMP-47B locked area).
|
||||||
|
- capacity-fit / catalog contract validation (orthogonal to policy).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_CORPUS_PATH = _REPO_ROOT / "tests" / "matching" / "v4_full32_result.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
# Frontend LABEL_PRIORITY verbatim mirror — Front/client/src/services/
|
||||||
|
# designAgentApi.ts:575-580 + warn-fallback sort :644-649. Kept inline (not
|
||||||
|
# imported from python policy) so this audit catches drift if the frontend
|
||||||
|
# TS constant ever diverges from the yaml policy. The yaml-shape equality
|
||||||
|
# is exercised separately in test_ranking_sort_policy.py (u5).
|
||||||
|
_FRONTEND_LABEL_PRIORITY: Dict[str, int] = {
|
||||||
|
"use_as_is": 0,
|
||||||
|
"light_edit": 1,
|
||||||
|
"restructure": 2,
|
||||||
|
"reject": 3,
|
||||||
|
}
|
||||||
|
_FRONTEND_UNKNOWN_PRIORITY = 99
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _reset_policy_cache():
    """Drop the pipeline's cached ranking policy before and after each test.

    Autouse, so every test in this module starts with
    ``_RANKING_SORT_POLICY_CACHE`` set to ``None`` and leaves it that way.
    """
    import src.phase_z2_pipeline as pipeline_module

    cache_attr = "_RANKING_SORT_POLICY_CACHE"
    setattr(pipeline_module, cache_attr, None)
    yield
    setattr(pipeline_module, cache_attr, None)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def corpus() -> Dict[str, Any]:
    """Parse the corpus yaml at ``_CORPUS_PATH`` (module-scoped fixture).

    Fails with an assertion naming the path when the file is absent.
    """
    assert _CORPUS_PATH.exists(), (
        f"Corpus audit source missing: {_CORPUS_PATH}. u8 requires "
        f"tests/matching/v4_full32_result.yaml present in repo."
    )
    corpus_text = _CORPUS_PATH.read_text(encoding="utf-8")
    return yaml.safe_load(corpus_text)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def section_ids(corpus) -> List[str]:
    """Section inventory read from the corpus, never hardcoded.

    Returns the keys of ``corpus['mdx_sections']`` as a list. The IDs are
    iteration values for the tests, not assertion targets; this fixture
    performs no validation of its own.
    """
    sections = corpus["mdx_sections"]
    return [*sections.keys()]
|
||||||
|
|
||||||
|
|
||||||
|
def _frontend_mirror_sort(
    judgments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """Pure-Python mirror of the frontend warn-fallback comparator.

    Mirrors this TypeScript sort from
    ``Front/client/src/services/designAgentApi.ts``:

        v4Source.sort((a, b) => {
          const lp = (LABEL_PRIORITY[a.label] ?? 99) - (LABEL_PRIORITY[b.label] ?? 99);
          if (lp !== 0) return lp;
          return (b.confidence ?? 0) - (a.confidence ?? 0);
        });

    Ordering is (label priority asc, confidence desc). Labels missing from
    ``_FRONTEND_LABEL_PRIORITY`` get ``_FRONTEND_UNKNOWN_PRIORITY``. A
    confidence that is absent OR ``None`` counts as 0, matching ``?? 0``
    (which covers both ``undefined`` and ``null``).

    Tie-break: this mirror has no v4_rank axis. ``sorted`` is stable, so rows
    tied on (label, confidence) keep their input order.

    Returns:
        A new sorted list; ``judgments`` is not mutated.
    """

    def _sort_key(judgment: Dict[str, Any]) -> tuple:
        priority = _FRONTEND_LABEL_PRIORITY.get(
            judgment.get("label"), _FRONTEND_UNKNOWN_PRIORITY
        )
        confidence = judgment.get("confidence")
        # `.get("confidence", 0.0)` alone would pass an explicit None through
        # to float() and raise; the TS `?? 0` maps null to 0.
        confidence_val = 0.0 if confidence is None else float(confidence)
        return (priority, -confidence_val)

    return sorted(judgments, key=_sort_key)
|
||||||
|
|
||||||
|
|
||||||
|
def _identity_key(judgment: Dict[str, Any]) -> tuple:
|
||||||
|
"""Stable identity for a corpus judgment row.
|
||||||
|
|
||||||
|
``v4_full_rank`` is unique per section (1..32), so it serves as the
|
||||||
|
section-local identity. Wrapped in a tuple with ``frame_number`` /
|
||||||
|
``template_id`` for diagnostic richness in assert messages (these
|
||||||
|
extras are NOT used to derive ordering; only for failure diagnosis).
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
judgment.get("v4_full_rank"),
|
||||||
|
judgment.get("frame_number"),
|
||||||
|
judgment.get("template_id"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── corpus shape sanity ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_file_is_present_and_non_empty(corpus, section_ids):
    """RULE 5 factual: the corpus loads and its section inventory is usable.

    Checks that the corpus is a dict with ``mdx_sections``, that at least
    one section ID was discovered, and that every section has a non-empty
    ``judgments_full32`` list whose rows all carry the sort-relevant keys.
    """
    assert isinstance(corpus, dict)
    assert "mdx_sections" in corpus
    assert len(section_ids) > 0, (
        "v4_full32_result.yaml has zero mdx_sections — corpus audit "
        f"cannot run. Path: {_CORPUS_PATH}"
    )

    # The three sort-relevant keys, paired with the tail of the failure
    # message, checked in this order for every judgment row.
    required_fields = (
        ("label", "."),
        ("confidence", "."),
        ("v4_full_rank", " (tie-break key)."),
    )
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id].get("judgments_full32")
        assert isinstance(rows, list) and len(rows) > 0, (
            f"Section {sec_id}: judgments_full32 missing or empty."
        )
        for row in rows:
            for field, suffix in required_fields:
                assert field in row, f"{sec_id}: judgment missing '{field}'{suffix}"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── backend ↔ frontend mirror invariance ───────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_backend_policy_sort_matches_frontend_mirror_per_section(
    corpus, section_ids,
):
    """Per section, backend ``apply_ranking_sort`` order == frontend mirror order.

    All diverging sections are collected before asserting, so a single
    failure reports every one of them. Each report includes both list
    heads and the first differing index ('tail' when the zipped prefix is
    identical).
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    divergences: List[str] = []
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]

        backend_sorted = apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
        frontend_sorted = _frontend_mirror_sort(rows)

        backend_keys = list(map(_identity_key, backend_sorted))
        frontend_keys = list(map(_identity_key, frontend_sorted))
        if backend_keys == frontend_keys:
            continue

        # Find the first position where the two orderings disagree.
        first_divergence = 'tail'
        for idx, (b_key, f_key) in enumerate(zip(backend_keys, frontend_keys)):
            if b_key != f_key:
                first_divergence = idx
                break

        divergences.append(
            f"section={sec_id} backend_head={backend_keys[0]} "
            f"frontend_head={frontend_keys[0]} "
            f"first_divergence_index="
            f"{first_divergence}"
        )

    assert not divergences, (
        "backend ↔ frontend mirror divergence on real corpus:\n "
        + "\n ".join(divergences)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_backend_rank_1_equals_frontend_candidate_0_per_section(
    corpus, section_ids,
):
    """Stage 1 root-cause head-of-list invariant on every corpus section.

    Only the first element of each ordering is compared: the backend
    policy-sorted head must be the same row as the frontend mirror's head.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    head_mismatches: List[str] = []
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]

        backend_head = apply_ranking_sort(rows, v4_rank_key="v4_full_rank")[0]
        frontend_head = _frontend_mirror_sort(rows)[0]

        backend_id = _identity_key(backend_head)
        frontend_id = _identity_key(frontend_head)
        if backend_id != frontend_id:
            head_mismatches.append(
                f"section={sec_id} "
                f"backend_rank_1={backend_id} "
                f"frontend_candidate_0={frontend_id}"
            )

    assert not head_mismatches, (
        "backend selector 'rank 1' diverges from frontend frame_candidates[0]:\n "
        + "\n ".join(head_mismatches)
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ─── tie-break + label-priority contract on real data ──────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_policy_ordering_respects_label_priority_per_section(
    corpus, section_ids,
):
    """``label_priority`` is weakly monotone across the policy-sorted list.

    Priorities are looked up in the frontend table, so the check also
    confirms the backend policy orders labels the way the frontend ranks
    them. Only the first violation per section is recorded.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    violations: List[str] = []
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]
        ordered = apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
        priorities = [
            _FRONTEND_LABEL_PRIORITY.get(row["label"], _FRONTEND_UNKNOWN_PRIORITY)
            for row in ordered
        ]
        # Walk adjacent pairs; a drop in priority value breaks monotonicity.
        for idx, (current, following) in enumerate(zip(priorities, priorities[1:])):
            if current > following:
                violations.append(
                    f"section={sec_id} idx={idx} prio={current} > "
                    f"idx={idx + 1} prio={following}"
                )
                break

    assert not violations, (
        "label_priority must be weakly monotone post-sort:\n "
        + "\n ".join(violations)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_policy_confidence_desc_within_label_group_per_section(
    corpus, section_ids,
):
    """Within a run of equal labels, confidence must be weakly descending.

    Adjacent rows with different labels are ignored. Only the first
    violation per section is recorded.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    violations: List[str] = []
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]
        ordered = apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
        for idx, (a, b) in enumerate(zip(ordered, ordered[1:])):
            same_label = a["label"] == b["label"]
            if same_label and float(a["confidence"]) < float(b["confidence"]):
                violations.append(
                    f"section={sec_id} idx={idx} label={a['label']} "
                    f"conf={a['confidence']} < idx={idx + 1} conf={b['confidence']}"
                )
                break

    assert not violations, (
        "confidence must be weakly desc within same-label runs:\n "
        + "\n ".join(violations)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_policy_v4_full_rank_asc_within_label_confidence_ties(
    corpus, section_ids,
):
    """When (label, confidence) tie, the smaller ``v4_full_rank`` comes first.

    This is a real-data tie-break check. If no section contains a
    (label, confidence) tie, the test passes vacuously, which is the
    intended contract: the tie-break is asserted only where the corpus
    lets it be observed. Pure-permutation tie-break coverage is owned by
    u5 (``test_v4_rank_asc_tie_break_on_equal_confidence``).

    Unlike the sibling checks, every violation in a section is recorded,
    not just the first.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    tie_break_violations: List[str] = []
    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]
        ordered = apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
        for idx, (a, b) in enumerate(zip(ordered, ordered[1:])):
            is_tie = (
                a["label"] == b["label"]
                and float(a["confidence"]) == float(b["confidence"])
            )
            if is_tie and int(a["v4_full_rank"]) > int(b["v4_full_rank"]):
                tie_break_violations.append(
                    f"section={sec_id} idx={idx} v4_full_rank={a['v4_full_rank']} "
                    f"> idx={idx + 1} v4_full_rank={b['v4_full_rank']} "
                    f"(label={a['label']} conf={a['confidence']})"
                )

    assert not tie_break_violations, (
        "v4_full_rank must be weakly asc within (label, conf) ties:\n "
        + "\n ".join(tie_break_violations)
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ─── audit honesty: real divergence exists ─────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_exhibits_real_policy_divergence(corpus, section_ids):
    """At least one section MUST show raw-V4 order != policy order.

    Honesty check (RULE 5): the audit only means something if the policy
    reorders at least one real section. If every section sorts the same
    under raw V4 order and under the policy, the policy is a no-op on this
    corpus. In that case either the corpus needs richer samples or the
    divergence axis has shifted, and we want to find out.

    The original note (dated 2026-05-24) records the observed divergence
    as mid-list. It occurs in sections with several labels, where a
    lower-confidence, higher-priority candidate sits behind a
    higher-confidence, lower-priority one. For example, in section 01-1
    the rank=8 restructure rises above the rank=5/6/7 rejects.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    def _policy_reorders(sec_id) -> bool:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]
        # Stored row order is taken as the raw V4 order
        # (v4_full_rank asc, which the corpus treats as confidence desc).
        raw_keys = [_identity_key(row) for row in rows]
        policy_keys = [
            _identity_key(row)
            for row in apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
        ]
        return raw_keys != policy_keys

    # any() stops at the first diverging section, like an early break.
    any_divergence = any(_policy_reorders(sec_id) for sec_id in section_ids)

    assert any_divergence, (
        "No corpus section shows raw-V4 vs policy ordering divergence. "
        "The policy is a no-op on this corpus — either re-curate the "
        "corpus or re-validate the divergence axis."
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ─── determinism + non-mutation on real corpus ─────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_policy_sort_is_deterministic_across_calls_per_section(
    corpus, section_ids,
):
    """Two back-to-back sorts of the same section produce the same order."""
    from src.phase_z2_pipeline import apply_ranking_sort

    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]

        # Sort the same input twice and keep each run's identity sequence.
        orderings = []
        for _ in range(2):
            orderings.append(
                [
                    _identity_key(row)
                    for row in apply_ranking_sort(rows, v4_rank_key="v4_full_rank")
                ]
            )
        first, second = orderings

        assert first == second, (
            f"section={sec_id}: apply_ranking_sort is non-deterministic "
            f"across calls."
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_input_lists_are_not_mutated(corpus, section_ids):
    """``apply_ranking_sort`` must leave the source list's order untouched.

    The check compares the identity-key sequence of the input list before
    and after the call. It therefore detects in-place reordering, or
    changes to the identity fields, but not edits to other row fields.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    for sec_id in section_ids:
        rows = corpus["mdx_sections"][sec_id]["judgments_full32"]
        before = list(map(_identity_key, rows))

        apply_ranking_sort(rows, v4_rank_key="v4_full_rank")

        after = list(map(_identity_key, rows))
        assert before == after, (
            f"section={sec_id}: apply_ranking_sort mutated source list "
            f"in place (forbidden — see u5 non-mutation contract)."
        )
|
||||||
332
tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py
Normal file
332
tests/phase_z2/test_imp39_mdx04_env_toggle_e2e.py
Normal file
@@ -0,0 +1,332 @@
|
|||||||
|
"""IMP-39 u7 (issue #68) — mdx04 env-toggle e2e (AI_FALLBACK_ENABLED=off).
|
||||||
|
|
||||||
|
Stage 2 u7 axis G:
|
||||||
|
Run ``python -m src.phase_z2_pipeline samples/mdx_batch/04.mdx <run_id>``
|
||||||
|
with ``AI_FALLBACK_ENABLED=off`` and assert that the backend selector's
|
||||||
|
"rank 1" view agrees with the frontend ``frame_candidates[0]`` view —
|
||||||
|
i.e., the Stage 1 root-cause divergence (Backend src/phase_z2_pipeline.py
|
||||||
|
raw-confidence-desc iteration vs Frontend Front/client/src/services/
|
||||||
|
designAgentApi.ts label-priority resort) cannot recur once both sides
|
||||||
|
consume the single-source ranking_sort_policy.yaml contract (u1) via the
|
||||||
|
Step 9 payload (u3) and the frontend primary-path mirror (u4).
|
||||||
|
|
||||||
|
Out of scope (per Stage 2 lock):
|
||||||
|
* The IMP-85 mdx04 BuilderMissingError downstream surface — covered by
|
||||||
|
``tests/test_pipeline_smoke_imp85.py``. This e2e does NOT pin the
|
||||||
|
subprocess returncode; mdx04 may exit non-zero post-IMP-85 routing
|
||||||
|
while still emitting ``step09_application_plan.json`` whose unit
|
||||||
|
payload is what u3/u4 contract on.
|
||||||
|
* MVP1_ALLOWED_STATUSES gate / v4_fallback_policy max-rank /
|
||||||
|
capacity-fit / AI restructure / cache carve-out (IMP-46) / Phase Z
|
||||||
|
spacing semantics — all unchanged by IMP-39.
|
||||||
|
* Pure-permutation helper coverage (tests/test_ranking_sort_policy.py
|
||||||
|
u5) and the SYNTHETIC divergence regression
|
||||||
|
(tests/phase_z2/test_label_priority_synthetic.py u6).
|
||||||
|
* Corpus audit over v4_full32_result.yaml — u8.
|
||||||
|
|
||||||
|
Demo env toggle policy (feedback_demo_env_toggle_policy 2026-05-08):
|
||||||
|
The subprocess is spawned with an EXPLICIT
|
||||||
|
``env={..., "AI_FALLBACK_ENABLED": "false"}`` override even though
|
||||||
|
tests/conftest.py already sets the parent-process default to false.
|
||||||
|
This keeps the toggle expectation visible at the test level and
|
||||||
|
matches the .env-only activation policy (the .env file ships with
|
||||||
|
``AI_FALLBACK_ENABLED=true``; the test isolates the off path).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from src.phase_z2_pipeline import apply_ranking_sort, load_ranking_sort_policy
|
||||||
|
|
||||||
|
_REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
_SAMPLE_MDX = _REPO_ROOT / "samples" / "mdx_batch" / "04.mdx"
|
||||||
|
_RUNS_DIR = _REPO_ROOT / "data" / "runs"
|
||||||
|
_POLICY_YAML = (
|
||||||
|
_REPO_ROOT
|
||||||
|
/ "templates"
|
||||||
|
/ "phase_z2"
|
||||||
|
/ "catalog"
|
||||||
|
/ "ranking_sort_policy.yaml"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mirrors Front/client/src/services/designAgentApi.ts :567 — frontend slices
|
||||||
|
# the dedup'd v4Source to this many candidates. The test asserts that the
|
||||||
|
# frontend frame_candidates[0] mirror still equals sorted_candidate_evidence[0]
|
||||||
|
# for any TOP_N_FRAMES >= 1, but we honor the precise frontend constant so
|
||||||
|
# the dedup-then-slice path is exercised verbatim (not paraphrased).
|
||||||
|
_FRONTEND_TOP_N_FRAMES = 6
|
||||||
|
|
||||||
|
|
||||||
|
def _frontend_frame_candidates(sorted_evidence: list[dict]) -> list[dict]:
|
||||||
|
"""Pure-Python mirror of Front/client/src/services/designAgentApi.ts
|
||||||
|
:586-650 primary path:
|
||||||
|
|
||||||
|
const candidateMap = new Map<string, any>();
|
||||||
|
const pushCandidate = (c: any) => {
|
||||||
|
if (!c) return;
|
||||||
|
const key = c.template_id ?? c.id ?? c.frame_id;
|
||||||
|
if (!key) return;
|
||||||
|
if (!candidateMap.has(key)) candidateMap.set(key, c);
|
||||||
|
};
|
||||||
|
sortedCandidateEvidence!.forEach(pushCandidate);
|
||||||
|
v4Source = Array.from(candidateMap.values());
|
||||||
|
frameCandidates = v4Source.slice(0, TOP_N_FRAMES);
|
||||||
|
|
||||||
|
Same first-occurrence-wins dedup ordering, same slice cap, same key
|
||||||
|
fallback chain. Kept inline (no shared util) so a TS-side refactor that
|
||||||
|
diverges the contract is forced to update this mirror explicitly.
|
||||||
|
"""
|
||||||
|
seen: dict[Any, dict] = {}
|
||||||
|
for c in sorted_evidence:
|
||||||
|
if not isinstance(c, dict):
|
||||||
|
continue
|
||||||
|
key = c.get("template_id") or c.get("id") or c.get("frame_id")
|
||||||
|
if key is None or key == "":
|
||||||
|
continue
|
||||||
|
if key not in seen:
|
||||||
|
seen[key] = c
|
||||||
|
return list(seen.values())[:_FRONTEND_TOP_N_FRAMES]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
def mdx04_env_toggle_run() -> dict:
    """Run the pipeline once on mdx04 and share the result across u7 tests.

    Returns ``{"run_id": ..., "completed_process": ..., "plan_payload": ...}``.

    The subprocess return code is deliberately not checked, because the
    IMP-85 downstream surface may make mdx04 exit non-zero (out of scope
    here). Only the presence of ``step09_application_plan.json`` matters.
    If that file was not written, the fixture xfails with the tails of
    stderr and stdout attached.
    """
    assert _SAMPLE_MDX.exists(), f"sample missing: {_SAMPLE_MDX}"

    # Unique run id so repeated or parallel runs do not share an output dir.
    run_id = f"imp39_u7_mdx04_{uuid.uuid4().hex[:8]}"

    # Inherit the parent environment but pin the AI fallback toggles off
    # explicitly, so the expectation is visible at the test level.
    child_env = {
        **os.environ,
        "AI_FALLBACK_ENABLED": "false",
        "AI_FALLBACK_AUTO_CACHE": "false",
    }
    command = [
        sys.executable,
        "-m",
        "src.phase_z2_pipeline",
        str(_SAMPLE_MDX),
        run_id,
    ]
    cp = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=240,
        cwd=str(_REPO_ROOT),
        env=child_env,
    )

    plan_path = _RUNS_DIR.joinpath(
        run_id, "phase_z2", "steps", "step09_application_plan.json"
    )
    if not plan_path.is_file():
        pytest.xfail(
            "mdx04 subprocess did not emit step09_application_plan.json "
            f"(IMP-85 area, out of scope for u7). returncode={cp.returncode}\n"
            f"--- stderr tail ---\n{cp.stderr[-1500:]}\n"
            f"--- stdout tail ---\n{cp.stdout[-1500:]}"
        )

    plan_payload = json.loads(plan_path.read_text(encoding="utf-8"))
    return {
        "run_id": run_id,
        "completed_process": cp,
        "plan_payload": plan_payload,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _units_with_v4(plan_payload: dict) -> list[dict]:
|
||||||
|
units = (plan_payload.get("data") or {}).get("units") or []
|
||||||
|
return [
|
||||||
|
u
|
||||||
|
for u in units
|
||||||
|
if isinstance(u.get("sorted_candidate_evidence"), list)
|
||||||
|
and u["sorted_candidate_evidence"]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mdx04_env_toggle_step9_emits_u3_payload_fields(mdx04_env_toggle_run):
    """Every Step 9 unit of the mdx04 run carries the u3 additive fields
    (``ranking_sort_policy`` + ``sorted_candidate_evidence``).

    This exercises u3 payload forwarding through the real subprocess path,
    not only an in-process helper. Each unit's forwarded policy must equal
    the policy yaml on the four compared keys. Per the original rationale,
    without this gate the u4 frontend primary path would silently degrade
    to the LABEL_PRIORITY warn-fallback.
    """
    plan = mdx04_env_toggle_run["plan_payload"]
    units = (plan.get("data") or {}).get("units") or []
    assert units, "mdx04 application_plan emitted zero units"

    yaml_policy = yaml.safe_load(_POLICY_YAML.read_text(encoding="utf-8"))
    # Expected values come straight from the single-source policy file.
    compared_fields = (
        "policy_type",
        "label_priority",
        "unknown_label_priority",
        "tie_break_axes",
    )
    expected = {field: yaml_policy[field] for field in compared_fields}

    for unit in units:
        assert "ranking_sort_policy" in unit, (
            f"unit {unit.get('unit_id')!r} missing ranking_sort_policy "
            "(u3 payload forwarding regressed)"
        )
        assert "sorted_candidate_evidence" in unit, (
            f"unit {unit.get('unit_id')!r} missing sorted_candidate_evidence "
            "(u3 payload forwarding regressed)"
        )
        forwarded = unit["ranking_sort_policy"]
        for field in compared_fields:
            assert forwarded.get(field) == expected[field]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mdx04_sorted_candidate_evidence_is_policy_sorted(mdx04_env_toggle_run):
    """``unit.sorted_candidate_evidence`` is already in policy order, so
    re-applying ``apply_ranking_sort`` must leave it unchanged.

    This pins the u2 selector ordering invariant against the real mdx04
    pipeline path: judgments are sorted BEFORE the selector loop appends
    trace entries. A later change that re-sorts the trace after iteration,
    or appends out of order, would fail here. Orders are compared as
    ``(label, confidence, template_id)`` triples.
    """
    plan = mdx04_env_toggle_run["plan_payload"]
    units_with_v4 = _units_with_v4(plan)
    assert units_with_v4, (
        "mdx04 application_plan units have no V4 evidence; cannot evaluate "
        "the sort-idempotency invariant"
    )

    def _signature(rows):
        # Compact, comparable view of an ordering.
        return [
            (c.get("label"), c.get("confidence"), c.get("template_id"))
            for c in rows
        ]

    policy = load_ranking_sort_policy()
    for unit in units_with_v4:
        evidence = unit["sorted_candidate_evidence"]
        resorted = apply_ranking_sort(
            evidence,
            policy=policy,
            label_key="label",
            confidence_key="confidence",
            v4_rank_key="v4_full_rank",
        )
        order_in = _signature(evidence)
        order_out = _signature(resorted)
        assert order_in == order_out, (
            f"unit {unit.get('unit_id')!r} sorted_candidate_evidence is not in "
            f"policy order (u2 selector-loop ordering regressed):\n"
            f" observed: {order_in[:6]}\n"
            f" expected: {order_out[:6]}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_mdx04_backend_frontend_rank_one_mirror(mdx04_env_toggle_run):
    """Stage 1 root-cause regression guard: the backend "rank 1" view must
    equal the frontend ``frame_candidates[0]`` view on real mdx04 data.

    Backend view is ``sorted_candidate_evidence[0]``, the head of the
    policy-sorted selector trace. Frontend view is the first entry of the
    dedup-then-slice mirror built by ``_frontend_frame_candidates``.

    For every V4-bearing unit the two must be the same candidate, matched
    on ``(template_id, label, confidence)``. The assertion is
    sample-agnostic: it holds because both sides read the same sorted
    source, not because of anything particular to mdx04.
    """
    plan = mdx04_env_toggle_run["plan_payload"]
    units_with_v4 = _units_with_v4(plan)
    assert units_with_v4, "no V4-bearing units in mdx04 application_plan"

    def _match_key(candidate):
        return (
            candidate.get("template_id"),
            candidate.get("label"),
            candidate.get("confidence"),
        )

    for unit in units_with_v4:
        evidence = unit["sorted_candidate_evidence"]
        backend_head = evidence[0]
        frontend_candidates = _frontend_frame_candidates(evidence)
        assert frontend_candidates, (
            f"unit {unit.get('unit_id')!r}: frontend dedup mirror produced "
            "an empty frame_candidates list (key fallback chain regressed)"
        )
        backend_key = _match_key(backend_head)
        frontend_key = _match_key(frontend_candidates[0])
        assert backend_key == frontend_key, (
            f"unit {unit.get('unit_id')!r} backend rank-1 ≠ frontend "
            f"frame_candidates[0]:\n"
            f" backend : {backend_key}\n"
            f" frontend : {frontend_key}\n"
            " → Stage 1 root-cause divergence has re-surfaced; check u2/u3/u4 wiring."
        )
|
||||||
|
|
||||||
|
|
||||||
|
def test_mdx04_application_status_ok_unit_selects_sorted_head(
    mdx04_env_toggle_run,
):
    """For units resolved via the rank-1 path, the selected candidate must
    be ``sorted_candidate_evidence[0]``.

    Only units with ``application_status == "ok"`` and
    ``selection_path == "rank_1"`` are examined. In each, the first
    evidence entry marked ``decision == "selected"`` must share its
    ``template_id`` with the head of the policy-sorted evidence.

    The test passes silently when no unit takes the ok+rank_1 path. That
    outcome depends on the sample's shape and is not contractually
    guaranteed for mdx04.
    """
    plan = mdx04_env_toggle_run["plan_payload"]
    checked = 0
    for unit in _units_with_v4(plan):
        on_rank_1_path = (
            unit.get("application_status") == "ok"
            and unit.get("selection_path") == "rank_1"
        )
        if not on_rank_1_path:
            continue

        evidence = unit["sorted_candidate_evidence"]
        head = evidence[0]
        selected_entries = [
            entry for entry in evidence if entry.get("decision") == "selected"
        ]
        assert selected_entries, (
            f"unit {unit.get('unit_id')!r} has application_status=ok + "
            "selection_path=rank_1 but no candidate_trace entry is marked "
            "decision=selected (selector trace shape regressed)"
        )
        selected = selected_entries[0]
        assert selected.get("template_id") == head.get("template_id"), (
            f"unit {unit.get('unit_id')!r}: backend selected template_id "
            f"{selected.get('template_id')!r} ≠ sorted_candidate_evidence[0]"
            f".template_id {head.get('template_id')!r}; u2 selector-loop "
            "order must place the selected candidate at index 0"
        )
        checked += 1

    # Deliberately no lower bound: zero ok+rank_1 units is an accepted
    # outcome, so this assertion is vacuous by design. The mirror
    # invariance test is the binding contract; this is a stricter
    # sub-invariant that only bites when the path is taken.
    assert checked >= 0
|
||||||
200
tests/phase_z2/test_label_priority_synthetic.py
Normal file
200
tests/phase_z2/test_label_priority_synthetic.py
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
"""IMP-39 u6 (issue #68) - synthetic divergence regression.
|
||||||
|
|
||||||
|
Loads the SYNTHETIC fixture under
|
||||||
|
``tests/phase_z2/fixtures/ranking_sort_policy/`` and asserts that the
|
||||||
|
single-source ranking policy
|
||||||
|
(``templates/phase_z2/catalog/ranking_sort_policy.yaml``, u1) resolves
|
||||||
|
the backend - frontend "rank 1" divergence captured in Stage 1
|
||||||
|
root-cause analysis.
|
||||||
|
|
||||||
|
Divergence scenario (Stage 1 root cause):
|
||||||
|
- Pre-policy backend iterates ``judgments_full32`` in raw V4
|
||||||
|
confidence-desc order (``src/phase_z2_pipeline.py`` selector loop
|
||||||
|
behavior before u2). High-confidence ``restructure`` at
|
||||||
|
``v4_full_rank=1`` wins; lower-confidence ``use_as_is`` further
|
||||||
|
down the list is shadowed.
|
||||||
|
- Frontend (``Front/client/src/services/designAgentApi.ts``)
|
||||||
|
re-sorts the same source by ``LABEL_PRIORITY asc + confidence
|
||||||
|
desc`` and surfaces ``use_as_is`` as ``frame_candidates[0]``.
|
||||||
|
- Backend "selected rank 1" and frontend ``frame_candidates[0]``
|
||||||
|
diverge.
|
||||||
|
|
||||||
|
Post-policy (u2 wires ``apply_ranking_sort`` into the selector after
|
||||||
|
the IMP-38 raw-window slice), backend selection order matches the
|
||||||
|
frontend ordering: ``use_as_is`` is rank 1 on both sides.
|
||||||
|
|
||||||
|
Scope (u6, Stage 2 plan):
|
||||||
|
- SYNTHETIC fixture only - sample-agnostic, no MDX 03/04/05
|
||||||
|
references, no real ``frame_id`` / ``template_id`` literals.
|
||||||
|
- Helper-level exercise of ``apply_ranking_sort`` (mirrors the
|
||||||
|
selector's policy step at
|
||||||
|
``src/phase_z2_pipeline.py:1186-1196``).
|
||||||
|
|
||||||
|
Out of scope (other units):
|
||||||
|
- u1 policy yaml shape: covered by ``test_ranking_sort_policy.py``.
|
||||||
|
- u2 selector wiring: integration covered elsewhere.
|
||||||
|
- u3 Step 9 payload forwarding.
|
||||||
|
- u4 frontend mirror.
|
||||||
|
- u7 mdx04 env-toggle e2e.
|
||||||
|
- u8 corpus audit over ``tests/matching/v4_full32_result.yaml``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
# Synthetic divergence fixture, resolved relative to this test module.
FIXTURE_PATH = Path(__file__).parent.joinpath(
    "fixtures", "ranking_sort_policy", "synthetic_divergence.yaml"
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _reset_policy_cache():
    """Clear the pipeline's cached ranking policy before and after each test.

    This uses the same isolation approach as test_ranking_sort_policy.py.
    It resets the module-level ``_RANKING_SORT_POLICY_CACHE`` to ``None``
    so no test sees a policy cached by another.
    """
    import src.phase_z2_pipeline as pipeline

    cache_attr = "_RANKING_SORT_POLICY_CACHE"
    setattr(pipeline, cache_attr, None)
    yield
    setattr(pipeline, cache_attr, None)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_fixture() -> dict:
    """Parse the synthetic divergence fixture at ``FIXTURE_PATH``.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``.
    """
    fixture_text = FIXTURE_PATH.read_text(encoding="utf-8")
    return yaml.safe_load(fixture_text)
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthetic_fixture_shape_is_intact():
    """Guard the fixture's structure so later tests can rely on it.

    Checks the fixture id, the sample-agnostic flag, that there are exactly
    four raw judgments covering all four labels, that both expected-order
    lists have one entry per judgment, and that the divergence axis is
    coherent. Pre- and post-policy rank 1 must differ, and post-policy
    rank 1 must match the frontend's candidate 0.
    """
    fixture = _load_fixture()

    assert fixture["fixture_id"] == "synthetic_divergence"
    assert fixture["sample_agnostic"] is True

    raw = fixture["raw_judgments"]
    assert len(raw) == 4
    all_labels = {"use_as_is", "light_edit", "restructure", "reject"}
    assert set(row["label"] for row in raw) == all_labels

    for order_field in ("expected_legacy_raw_order", "expected_policy_sorted_order"):
        assert len(fixture[order_field]) == len(raw)

    axis = fixture["divergence_axis"]
    assert axis["pre_policy_rank_1_tag"] != axis["post_policy_rank_1_tag"]
    assert axis["post_policy_rank_1_tag"] == axis["frontend_candidate_0_tag"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_legacy_raw_order_demonstrates_divergence():
    """Pre-policy raw V4 confidence-desc order is the divergence source.

    Verifies that the fixture's raw order matches
    ``expected_legacy_raw_order``, that the raw rank-1 entry is the
    ``restructure`` record named by the divergence axis, and that a
    ``use_as_is`` record sits behind it with strictly lower confidence.
    """
    fixture = _load_fixture()
    raw = fixture["raw_judgments"]

    assert [j["tag"] for j in raw] == fixture["expected_legacy_raw_order"]

    pre_rank_1 = raw[0]
    assert pre_rank_1["tag"] == fixture["divergence_axis"]["pre_policy_rank_1_tag"]
    assert pre_rank_1["label"] == "restructure"

    # Use a default so a fixture edit that drops the shadowed record fails
    # with a readable assertion instead of a bare StopIteration.
    higher_priority_shadowed = next(
        (j for j in raw[1:] if j["label"] == "use_as_is"),
        None,
    )
    assert higher_priority_shadowed is not None, (
        "fixture must contain a use_as_is judgment behind the raw rank-1 entry"
    )
    assert higher_priority_shadowed["confidence"] < pre_rank_1["confidence"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_ranking_sort_resolves_divergence():
    """Applying the policy sort moves the higher-priority label to rank 1.

    The sorted tag order must equal ``expected_policy_sorted_order`` and the
    first entry must be the ``use_as_is`` record named by the divergence axis.
    """
    from src.phase_z2_pipeline import apply_ranking_sort

    data = _load_fixture()

    # The fixture stores its V4 rank under ``v4_full_rank``.
    field_keys = {
        "label_key": "label",
        "confidence_key": "confidence",
        "v4_rank_key": "v4_full_rank",
    }
    ranked = apply_ranking_sort(data["raw_judgments"], **field_keys)

    ranked_tags = [entry["tag"] for entry in ranked]
    assert ranked_tags == data["expected_policy_sorted_order"]

    winner = ranked[0]
    assert winner["label"] == "use_as_is"
    assert winner["tag"] == data["divergence_axis"]["post_policy_rank_1_tag"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_backend_rank_1_aligns_with_frontend_candidate_zero():
    """Backend selector policy step and frontend candidate ordering agree.

    Mirrors the selector policy step at
    ``src/phase_z2_pipeline.py:1186-1196`` (u2 wiring) and the frontend
    ``frame_candidates[0]`` derivation from ``sorted_candidate_evidence``
    (``Front/client/src/services/designAgentApi.ts`` u4 wiring). The
    selector's MVP1 status gate / contract / capacity checks are
    out of scope - u8 corpus audit exercises the real
    catalog-registered flow.

    The backend side is ``apply_ranking_sort(...)[0]``. The frontend side is
    derived independently here by taking the minimum of the raw judgments
    under a local (label priority asc, confidence desc, v4 rank asc) key
    built from the loaded policy, so the agreement assertion can actually
    fail. NOTE(review): the local key is a Python stand-in for the
    TypeScript ordering, not the frontend code itself - confirm it matches.
    """
    from src.phase_z2_pipeline import (
        apply_ranking_sort,
        load_ranking_sort_policy,
    )

    fixture = _load_fixture()
    policy = load_ranking_sort_policy()
    raw = fixture["raw_judgments"]

    sorted_window = apply_ranking_sort(
        raw,
        policy=policy,
        label_key="label",
        confidence_key="confidence",
        v4_rank_key="v4_full_rank",
    )
    backend_rank_1 = sorted_window[0]

    label_priority = policy["label_priority"]
    unknown_priority = policy["unknown_label_priority"]

    def _frontend_sort_key(judgment):
        # Missing-field defaults follow the contract documented for the
        # helper: confidence -> 0.0, v4 rank -> 10**9.
        return (
            label_priority.get(judgment.get("label"), unknown_priority),
            -judgment.get("confidence", 0.0),
            judgment.get("v4_full_rank", 10**9),
        )

    # min() returns the first minimal element, matching a stable sort's [0].
    frontend_candidate_0 = min(raw, key=_frontend_sort_key)

    expected_tag = fixture["divergence_axis"]["frontend_candidate_0_tag"]
    assert backend_rank_1["tag"] == expected_tag
    assert frontend_candidate_0["tag"] == expected_tag
    assert backend_rank_1["tag"] == frontend_candidate_0["tag"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_input_list_is_not_mutated():
    """Fixture list reference and order survive ``apply_ranking_sort``.

    Asserts that the helper returns a list object distinct from its input,
    that the input's tag order is unchanged, and that no record in the input
    was modified in place.
    """
    from copy import deepcopy

    from src.phase_z2_pipeline import apply_ranking_sort

    fixture = _load_fixture()
    raw = fixture["raw_judgments"]
    snapshot_tags = [j["tag"] for j in raw]
    # Deep copy so in-place edits to individual records are detected too.
    snapshot_records = deepcopy(raw)

    out = apply_ranking_sort(
        raw,
        label_key="label",
        confidence_key="confidence",
        v4_rank_key="v4_full_rank",
    )

    assert out is not raw
    assert [j["tag"] for j in raw] == snapshot_tags
    assert raw == snapshot_records
|
||||||
|
|
||||||
|
|
||||||
|
def test_pre_policy_legacy_order_can_be_reproduced():
    """The fixture's legacy order equals a plain confidence-desc sort.

    Sorting ``raw_judgments`` by confidence alone (policy ignored) must
    reproduce ``expected_legacy_raw_order``; this keeps the divergence axis
    honest when the fixture is edited.
    """
    data = _load_fixture()
    judgments = data["raw_judgments"]

    by_confidence_desc = sorted(
        judgments,
        key=lambda entry: entry["confidence"],
        reverse=True,
    )
    legacy_tags = [entry["tag"] for entry in by_confidence_desc]

    assert legacy_tags == data["expected_legacy_raw_order"]
|
||||||
240
tests/test_ranking_sort_policy.py
Normal file
240
tests/test_ranking_sort_policy.py
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
"""IMP-39 u5 (issue #68) — pure permutation tests for the single-source
|
||||||
|
ranking sort policy helpers (`load_ranking_sort_policy` / `apply_ranking_sort`).
|
||||||
|
|
||||||
|
Sample-agnostic by design: no MDX 03/04/05 references, no real frame_id /
|
||||||
|
template_id literals. Inputs are synthetic permutations of the 4 labels
|
||||||
|
(`use_as_is` / `light_edit` / `restructure` / `reject`), confidence ties,
|
||||||
|
and `v4_rank` tie-breaks. Validates the ordering contract declared by
|
||||||
|
`templates/phase_z2/catalog/ranking_sort_policy.yaml` (u1).
|
||||||
|
|
||||||
|
Scope-lock (Stage 2 u5):
|
||||||
|
- Label priority dominance over confidence.
|
||||||
|
- Confidence-desc within same label.
|
||||||
|
- v4_rank-asc tie-break within same (label, confidence).
|
||||||
|
- Unknown label sinks to `unknown_label_priority` (deterministic bottom).
|
||||||
|
- Missing confidence → 0.0; missing v4_rank → 10**9 (deterministic sink).
|
||||||
|
- Input list NOT mutated; helper returns a NEW list (Python `sorted`).
|
||||||
|
- Attribute access path (V4Match-like object), not only dict access.
|
||||||
|
- Stable sort on full equality (input order preserved).
|
||||||
|
- Loader returns yaml-shape policy with all required keys.
|
||||||
|
|
||||||
|
Out of scope: selector wiring (u2), Step 9 payload forwarding (u3),
|
||||||
|
frontend mirror (u4), synthetic divergence fixture (u6), env-toggle
|
||||||
|
e2e (u7), corpus audit (u8).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _reset_policy_cache():
    """Test isolation: null out `_RANKING_SORT_POLICY_CACHE` before and after each test."""
    import src.phase_z2_pipeline as z2_pipeline

    cache_attr = "_RANKING_SORT_POLICY_CACHE"
    setattr(z2_pipeline, cache_attr, None)
    yield
    setattr(z2_pipeline, cache_attr, None)
|
||||||
|
|
||||||
|
|
||||||
|
def _rec(label: str, confidence: float, v4_rank: int, tag: str = "") -> dict:
|
||||||
|
"""Helper — synthetic judgment record (no sample-specific fields)."""
|
||||||
|
return {
|
||||||
|
"label": label,
|
||||||
|
"confidence": confidence,
|
||||||
|
"v4_rank": v4_rank,
|
||||||
|
"tag": tag,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_returns_yaml_shape_policy():
    """The loaded policy carries policy_type, label_priority, unknown priority and tie-break axes."""
    from src.phase_z2_pipeline import load_ranking_sort_policy

    loaded = load_ranking_sort_policy()

    expected_label_priority = {
        "use_as_is": 0,
        "light_edit": 1,
        "restructure": 2,
        "reject": 3,
    }
    expected_tie_breaks = ["confidence_desc", "v4_rank_asc"]

    assert loaded["policy_type"] == "deterministic_label_priority_then_confidence"
    assert loaded["label_priority"] == expected_label_priority
    assert loaded["unknown_label_priority"] == 99
    assert loaded["tie_break_axes"] == expected_tie_breaks
|
||||||
|
|
||||||
|
|
||||||
|
def test_label_priority_dominates_confidence():
    """Label priority outranks confidence: a 0.05 use_as_is beats a 0.99 reject."""
    from src.phase_z2_pipeline import apply_ranking_sort

    # (label, confidence, v4_rank, tag): confidence falls as label priority rises.
    rows = [
        ("reject", 0.99, 1, "rej-top"),
        ("restructure", 0.92, 2, "restr-high"),
        ("light_edit", 0.50, 3, "light-mid"),
        ("use_as_is", 0.05, 4, "uai-bottom"),
    ]
    records = [_rec(label, conf, rank, tag=tag) for label, conf, rank, tag in rows]

    ranked_tags = [entry["tag"] for entry in apply_ranking_sort(records)]

    assert ranked_tags == [
        "uai-bottom",
        "light-mid",
        "restr-high",
        "rej-top",
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def test_confidence_desc_within_same_label():
    """With one shared label, records order by confidence descending regardless of v4_rank."""
    from src.phase_z2_pipeline import apply_ranking_sort

    rows = [
        (0.40, 5, "le-low"),
        (0.85, 9, "le-high"),
        (0.65, 2, "le-mid"),
    ]
    records = [_rec("light_edit", conf, rank, tag=tag) for conf, rank, tag in rows]

    ranked_tags = [entry["tag"] for entry in apply_ranking_sort(records)]

    assert ranked_tags == ["le-high", "le-mid", "le-low"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_v4_rank_asc_tie_break_on_equal_confidence():
    """When label and confidence tie, the lower v4_rank comes first."""
    from src.phase_z2_pipeline import apply_ranking_sort

    rows = [
        (7, "uai-rank7"),
        (3, "uai-rank3"),
        (5, "uai-rank5"),
    ]
    records = [_rec("use_as_is", 0.50, rank, tag=tag) for rank, tag in rows]

    ranked_tags = [entry["tag"] for entry in apply_ranking_sort(records)]

    assert ranked_tags == ["uai-rank3", "uai-rank5", "uai-rank7"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_label_sinks_to_bottom():
    """A label absent from `label_priority` sorts last, even with the highest confidence."""
    from src.phase_z2_pipeline import apply_ranking_sort

    unknown = _rec("totally_unknown_label", 0.99, 1, tag="unk-top-conf")
    rejected = _rec("reject", 0.05, 4, tag="rej-low")
    usable = _rec("use_as_is", 0.10, 2, tag="uai-low")

    ranked = apply_ranking_sort([unknown, rejected, usable])

    assert [entry["tag"] for entry in ranked] == ["uai-low", "rej-low", "unk-top-conf"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_fields_use_deterministic_defaults():
    """A record lacking confidence and v4_rank sorts after explicit zero-confidence peers."""
    from src.phase_z2_pipeline import apply_ranking_sort

    bare_record = {"label": "use_as_is", "tag": "uai-no-conf-no-rank"}
    records = [
        bare_record,
        _rec("use_as_is", 0.0, 1, tag="uai-zero-conf-rank1"),
        _rec("use_as_is", 0.0, 2, tag="uai-zero-conf-rank2"),
    ]

    ranked_tags = [entry["tag"] for entry in apply_ranking_sort(records)]

    # Expected contract: all three share label_priority=0 and confidence=0.0
    # (missing confidence -> 0.0), so v4_rank asc decides; a missing v4_rank
    # -> 10**9 puts the bare record last.
    expected_tags = [
        "uai-zero-conf-rank1",
        "uai-zero-conf-rank2",
        "uai-no-conf-no-rank",
    ]
    assert ranked_tags == expected_tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_input_list_is_not_mutated():
    """The helper hands back a different list object and leaves the input order alone."""
    from src.phase_z2_pipeline import apply_ranking_sort

    records = [
        _rec("reject", 0.99, 1, tag="rej"),
        _rec("use_as_is", 0.05, 2, tag="uai"),
    ]
    ids_before = list(map(id, records))
    tags_before = [entry["tag"] for entry in records]

    result = apply_ranking_sort(records)

    assert result is not records
    assert [entry["tag"] for entry in records] == tags_before
    assert list(map(id, records)) == ids_before
    # The result holds the very same dict objects: reordered, not copied.
    assert set(map(id, result)) == set(ids_before)
|
||||||
|
|
||||||
|
|
||||||
|
def test_attribute_access_path_for_object_records():
    """Records that expose fields as attributes (no item access) are sorted too."""
    from src.phase_z2_pipeline import apply_ranking_sort

    @dataclass
    class _Match:
        label: str
        confidence: float
        v4_rank: int
        tag: str

    restructure_first = _Match(label="restructure", confidence=0.92, v4_rank=1, tag="restr")
    usable_second = _Match(label="use_as_is", confidence=0.41, v4_rank=2, tag="uai")

    ranked = apply_ranking_sort([restructure_first, usable_second])

    assert [match.tag for match in ranked] == ["uai", "restr"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_stable_sort_preserves_input_order_on_full_equality():
    """Records with identical label, confidence and v4_rank keep their input order."""
    from src.phase_z2_pipeline import apply_ranking_sort

    input_tags = ["le-first", "le-second", "le-third"]
    records = [_rec("light_edit", 0.70, 5, tag=tag) for tag in input_tags]

    ranked = apply_ranking_sort(records)

    assert [entry["tag"] for entry in ranked] == ["le-first", "le-second", "le-third"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_explicit_policy_argument_overrides_loader():
    """A policy passed via ``policy=`` decides the order instead of the loaded one."""
    from src.phase_z2_pipeline import apply_ranking_sort

    # Synthetic override with the priority order inverted: reject ranks
    # first, use_as_is last.
    inverted_order = ("reject", "restructure", "light_edit", "use_as_is")
    inverted = {
        "policy_type": "synthetic_inverted",
        "label_priority": {label: rank for rank, label in enumerate(inverted_order)},
        "unknown_label_priority": 99,
        "tie_break_axes": ["confidence_desc", "v4_rank_asc"],
    }

    records = [
        _rec("use_as_is", 0.50, 1, tag="uai"),
        _rec("reject", 0.50, 2, tag="rej"),
    ]

    ranked = apply_ranking_sort(records, policy=inverted)

    assert [entry["tag"] for entry in ranked] == ["rej", "uai"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_field_keys_route_through_helper():
    """Custom `label_key` / `confidence_key` / `v4_rank_key` names are honoured as-is."""
    from src.phase_z2_pipeline import apply_ranking_sort

    rejected = {"lbl": "reject", "conf": 0.99, "rk": 1, "tag": "rej"}
    usable = {"lbl": "use_as_is", "conf": 0.10, "rk": 2, "tag": "uai"}
    renamed_keys = {
        "label_key": "lbl",
        "confidence_key": "conf",
        "v4_rank_key": "rk",
    }

    ranked = apply_ranking_sort([rejected, usable], **renamed_keys)

    assert [entry["tag"] for entry in ranked] == ["uai", "rej"]
|
||||||
Reference in New Issue
Block a user