"""IMP-35 (#64) u11 — baseline-red invariance gate.

Stage 2 binding contract (unit u11):
    IMP-35 inherits a four-test red baseline from prior phases that is
    explicitly OUT OF SCOPE for this issue:

      1. tests/test_imp47b_step12_ai_wiring.py
         ::test_mixed_units_classified_by_route_and_provisional_flag
      2. tests/test_imp47b_step12_ai_wiring.py
         ::test_reject_provisional_unit_reaches_router_short_circuit
      3. tests/test_imp47b_step12_ai_wiring.py
         ::test_step12_ai_repair_artifact_writes_json_serialisable_records
      4. tests/test_phase_z2_ai_fallback_config.py
         ::test_ai_fallback_master_flag_default_off

    u11 does NOT fix these. u11 LOCKS the count + identity of the
    baseline-red set so that IMP-35 cannot silently grow the red surface
    while the issue is in-flight. A follow-up issue (Stage 2 plan
    `follow_up_candidates`) tracks the actual repair.

Invariance semantics:
    - The exact four baseline-red node ids resolve to real, collectible
      pytest items (a rename / delete is caught up front; the gate cannot
      be defeated by silently removing the failing test).
    - Running pytest on the BROADER baseline-area files
      (``tests/test_imp47b_step12_ai_wiring.py`` +
      ``tests/test_phase_z2_ai_fallback_config.py``) yields EXACTLY four
      FAILED node ids and zero ERROR node ids; the FAILED set is exactly
      the documented baseline-red set.
    - A NEW red introduced by IMP-35 in the baseline area flips the
      FAILED count above four AND/OR introduces an extra FAILED node id
      that is not in the baseline set; either branch fails this gate.

AI isolation contract (`feedback_ai_isolation_contract`):
    The invariance gate runs pytest in a child process and parses stdout.
    It must NOT import the Anthropic SDK and must NOT route through
    ``route_ai_fallback``. The structural import test below locks this.

Stage 2 plan source: Stage 2 exit report u11 — "u11 acknowledges the
current four red baseline tests as pre-existing and adds an invariance
gate so IMP-35 cannot worsen them."
"""
from __future__ import annotations

import ast
import re
import subprocess
import sys
from pathlib import Path


# === BASELINE-RED REGISTRY (frozen by Stage 2 u11 contract) ===
#
# Order is informational only; the gate compares as a set. Each entry
# is a fully-qualified pytest node id resolvable from the repo root.
IMP35_BASELINE_RED_NODE_IDS: tuple[str, ...] = (
    "tests/test_imp47b_step12_ai_wiring.py"
    "::test_mixed_units_classified_by_route_and_provisional_flag",
    "tests/test_imp47b_step12_ai_wiring.py"
    "::test_reject_provisional_unit_reaches_router_short_circuit",
    "tests/test_imp47b_step12_ai_wiring.py"
    "::test_step12_ai_repair_artifact_writes_json_serialisable_records",
    "tests/test_phase_z2_ai_fallback_config.py"
    "::test_ai_fallback_master_flag_default_off",
)

# Files that own the baseline-red set. The "no-new-red in baseline area"
# axis runs pytest on this set and checks that ONLY the registry above
# fails.
IMP35_BASELINE_RED_AREA_FILES: tuple[str, ...] = (
    "tests/test_imp47b_step12_ai_wiring.py",
    "tests/test_phase_z2_ai_fallback_config.py",
)


# === Repo root resolution (subprocess CWD anchor) ===

# This module lives at
# tests/phase_z2/test_imp35_baseline_red_invariance.py, so
# parents[0] = tests/phase_z2, parents[1] = tests, parents[2] = repo root.
_REPO_ROOT: Path = Path(__file__).resolve().parents[2]


# === pytest stdout parsers ===

# Matches lines like:
#   FAILED tests/test_imp47b_step12_ai_wiring.py::test_xxx
# and:
#   FAILED tests/test_imp47b_step12_ai_wiring.py::test_xxx - AssertionError: ...
# The capture group is the bare node id (no trailing failure detail).
_FAILED_LINE_RE = re.compile(r"^FAILED\s+(\S+?)(?:\s+-\s+.*)?$", re.MULTILINE)

# Matches lines like:
#   ERROR tests/test_xxx.py::test_yyy
_ERROR_LINE_RE = re.compile(r"^ERROR\s+(\S+?)(?:\s+-\s+.*)?$", re.MULTILINE)

# Matches the pytest tail summary line (sub-second timing field varies):
#   4 failed, 6 passed in 2.27s
# The named group holds everything before the " in <seconds>s" suffix.
# NOTE(review): the group name ``summary`` is a reconstruction. The
# pattern previously read ``(?P.*?)``, which ``re.compile`` rejects
# ("unknown extension ?P."), so the module could not even be imported.
# Nothing in this module reads the group; confirm the name against any
# external consumer before relying on it.
_TAIL_SUMMARY_RE = re.compile(
    r"^(?P<summary>.*?)\s+in\s+\d+(?:\.\d+)?s\s*$", re.MULTILINE
)


def _run_pytest_child(pytest_args: tuple[str, ...]) -> subprocess.CompletedProcess:
    """Run ``python -m pytest <pytest_args>`` from the repo root.

    The child uses the same interpreter as the parent (``sys.executable``),
    captures stdout/stderr as text, and never raises on a non-zero exit
    (``check=False``) — callers inspect ``returncode`` themselves.
    """
    return subprocess.run(
        [sys.executable, "-m", "pytest", *pytest_args],
        cwd=_REPO_ROOT,
        capture_output=True,
        text=True,
        check=False,
    )


def _run_pytest_collect_only(node_ids: tuple[str, ...]) -> subprocess.CompletedProcess:
    """Run ``pytest --collect-only -q`` against the supplied node ids.

    Used to confirm the baseline-red registry resolves to real, currently
    collectible tests. If a test is renamed / moved / deleted out from
    under the registry, pytest's collection failure is the signal.
    """
    return _run_pytest_child(("--collect-only", "-q", *node_ids))


def _run_pytest_quiet(targets: tuple[str, ...]) -> subprocess.CompletedProcess:
    """Run ``pytest -q --tb=no -p no:cacheprovider`` against ``targets``.

    ``-p no:cacheprovider`` keeps the gate hermetic across reruns; the
    parent pytest invocation that triggers this child process must not
    poison or be poisoned by the child's cache state.
    """
    return _run_pytest_child(
        ("-q", "--tb=no", "-p", "no:cacheprovider", *targets)
    )


def _parse_failed_node_ids(stdout: str) -> set[str]:
    """Extract the set of FAILED node ids from pytest's ``--tb=no -q`` stdout."""
    return {match.group(1) for match in _FAILED_LINE_RE.finditer(stdout)}


def _parse_error_node_ids(stdout: str) -> set[str]:
    """Extract the set of ERROR node ids from pytest's ``--tb=no -q`` stdout."""
    return {match.group(1) for match in _ERROR_LINE_RE.finditer(stdout)}


# === Tests ===


def test_imp35_baseline_red_registry_has_exactly_four_node_ids() -> None:
    """The baseline-red registry is a frozen four-tuple (Stage 2 u11 lock)."""
    assert len(IMP35_BASELINE_RED_NODE_IDS) == 4
    assert len(set(IMP35_BASELINE_RED_NODE_IDS)) == 4, (
        "IMP-35 baseline-red registry must not contain duplicate node ids; "
        "duplicates would silently weaken the invariance gate."
    )


def test_imp35_baseline_red_registry_node_ids_are_well_formed() -> None:
    """Each baseline-red node id must look like ``tests/<file>.py::<test_name>``."""
    for node_id in IMP35_BASELINE_RED_NODE_IDS:
        assert node_id.startswith("tests/"), (
            f"IMP-35 baseline-red registry node id {node_id!r} must live "
            "under tests/ — registry entries point at repo-rooted node ids."
        )
        assert ".py::" in node_id, (
            f"IMP-35 baseline-red registry node id {node_id!r} must use the "
            ".py:: pytest node id grammar."
        )


def test_imp35_baseline_red_registry_files_match_area_inventory() -> None:
    """Registry node ids must all live in declared baseline-area files.

    Locks the cross-axis link between :data:`IMP35_BASELINE_RED_NODE_IDS`
    and :data:`IMP35_BASELINE_RED_AREA_FILES` — adding a registry entry
    without expanding the area sweep (or vice versa) is the kind of
    half-wiring that would silently let the gate miss new reds.
    """
    declared_files = set(IMP35_BASELINE_RED_AREA_FILES)
    for node_id in IMP35_BASELINE_RED_NODE_IDS:
        # Everything before the first "::" is the file portion of the id.
        file_part, _, _ = node_id.partition("::")
        assert file_part in declared_files, (
            f"IMP-35 baseline-red registry entry {node_id!r} references "
            f"{file_part!r}, which is not in IMP35_BASELINE_RED_AREA_FILES. "
            "Update both lists together or the area sweep will miss new reds."
        )


def test_imp35_baseline_red_node_ids_resolve_to_collectible_tests() -> None:
    """``pytest --collect-only`` must resolve every baseline-red node id.

    A failure here means a baseline-red test was renamed / deleted /
    moved out from under the gate; the registry must be updated in the
    same commit (or, if the test was fixed, the follow-up issue must
    deregister it).
    """
    result = _run_pytest_collect_only(IMP35_BASELINE_RED_NODE_IDS)
    # ``pytest --collect-only`` exits 0 on full collection, 2/4/5 on
    # collection errors. Exit code 5 = no tests collected ("not found").
    assert result.returncode == 0, (
        "pytest --collect-only failed for the IMP-35 baseline-red "
        f"registry (rc={result.returncode}).\n"
        f"STDOUT:\n{result.stdout}\n"
        f"STDERR:\n{result.stderr}"
    )


def test_imp35_baseline_red_invariance_gate_failed_set_matches_registry() -> None:
    """Running pytest on the baseline area must FAIL EXACTLY the registry.

    This is the core invariance contract. If IMP-35 work breaks a 5th
    test in the baseline area, the FAILED set diverges from the registry
    and this gate trips. If IMP-35 accidentally fixes one of the four,
    the FAILED set shrinks below four and this gate also trips — at
    which point the fixed test is removed from the registry (the
    follow-up issue deregisters it) and the gate is re-locked.
    """
    result = _run_pytest_quiet(IMP35_BASELINE_RED_AREA_FILES)

    # The baseline area is currently red: pytest MUST exit non-zero. A
    # zero return code here would mean the baseline magically went green
    # (or the parser missed the failures); both branches require human
    # review before the registry is updated.
    assert result.returncode != 0, (
        "IMP-35 baseline-red area is expected to fail (4 known reds). "
        "A clean pytest exit means either the baseline was unexpectedly "
        "fixed (deregister via follow-up issue) or the gate's subprocess "
        "did not reach the failing tests.\n"
        f"STDOUT:\n{result.stdout}\n"
        f"STDERR:\n{result.stderr}"
    )

    failed_ids = _parse_failed_node_ids(result.stdout)
    error_ids = _parse_error_node_ids(result.stdout)
    expected = set(IMP35_BASELINE_RED_NODE_IDS)

    # ERROR-state items are checked first so a collection/setup problem
    # is reported as such rather than as FAILED-set drift.
    assert not error_ids, (
        "IMP-35 baseline-red invariance gate found ERROR-state tests "
        f"in the baseline area (expected zero): {sorted(error_ids)}.\n"
        f"STDOUT:\n{result.stdout}"
    )

    assert failed_ids == expected, (
        "IMP-35 baseline-red invariance gate detected drift between the "
        "registered baseline-red set and the actual pytest FAILED set.\n"
        f" registered (expected): {sorted(expected)}\n"
        f" actual (observed): {sorted(failed_ids)}\n"
        f" unexpected new reds: {sorted(failed_ids - expected)}\n"
        f" unexpectedly green: {sorted(expected - failed_ids)}\n"
        "If new reds appear above, IMP-35 has silently grown the red "
        "surface (u11 contract violation). If reds are unexpectedly "
        "green, the follow-up issue must deregister them.\n"
        f"STDOUT:\n{result.stdout}"
    )


def test_imp35_baseline_red_invariance_gate_failed_count_is_exactly_four() -> None:
    """Count-only assertion: the baseline area has exactly four FAILED nodes.

    Complements the identity check above. Even if a parser bug or
    output-format change ever weakens the identity check, the bare count
    still catches the "did a new red sneak in?" failure mode.
    """
    result = _run_pytest_quiet(IMP35_BASELINE_RED_AREA_FILES)
    failed_ids = _parse_failed_node_ids(result.stdout)
    assert len(failed_ids) == 4, (
        "IMP-35 baseline-red invariance gate expected exactly 4 FAILED "
        f"node ids in the baseline area; observed {len(failed_ids)}: "
        f"{sorted(failed_ids)}.\n"
        f"STDOUT:\n{result.stdout}"
    )


def test_imp35_baseline_red_invariance_module_has_no_ai_imports() -> None:
    """AI isolation contract — u11 invariance gate must stay pure stdlib.

    Mirrors the structural import lock used by u6 / u7 / u10. The gate
    is deterministic-with-data (subprocess pytest + regex parse); any
    Anthropic SDK import or route through the AI fallback router would
    violate the ``feedback_ai_isolation_contract`` lock.

    The check is AST-based so the assertion bodies (which reference
    forbidden tokens by name) do not self-trigger a string-substring
    false positive.
    """
    forbidden_module_prefix = "anthropic"
    forbidden_attr_substring = "route_ai_fallback"

    # Parse this very file; only import statements and call targets are
    # inspected, so string literals and docstrings are ignored.
    module_source = Path(__file__).read_text(encoding="utf-8")
    tree = ast.parse(module_source)

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                # Compare only the top-level package of a dotted import.
                root = alias.name.split(".", 1)[0]
                assert root != forbidden_module_prefix, (
                    "IMP-35 u11 invariance gate must not import the "
                    f"Anthropic SDK (found ``import {alias.name}``)."
                )
        elif isinstance(node, ast.ImportFrom):
            # ``from . import x`` has no module name; nothing to check.
            if node.module is None:
                continue
            root = node.module.split(".", 1)[0]
            assert root != forbidden_module_prefix, (
                "IMP-35 u11 invariance gate must not import from the "
                f"Anthropic SDK (found ``from {node.module} import ...``)."
            )
            for alias in node.names:
                assert forbidden_attr_substring not in alias.name, (
                    "IMP-35 u11 invariance gate must not route through the "
                    "AI fallback router (found "
                    f"``from {node.module} import {alias.name}``)."
                )
        elif isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name):
                assert forbidden_attr_substring not in func.id, (
                    "IMP-35 u11 invariance gate must not call into the "
                    f"AI fallback router (found call to ``{func.id}``)."
                )
            elif isinstance(func, ast.Attribute):
                assert forbidden_attr_substring not in func.attr, (
                    "IMP-35 u11 invariance gate must not call into the "
                    f"AI fallback router (found call to ``.{func.attr}``)."
                )