# C.E.L_Slide_test2/tests/test_imp47b_failure_surface.py

"""IMP-47B u8 — slide_status.ai_repair_status surfacing tests.

Scope (this slice):
  Helper ``_summarize_ai_repair_status(ai_repair_records, coverage_invariant)``
  (src/phase_z2_pipeline.py) composes u4 gather ``error`` + u5
  ``apply_status`` + u7 ``coverage_invariant`` into a single
  ``ai_repair_status`` axis attached to ``slide_status``. Failure-axis
  priority (highest → lowest): ``error`` > ``coverage_violated`` >
  ``unsupported_kind`` > ``applied`` > ``ok``. ``human_review_required``
  flips True on the three failure axes for u11 frontend surfacing.

The frontend reads ``slide_status.ai_repair_status`` to render a
notification per the IMP-47B policy ("AI call failure / proposal validation
failure / coverage shortfall → frontend notification"). u9~u13 are out of scope.
The helper is pure (no IO, no AI call) so synthetic record / invariant
dicts exercise every branch directly.
"""
from __future__ import annotations

from src.phase_z2_pipeline import _summarize_ai_repair_status


def _record(
    *,
    unit_index: int = 0,
    apply_status: str | None = None,
    error: str | None = None,
    source_section_ids: list[str] | None = None,
    api_error_kind: str | None = None,
) -> dict:
    """Minimal Step 12 AI repair record stub — fields u8 reads.

    IMP-92 u3 — ``api_error_kind`` is stamped by Step 12 (u2 classifier)
    on the exception path; non-error paths leave it ``None``.
    """
    return {
        "unit_index": unit_index,
        "source_section_ids": source_section_ids or [f"MOCK_S{unit_index}"],
        "apply_status": apply_status,
        "error": error,
        "api_error_kind": api_error_kind,
    }


_OK_COVERAGE = {"status": "ok", "dropped_section_ids": []}
_VIOLATED_COVERAGE = {"status": "violated", "dropped_section_ids": ["MOCK_S2"]}


# ─── Case 1 : empty pipeline → status='ok' ──────────────────────────


def test_empty_records_returns_ok_no_human_review():
    """An empty record list summarizes to status='ok' without human review.

    This is where the flag-off default (no provisional units) lands: no
    AI work was executed, so every list is empty and the total is zero.

    IMP-92 u3 — the ``api_error_kinds`` aggregation must always be
    present with each kind initialised to 0, so the frontend operational
    formatter can read the bucket structure unconditionally.
    """
    summary = _summarize_ai_repair_status([], _OK_COVERAGE)

    assert summary["status"] == "ok"
    assert summary["human_review_required"] is False
    assert summary["counts"]["total"] == 0

    # Every per-record / per-section list must come back empty.
    for list_key in (
        "unsupported_kind_records",
        "error_records",
        "dropped_section_ids",
    ):
        assert summary[list_key] == []

    zeroed_buckets = {"quota": 0, "billing": 0, "auth": 0, "other": 0}
    assert summary["api_error_kinds"] == zeroed_buckets


# ─── Case 2 : applied → status='applied', no human_review ───────────


def test_applied_partial_overrides_marks_applied_no_human_review():
    """Happy path: a successfully applied PARTIAL_OVERRIDES repair.

    The summary must report status='applied' and must not request human
    review; the record is tallied under ``applied`` and not ``error``.
    """
    applied_record = _record(apply_status="applied:partial_overrides")
    summary = _summarize_ai_repair_status([applied_record], _OK_COVERAGE)

    assert summary["status"] == "applied"
    assert summary["human_review_required"] is False

    tallies = summary["counts"]
    assert tallies["applied"] == 1
    assert tallies["error"] == 0


# ─── Case 3 : unsupported kind → status='unsupported_kind' ──────────


def test_unsupported_kind_marks_human_review_required():
    """An unsupported proposal kind must be flagged for human review.

    u5 surfaces ``unsupported_kind_for_reject_route:<kind>`` for
    builder_options_patch / slot_mapping_proposal. u8 has to classify
    that as human_review_required so the frontend renders a
    notification, and echo the offending record back.
    """
    unsupported_status = "unsupported_kind_for_reject_route:builder_options_patch"
    unsupported_record = _record(
        unit_index=1,
        apply_status=unsupported_status,
        source_section_ids=["MOCK_S1"],
    )

    summary = _summarize_ai_repair_status([unsupported_record], _OK_COVERAGE)

    assert summary["status"] == "unsupported_kind"
    assert summary["human_review_required"] is True
    assert summary["counts"]["unsupported_kind"] == 1

    # Only these three fields are expected in the surfaced record.
    expected_entry = {
        "unit_index": 1,
        "source_section_ids": ["MOCK_S1"],
        "apply_status": unsupported_status,
    }
    assert summary["unsupported_kind_records"] == [expected_entry]


# ─── Case 4 : gather error → status='error' (highest priority) ──────


def test_gather_error_marks_status_error_with_records():
    """A record carrying ``error`` drives the summary to status='error'.

    ``record['error']`` being set means
    ``gather_step12_ai_repair_proposals`` caught a router exception
    (AI call / validator). 'error' is the highest-priority failure axis.

    IMP-92 u3 — on a non-Anthropic exception path ``api_error_kind``
    stays ``None``; the summary keeps ``None`` on the record and leaves
    every operational kind bucket at zero.
    """
    error_text = "ValueError: missing slot 'title'"
    failed_record = _record(
        unit_index=2,
        error=error_text,
        source_section_ids=["MOCK_S2"],
    )

    summary = _summarize_ai_repair_status([failed_record], _OK_COVERAGE)

    assert summary["status"] == "error"
    assert summary["human_review_required"] is True
    assert summary["counts"]["error"] == 1

    expected_entry = {
        "unit_index": 2,
        "source_section_ids": ["MOCK_S2"],
        "error": error_text,
        "api_error_kind": None,
    }
    assert summary["error_records"] == [expected_entry]

    untouched_buckets = {"quota": 0, "billing": 0, "auth": 0, "other": 0}
    assert summary["api_error_kinds"] == untouched_buckets


# ─── IMP-92 u3 : api_error_kind propagation + aggregation ───────────


def test_api_error_kind_quota_propagates_to_summary_and_record():
    """A 'quota' error kind shows up both per-record and in the buckets.

    Step 12 (u2) stamps ``api_error_kind='quota'`` on a 429 Anthropic
    exception path. u8 must surface that kind on the error record and
    increment only the ``quota`` bucket of ``api_error_kinds``.
    """
    rate_limited = _record(
        unit_index=3,
        error="RateLimitError: 429",
        source_section_ids=["MOCK_S3"],
        api_error_kind="quota",
    )

    summary = _summarize_ai_repair_status([rate_limited], _OK_COVERAGE)

    assert summary["status"] == "error"
    assert summary["human_review_required"] is True

    expected_entry = {
        "unit_index": 3,
        "source_section_ids": ["MOCK_S3"],
        "error": "RateLimitError: 429",
        "api_error_kind": "quota",
    }
    assert summary["error_records"] == [expected_entry]

    expected_buckets = {"quota": 1, "billing": 0, "auth": 0, "other": 0}
    assert summary["api_error_kinds"] == expected_buckets


def test_api_error_kinds_aggregate_across_all_operational_axes():
    """A mixed batch counts each operational kind exactly once.

    One record per kind (quota / billing / auth / other) is fed in. The
    aggregation must report 1 for every bucket, and the per-record kinds
    must come back intact and in input order.
    """
    # (error text, api_error_kind) pairs; list position doubles as unit_index.
    axis_cases = [
        ("RateLimitError", "quota"),
        ("PermissionDeniedError", "billing"),
        ("AuthenticationError", "auth"),
        ("BadRequestError", "other"),
    ]
    records = [
        _record(unit_index=position, error=error_text, api_error_kind=kind)
        for position, (error_text, kind) in enumerate(axis_cases)
    ]

    summary = _summarize_ai_repair_status(records, _OK_COVERAGE)

    assert summary["status"] == "error"
    assert summary["counts"]["error"] == 4
    assert summary["api_error_kinds"] == {
        "quota": 1,
        "billing": 1,
        "auth": 1,
        "other": 1,
    }

    surfaced_kinds = [entry["api_error_kind"] for entry in summary["error_records"]]
    assert surfaced_kinds == ["quota", "billing", "auth", "other"]


# ─── Case 5 : coverage violated → status='coverage_violated' ────────


def test_coverage_violation_surfaces_dropped_sections():
    """A violated coverage invariant outranks an otherwise applied repair.

    u7 coverage_invariant 'violated' means the AI repair dropped a
    section_id from the post-AI superset. Dropping is covered by an
    absolute rule, so it must surface as human_review_required together
    with the dropped section ids.
    """
    applied_record = _record(apply_status="applied:partial_overrides")

    summary = _summarize_ai_repair_status([applied_record], _VIOLATED_COVERAGE)

    assert summary["status"] == "coverage_violated"
    assert summary["human_review_required"] is True
    assert summary["coverage_status"] == "violated"
    assert summary["dropped_section_ids"] == ["MOCK_S2"]


# ─── Case 6 : priority order — error > coverage > unsupported ───────


def test_error_dominates_over_coverage_and_unsupported():
    """With several failure axes present, 'error' wins the status.

    Priority order is
    error > coverage_violated > unsupported_kind > applied > ok.
    The batch mixes an errored, an unsupported-kind and an applied
    record under a violated coverage invariant; each record must still
    be tallied in its own counter.
    """
    errored = _record(unit_index=0, error="RuntimeError")
    unsupported = _record(
        unit_index=1,
        apply_status="unsupported_kind_for_reject_route:slot_mapping_proposal",
    )
    applied = _record(unit_index=2, apply_status="applied:partial_overrides")

    summary = _summarize_ai_repair_status(
        [errored, unsupported, applied],
        _VIOLATED_COVERAGE,
    )

    assert summary["status"] == "error"
    assert summary["human_review_required"] is True

    tallies = summary["counts"]
    assert tallies["error"] == 1
    assert tallies["unsupported_kind"] == 1
    assert tallies["applied"] == 1


# ─── Case 7 : no_proposal + no_zone_match counted, not failure ──────


def test_no_proposal_and_no_zone_match_do_not_trigger_human_review():
    """Structural skips are counted but never treated as failures.

    Flag-off short-circuit, not_provisional, route_not_ai_adaptation and
    B4-mismatch (no_zone_match) are structural skips rather than AI
    failures: they show up in the counters while status stays 'ok' and
    human_review_required stays False.
    """
    skipped_records = [
        _record(unit_index=position, apply_status=skip_status)
        for position, skip_status in enumerate(("no_proposal", "no_zone_match"))
    ]

    summary = _summarize_ai_repair_status(skipped_records, _OK_COVERAGE)

    assert summary["status"] == "ok"
    assert summary["human_review_required"] is False

    tallies = summary["counts"]
    assert tallies["no_proposal"] == 1
    assert tallies["no_zone_match"] == 1