diff --git a/.github/workflows/multi-mdx-regression.yml b/.github/workflows/multi-mdx-regression.yml new file mode 100644 index 0000000..1213ab1 --- /dev/null +++ b/.github/workflows/multi-mdx-regression.yml @@ -0,0 +1,71 @@ +name: Multi-MDX Regression (IMP-91) + +# IMP-#91 u13 — auto-gate the mdx 01-05 acceptance set on every push to main +# and on PRs targeting main. Failure of any integration test blocks the +# commit. JSON report is emitted via pytest-json-report (u12 dep) and +# uploaded as an artifact for u14/u15 status-board updater consumption. +# +# [[feedback_validation_first_for_closed_issues]] — fresh subprocess per CI run. +# [[feedback_auto_pipeline_first]] — no manual review queue; deterministic gate. + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + multi-mdx-regression: + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install Chrome and ChromeDriver + uses: browser-actions/setup-chrome@v1 + with: + install-chromedriver: true + + - name: Install project (dev extras + selenium) + run: | + python -m pip install --upgrade pip + python -m pip install -e ".[dev]" + python -m pip install "selenium>=4.20" + + - name: Run multi-mdx regression tests + run: | + python -m pytest -q -m integration \ + tests/integration/test_multi_mdx_regression.py \ + --json-report \ + --json-report-file=imp91-report.json \ + --json-report-omit keywords streams + + - name: Upload pytest JSON report + if: always() + uses: actions/upload-artifact@v4 + with: + name: imp91-multi-mdx-report + path: imp91-report.json + if-no-files-found: warn + + - name: Update status-board markers (IMP-91 u15) + if: always() + run: | + python scripts/update_status_board.py \ + --report imp91-report.json \ + --board docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md + + - name: Upload updated status 
board + if: always() + uses: actions/upload-artifact@v4 + with: + name: imp91-status-board + path: docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md + if-no-files-found: warn diff --git a/docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md b/docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md index 4835c21..22aab12 100644 --- a/docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md +++ b/docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md @@ -167,6 +167,21 @@ Step 0 (사전 준비) 의 Figma → HTML 변환은 *precondition phase 의 작 --- +## 7. Multi-MDX regression markers (IMP-91) + +> CI workflow `.github/workflows/multi-mdx-regression.yml` rewrites these via `scripts/update_status_board.py` after each push / PR. Initial value `?` = not yet observed. `PASS` / `FAIL` / `ERR` / `SKIP` = last CI run outcome per axis × mdx. Untouched markers remain `?` so collection failures are loud, not silent. + +| axis | mdx 01 | mdx 02 | mdx 03 | mdx 04 | mdx 05 | +|---|---|---|---|---|---| +| F0 normalize | ? | ? | ? | ? | ? | +| F1 V4 ranking | ? | ? | ? | ? | ? | +| F2 slot_payload | ? | ? | ? | ? | ? | +| F3 classifier-only AI | ? | ? | ? | ? | ? | +| F4 layout | ? | ? | ? | ? | ? | +| F5 final.html | ? | ? | ? | ? | ? | + +--- + ## 사용 방법 - 새 작업 들어오면 → 본 board 의 *어느 step* 의 status 를 바꾸는 작업인지 식별 diff --git a/pyproject.toml b/pyproject.toml index ddf1ff8..494a5cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ dev = [ "pytest>=8.0", "pytest-asyncio>=0.24", + "pytest-json-report>=1.5", "ruff>=0.8", ] diff --git a/scripts/update_status_board.py b/scripts/update_status_board.py new file mode 100644 index 0000000..cc2bc69 --- /dev/null +++ b/scripts/update_status_board.py @@ -0,0 +1,75 @@ +"""IMP-#91 u14 — idempotent status-board marker updater. + +Reads a pytest-json-report artifact emitted by the IMP-91 CI workflow and +rewrites paired ``...`` markers +inside the Phase Z status board with a single-character outcome symbol. 
+ +Pure functions (``parse_outcomes`` / ``update_board_text``) are exposed so +``tests/scripts/test_update_status_board.py`` can exercise the contract +without invoking pytest. The CLI just wires file IO around them so the +GitHub Actions step in u15 can call it deterministically. The updater is +additive: untouched markers stay; missing outcomes render ``?`` so a +collection failure is loud, not silent. [[feedback_auto_pipeline_first]] +[[feedback_artifact_status_naming]] +""" +from __future__ import annotations + +import argparse +import json +import re +from pathlib import Path +from typing import Dict, Mapping, Tuple + +AXIS_FROM_TEST = { + "test_normalize_snapshot_matches": "F0", + "test_v4_ranking_snapshot_matches": "F1", + "test_slot_payload_snapshot_matches": "F2", + "test_ai_classifier_snapshot_matches": "F3", + "test_layout_snapshot_matches": "F4", + "test_final_html_snapshot_matches": "F5", +} +SYMBOL = {"passed": "PASS", "failed": "FAIL", "error": "ERR", "skipped": "SKIP"} +NODEID_RE = re.compile(r"::(test_[a-z0-9_]+)\[(\d{2})\]$") +MARKER_RE = re.compile( + r"()(.*?)()", re.DOTALL +) + + +def parse_outcomes(report: Mapping[str, object]) -> Dict[Tuple[str, str], str]: + out: Dict[Tuple[str, str], str] = {} + for test in report.get("tests", []) or []: + m = NODEID_RE.search(str(test.get("nodeid", ""))) + if not m: + continue + axis = AXIS_FROM_TEST.get(m.group(1)) + if not axis: + continue + out[(axis, m.group(2))] = SYMBOL.get(str(test.get("outcome")), "?") + return out + + +def update_board_text(board: str, outcomes: Mapping[Tuple[str, str], str]) -> str: + def repl(match: "re.Match[str]") -> str: + key = (match.group(2), match.group(3)) + symbol = outcomes.get(key, "?") + return f"{match.group(1)}{symbol}{match.group(5)}" + + return MARKER_RE.sub(repl, board) + + +def main() -> int: + parser = argparse.ArgumentParser(description="IMP-91 status-board updater") + parser.add_argument("--report", required=True, type=Path) + parser.add_argument("--board", 
required=True, type=Path) + args = parser.parse_args() + report = json.loads(args.report.read_text(encoding="utf-8")) + outcomes = parse_outcomes(report) + args.board.write_text( + update_board_text(args.board.read_text(encoding="utf-8"), outcomes), + encoding="utf-8", + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/__snapshots__/ai_classifier.json b/tests/integration/__snapshots__/ai_classifier.json new file mode 100644 index 0000000..17434ea --- /dev/null +++ b/tests/integration/__snapshots__/ai_classifier.json @@ -0,0 +1,73 @@ +{ + "_doc": "IMP-91 u9 — F3 classifier-only AI axis. Pin observed step12 per-unit classifier label / route_hint / AI-isolation flags + coverage_invariant + step15 fit_classification + step16 router_active + step18 failure_type. Default-OFF AI invariant ([[feedback_ai_isolation_contract]]): ai_called MUST be False for every unit unless AI_FALLBACK_ENABLED is flipped via .env (not via pipeline default). 
If any unit flips ai_called=True silently, this snapshot fails loudly per [[feedback_demo_env_toggle_policy]].", + "01": { + "units": [ + {"source_section_ids": ["01-2"], "label": "use_as_is", "route_hint": "direct_render", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"}, + {"source_section_ids": ["01-1"], "label": "use_as_is", "route_hint": "direct_render", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"} + ], + "coverage_invariant_status": "ok", + "fit_visual_check_passed": true, + "fit_classifications_count": 0, + "fit_categories_seen": [], + "router_active": false, + "router_routed_count": 0, + "router_v4_fallback_used_count": 0, + "failure_type": "not_attempted" + }, + "02": { + "units": [ + {"source_section_ids": ["02-1"], "label": "use_as_is", "route_hint": "direct_render", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"}, + {"source_section_ids": ["02-2-sub-1", "02-2-sub-2"], "label": "use_as_is", "route_hint": "direct_render", "provisional": true, "ai_called": false, "skip_reason": "route_not_ai_adaptation:direct_render", "apply_status": "no_proposal"} + ], + "coverage_invariant_status": "ok", + "fit_visual_check_passed": true, + "fit_classifications_count": 0, + "fit_categories_seen": [], + "router_active": false, + "router_routed_count": 0, + "router_v4_fallback_used_count": 0, + "failure_type": "not_attempted" + }, + "03": { + "units": [ + {"source_section_ids": ["03-1"], "label": "use_as_is", "route_hint": "direct_render", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"}, + {"source_section_ids": ["03-2"], "label": "use_as_is", "route_hint": "direct_render", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"} + ], + "coverage_invariant_status": "ok", + 
"fit_visual_check_passed": true, + "fit_classifications_count": 0, + "fit_categories_seen": [], + "router_active": false, + "router_routed_count": 0, + "router_v4_fallback_used_count": 0, + "failure_type": "not_attempted" + }, + "04": { + "units": [ + {"source_section_ids": ["04-2-sub-2"], "label": "light_edit", "route_hint": "deterministic_minor_adjustment", "provisional": false, "ai_called": false, "skip_reason": "not_provisional", "apply_status": "no_proposal"}, + {"source_section_ids": ["04-2-sub-1"], "label": "restructure", "route_hint": "ai_adaptation_required", "provisional": true, "ai_called": false, "skip_reason": "router_short_circuit", "apply_status": "no_proposal"}, + {"source_section_ids": ["04-1"], "label": "reject", "route_hint": "ai_adaptation_required", "provisional": true, "ai_called": false, "skip_reason": "router_short_circuit", "apply_status": "no_proposal"} + ], + "coverage_invariant_status": "ok", + "fit_visual_check_passed": true, + "fit_classifications_count": 0, + "fit_categories_seen": [], + "router_active": false, + "router_routed_count": 0, + "router_v4_fallback_used_count": 0, + "failure_type": "not_attempted" + }, + "05": { + "units": [ + {"source_section_ids": ["05-1", "05-2-sub-1", "05-2-sub-2"], "label": "empty_shell", "route_hint": null, "provisional": true, "ai_called": false, "skip_reason": "route_not_ai_adaptation:None", "apply_status": "no_proposal"} + ], + "coverage_invariant_status": "ok", + "fit_visual_check_passed": true, + "fit_classifications_count": 0, + "fit_categories_seen": [], + "router_active": false, + "router_routed_count": 0, + "router_v4_fallback_used_count": 0, + "failure_type": "not_attempted" + } +} diff --git a/tests/integration/__snapshots__/coverage.json b/tests/integration/__snapshots__/coverage.json new file mode 100644 index 0000000..dae34be --- /dev/null +++ b/tests/integration/__snapshots__/coverage.json @@ -0,0 +1,43 @@ +{ + "_doc": "IMP-#91 u5 — full_mdx_coverage / aligned_section_ids / 
covered_section_ids / filtered_section_ids snapshot pinned from observed step20_slide_status.json across MDX_SET (mdx 01-05). Drift = real change in coverage outcome; re-baseline only with conscious explanation in commit body.", + "01": { + "full_mdx_coverage": true, + "rendered": true, + "visual_check_passed": true, + "aligned_section_ids": ["01-1", "01-2"], + "covered_section_ids": ["01-1", "01-2"], + "filtered_section_ids": [] + }, + "02": { + "full_mdx_coverage": true, + "rendered": true, + "visual_check_passed": true, + "aligned_section_ids": ["02-1", "02-2-sub-1", "02-2-sub-2"], + "covered_section_ids": ["02-1", "02-2-sub-1", "02-2-sub-2"], + "filtered_section_ids": [] + }, + "03": { + "full_mdx_coverage": true, + "rendered": true, + "visual_check_passed": true, + "aligned_section_ids": ["03-1", "03-2"], + "covered_section_ids": ["03-1", "03-2"], + "filtered_section_ids": [] + }, + "04": { + "full_mdx_coverage": true, + "rendered": true, + "visual_check_passed": true, + "aligned_section_ids": ["04-1", "04-2-sub-1", "04-2-sub-2"], + "covered_section_ids": ["04-1", "04-2-sub-1", "04-2-sub-2"], + "filtered_section_ids": [] + }, + "05": { + "full_mdx_coverage": false, + "rendered": true, + "visual_check_passed": true, + "aligned_section_ids": ["05-1", "05-2-sub-1", "05-2-sub-2"], + "covered_section_ids": ["05-1", "05-2-sub-1", "05-2-sub-2"], + "filtered_section_ids": ["05-1", "05-2-sub-1", "05-2-sub-2"] + } +} diff --git a/tests/integration/__snapshots__/final_html.json b/tests/integration/__snapshots__/final_html.json new file mode 100644 index 0000000..0ce3442 --- /dev/null +++ b/tests/integration/__snapshots__/final_html.json @@ -0,0 +1,88 @@ +{ + "_doc": "IMP-91 u11 — F5 final.html extraction axis. 
Pin step13_render.json metadata (step_status / pipeline_path_connected / render_inputs.zones_count / render_inputs.layout_preset / slide_title|footer non-empty / final_html_size_bytes) AND structural markers extracted from the on-disk final.html (HTML , slide root count, slide-footer presence, data-zone-position/data-template-id topology). The HTML-extracted zone topology MUST match the step12 slot_payload (position, template_id) sequence already pinned in slot_payload.json (u8) — Jinja2 renders from step12, not step09, so step12 is the correct upstream parity source (step09 selection vs step12 __empty__ collapse is intentional per IMP-87 honesty gate and surfaces in u8). Drift between final.html and slot_payload = render pipeline disconnect. on-disk final.html size_bytes MUST equal step13's reported final_html_size_bytes (byte parity = no truncation / no double-write race).", + "01": { + "step13_status": "done", + "step13_pipeline_path_connected": true, + "render_inputs_zones_count": 2, + "render_inputs_layout_preset": "horizontal-2", + "render_inputs_slide_title_nonempty": true, + "render_inputs_slide_footer_nonempty": true, + "html_title_matches_render_input": true, + "html_slide_root_count": 1, + "html_slide_footer_present": true, + "html_zone_count": 2, + "html_zone_topology": [ + {"position": "top", "template_id": "bim_dx_comparison_table"}, + {"position": "bottom", "template_id": "construction_bim_three_usage"} + ], + "final_html_size_matches_step13_reported": true + }, + "02": { + "step13_status": "done", + "step13_pipeline_path_connected": true, + "render_inputs_zones_count": 2, + "render_inputs_layout_preset": "horizontal-2", + "render_inputs_slide_title_nonempty": true, + "render_inputs_slide_footer_nonempty": true, + "html_title_matches_render_input": true, + "html_slide_root_count": 1, + "html_slide_footer_present": true, + "html_zone_count": 2, + "html_zone_topology": [ + {"position": "top", "template_id": 
"construction_goals_three_circle_intersection"}, + {"position": "bottom", "template_id": "__empty__"} + ], + "final_html_size_matches_step13_reported": true + }, + "03": { + "step13_status": "done", + "step13_pipeline_path_connected": true, + "render_inputs_zones_count": 2, + "render_inputs_layout_preset": "vertical-2", + "render_inputs_slide_title_nonempty": true, + "render_inputs_slide_footer_nonempty": true, + "html_title_matches_render_input": true, + "html_slide_root_count": 1, + "html_slide_footer_present": true, + "html_zone_count": 2, + "html_zone_topology": [ + {"position": "left", "template_id": "three_parallel_requirements"}, + {"position": "right", "template_id": "process_product_two_way"} + ], + "final_html_size_matches_step13_reported": true + }, + "04": { + "step13_status": "done", + "step13_pipeline_path_connected": true, + "render_inputs_zones_count": 3, + "render_inputs_layout_preset": "top-1-bottom-2", + "render_inputs_slide_title_nonempty": true, + "render_inputs_slide_footer_nonempty": true, + "html_title_matches_render_input": true, + "html_slide_root_count": 1, + "html_slide_footer_present": true, + "html_zone_count": 3, + "html_zone_topology": [ + {"position": "top", "template_id": "bim_issues_quadrant_four"}, + {"position": "bottom-left", "template_id": "__empty__"}, + {"position": "bottom-right", "template_id": "__empty__"} + ], + "final_html_size_matches_step13_reported": true + }, + "05": { + "step13_status": "done", + "step13_pipeline_path_connected": true, + "render_inputs_zones_count": 1, + "render_inputs_layout_preset": "single", + "render_inputs_slide_title_nonempty": true, + "render_inputs_slide_footer_nonempty": true, + "html_title_matches_render_input": true, + "html_slide_root_count": 1, + "html_slide_footer_present": true, + "html_zone_count": 1, + "html_zone_topology": [ + {"position": "primary", "template_id": "__empty__"} + ], + "final_html_size_matches_step13_reported": true + } +} diff --git 
a/tests/integration/__snapshots__/layout.json b/tests/integration/__snapshots__/layout.json new file mode 100644 index 0000000..7117cb5 --- /dev/null +++ b/tests/integration/__snapshots__/layout.json @@ -0,0 +1,133 @@ +{ + "_doc": "IMP-91 u10 — F4 layout snapshot (step07 + step08). Pins observed layout decision axes (preset / candidates / override / computation / dynamic flags) + planning geometry (heights_px / widths_px / ratios / col_ratios) + per-zone planning shape (position / min_height_px / frame_cardinality_strict / sub_zones_count / region_layout_candidates). step_status='partial' = schema-lock marker per Step 7/8 note (region-level ratio + count-based v0 marker stays a marker, never silently flipped). layout_override_applied=True ONLY for mdx 03 (project_mdx03_frame_lock 2026-05-15 user lock — axis A vertical-2 override). Source: src/phase_z2_pipeline.py step07/step08 emit; auto_layout_preset=None for mdx 05 single-preset path. drift in heights_px/ratios = content_weight_distribution shift; drift in computation = decision-path swap (regression signal axis distinct from preset).", + "01": { + "step7_step_status": "partial", + "step7_pipeline_path_connected": true, + "layout_preset": "horizontal-2", + "auto_layout_preset": "horizontal-2", + "layout_override_applied": false, + "zones_count": 2, + "unit_count": 2, + "layout_candidates": ["horizontal-2", "vertical-2"], + "computation": "min_height_first + content_weight_distribution", + "dynamic_rows": true, + "dynamic_cols": false, + "heights_px": [299, 272], + "widths_px": [1180], + "ratios": [0.511, 0.465], + "width_ratios": [1.0], + "step8_step_status": "partial", + "step8_pipeline_path_connected": true, + "zone_heights_px_planned": [299, 272], + "zone_widths_px_planned": [1180], + "zone_col_ratios_planned": [1.0], + "per_zone_layout_shape": [ + {"position": "top", "min_height_px": 350, "frame_cardinality_strict": 2, "sub_zones_count": 3, "region_layout_candidates": ["region-single"]}, + {"position": 
"bottom", "min_height_px": 320, "frame_cardinality_strict": 3, "sub_zones_count": 3, "region_layout_candidates": ["region-single"]} + ] + }, + "02": { + "step7_step_status": "partial", + "step7_pipeline_path_connected": true, + "layout_preset": "horizontal-2", + "auto_layout_preset": "horizontal-2", + "layout_override_applied": false, + "zones_count": 2, + "unit_count": 2, + "layout_candidates": ["horizontal-2", "vertical-2"], + "computation": "min_height_first + content_weight_distribution", + "dynamic_rows": true, + "dynamic_cols": false, + "heights_px": [273, 298], + "widths_px": [1180], + "ratios": [0.467, 0.509], + "width_ratios": [1.0], + "step8_step_status": "partial", + "step8_pipeline_path_connected": true, + "zone_heights_px_planned": [273, 298], + "zone_widths_px_planned": [1180], + "zone_col_ratios_planned": [1.0], + "per_zone_layout_shape": [ + {"position": "top", "min_height_px": 320, "frame_cardinality_strict": 3, "sub_zones_count": 4, "region_layout_candidates": ["region-single"]}, + {"position": "bottom", "min_height_px": 350, "frame_cardinality_strict": 3, "sub_zones_count": 3, "region_layout_candidates": ["region-single"]} + ] + }, + "03": { + "step7_step_status": "partial", + "step7_pipeline_path_connected": true, + "layout_preset": "vertical-2", + "auto_layout_preset": "horizontal-2", + "layout_override_applied": true, + "zones_count": 2, + "unit_count": 2, + "layout_candidates": ["horizontal-2", "vertical-2"], + "computation": "user_override_geometry", + "dynamic_rows": false, + "dynamic_cols": true, + "heights_px": [585], + "widths_px": [408, 758], + "ratios": [1.0], + "width_ratios": [0.35, 0.65], + "step8_step_status": "partial", + "step8_pipeline_path_connected": true, + "zone_heights_px_planned": [585], + "zone_widths_px_planned": [408, 758], + "zone_col_ratios_planned": [0.35, 0.65], + "per_zone_layout_shape": [ + {"position": "left", "min_height_px": 230, "frame_cardinality_strict": 3, "sub_zones_count": 3, "region_layout_candidates": 
["region-single"]}, + {"position": "right", "min_height_px": 345, "frame_cardinality_strict": 2, "sub_zones_count": 2, "region_layout_candidates": ["region-single"]} + ] + }, + "04": { + "step7_step_status": "partial", + "step7_pipeline_path_connected": true, + "layout_preset": "top-1-bottom-2", + "auto_layout_preset": "top-1-bottom-2", + "layout_override_applied": false, + "zones_count": 3, + "unit_count": 3, + "layout_candidates": ["top-1-bottom-2", "top-2-bottom-1", "left-1-right-2", "left-2-right-1"], + "computation": "2d_dynamic_aggregated", + "dynamic_rows": true, + "dynamic_cols": true, + "heights_px": [221, 350], + "widths_px": [583, 583], + "ratios": [0.378, 0.598], + "width_ratios": [0.494, 0.494], + "step8_step_status": "partial", + "step8_pipeline_path_connected": true, + "zone_heights_px_planned": [221, 350], + "zone_widths_px_planned": [583, 583], + "zone_col_ratios_planned": [0.494, 0.494], + "per_zone_layout_shape": [ + {"position": "top", "min_height_px": null, "frame_cardinality_strict": null, "sub_zones_count": 4, "region_layout_candidates": ["region-single"]}, + {"position": "bottom-left", "min_height_px": 350, "frame_cardinality_strict": 4, "sub_zones_count": 5, "region_layout_candidates": ["region-single"]}, + {"position": "bottom-right", "min_height_px": 350, "frame_cardinality_strict": null, "sub_zones_count": 1, "region_layout_candidates": ["region-single"]} + ] + }, + "05": { + "step7_step_status": "partial", + "step7_pipeline_path_connected": true, + "layout_preset": "single", + "auto_layout_preset": null, + "layout_override_applied": false, + "zones_count": 1, + "unit_count": 1, + "layout_candidates": ["single"], + "computation": "fr_default_from_preset", + "dynamic_rows": false, + "dynamic_cols": false, + "heights_px": [585], + "widths_px": [1180], + "ratios": [1.0], + "width_ratios": [1.0], + "step8_step_status": "partial", + "step8_pipeline_path_connected": true, + "zone_heights_px_planned": [585], + "zone_widths_px_planned": [1180], 
+ "zone_col_ratios_planned": [1.0], + "per_zone_layout_shape": [ + {"position": "primary", "min_height_px": null, "frame_cardinality_strict": null, "sub_zones_count": 0, "region_layout_candidates": ["region-single"]} + ] + } +} diff --git a/tests/integration/__snapshots__/normalize.json b/tests/integration/__snapshots__/normalize.json new file mode 100644 index 0000000..170c235 --- /dev/null +++ b/tests/integration/__snapshots__/normalize.json @@ -0,0 +1,83 @@ +{ + "_doc": "IMP-91 u6 — F0 normalize axis snapshot (step02_normalized.json). Pins observed current state per [[feedback_validation_first_for_closed_issues]] / Stage 1 'do not invent a new expectation'. step_status='partial' is the schema-lock marker for IMP-02/03 (orphans + details detection unimplemented). adapter_enabled/used=false reflects default-OFF canary (chained adapter trace OFF). asset counts are step02 collection state (popups/images/tables list aggregation in stage0_normalized_assets); they may grow when IMP-03 detection lands and the snapshot will drift loudly.", + "01": { + "step_num": 2, + "step_status": "partial", + "pipeline_path_connected": true, + "sections_count": 2, + "section_ids": ["01-1", "01-2"], + "orphans_count": 0, + "details_count": 0, + "adapter_enabled": false, + "adapter_used": false, + "assets_popups_count": 0, + "assets_images_count": 0, + "assets_tables_count": 0, + "slide_title_nonempty": true, + "slide_footer_nonempty": true + }, + "02": { + "step_num": 2, + "step_status": "partial", + "pipeline_path_connected": true, + "sections_count": 2, + "section_ids": ["02-1", "02-2"], + "orphans_count": 0, + "details_count": 0, + "adapter_enabled": false, + "adapter_used": false, + "assets_popups_count": 0, + "assets_images_count": 0, + "assets_tables_count": 0, + "slide_title_nonempty": true, + "slide_footer_nonempty": true + }, + "03": { + "step_num": 2, + "step_status": "partial", + "pipeline_path_connected": true, + "sections_count": 2, + "section_ids": ["03-1", "03-2"], + 
"orphans_count": 0, + "details_count": 0, + "adapter_enabled": false, + "adapter_used": false, + "assets_popups_count": 0, + "assets_images_count": 0, + "assets_tables_count": 0, + "slide_title_nonempty": true, + "slide_footer_nonempty": true + }, + "04": { + "step_num": 2, + "step_status": "partial", + "pipeline_path_connected": true, + "sections_count": 2, + "section_ids": ["04-1", "04-2"], + "orphans_count": 0, + "details_count": 0, + "adapter_enabled": false, + "adapter_used": false, + "assets_popups_count": 0, + "assets_images_count": 0, + "assets_tables_count": 0, + "slide_title_nonempty": true, + "slide_footer_nonempty": true + }, + "05": { + "step_num": 2, + "step_status": "partial", + "pipeline_path_connected": true, + "sections_count": 2, + "section_ids": ["05-1", "05-2"], + "orphans_count": 0, + "details_count": 0, + "adapter_enabled": false, + "adapter_used": false, + "assets_popups_count": 0, + "assets_images_count": 0, + "assets_tables_count": 0, + "slide_title_nonempty": true, + "slide_footer_nonempty": true + } +} diff --git a/tests/integration/__snapshots__/slot_payload.json b/tests/integration/__snapshots__/slot_payload.json new file mode 100644 index 0000000..4d7d048 --- /dev/null +++ b/tests/integration/__snapshots__/slot_payload.json @@ -0,0 +1,103 @@ +{ + "_doc": "IMP-#91 u8 — F2 slot_payload axis. Pins step12_slot_payload.json per_zone structural shape (position / template_id / builder / slot_names / list_slot_counts / dict_slot_sub_counts / string_slot_nonempty) for mdx 01-05. Pins SHAPE not literal content — text edits in MDX won't drift this snapshot, but builder swap / slot rename / missing slot / list-cardinality drift will. 
__empty__ zones have builder=null and zero slots.", + "01": [ + { + "position": "top", + "template_id": "bim_dx_comparison_table", + "builder": "compare_table_2col", + "slot_names": ["col_a_label", "col_b_label", "rows", "title"], + "list_slot_counts": {"rows": 2}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {"col_a_label": true, "col_b_label": true, "title": true} + }, + { + "position": "bottom", + "template_id": "construction_bim_three_usage", + "builder": "quadrant_flat_slots", + "slot_names": ["category_1_body", "category_1_label", "category_2_body", "category_2_label", "category_3_body", "category_3_label", "title"], + "list_slot_counts": {"category_1_body": 2, "category_2_body": 2, "category_3_body": 2}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {"category_1_label": true, "category_2_label": true, "category_3_label": true, "title": true} + } + ], + "02": [ + { + "position": "top", + "template_id": "construction_goals_three_circle_intersection", + "builder": "cycle_intersect_3", + "slot_names": ["circle_1_label", "circle_2_label", "circle_3_label", "intersection", "title"], + "list_slot_counts": {}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {"circle_1_label": true, "circle_2_label": true, "circle_3_label": true, "intersection": false, "title": true} + }, + { + "position": "bottom", + "template_id": "__empty__", + "builder": null, + "slot_names": [], + "list_slot_counts": {}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {} + } + ], + "03": [ + { + "position": "left", + "template_id": "three_parallel_requirements", + "builder": "items_with_role", + "slot_names": ["pillars", "title"], + "list_slot_counts": {"pillars": 3}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {"title": true} + }, + { + "position": "right", + "template_id": "process_product_two_way", + "builder": "process_product_pair", + "slot_names": ["banner_left", "banner_right", "process", "product", "title"], + "list_slot_counts": {}, + 
"dict_slot_sub_counts": {"process": {"sections": 3}, "product": {"sections": 3}}, + "string_slot_nonempty": {"banner_left": true, "banner_right": true, "title": true} + } + ], + "04": [ + { + "position": "top", + "template_id": "bim_issues_quadrant_four", + "builder": "quadrant_flat_slots", + "slot_names": ["quadrant_1_body", "quadrant_1_label", "quadrant_2_body", "quadrant_2_label", "quadrant_3_body", "quadrant_3_label", "quadrant_4_body", "quadrant_4_label", "title"], + "list_slot_counts": {"quadrant_1_body": 2, "quadrant_2_body": 2, "quadrant_3_body": 2, "quadrant_4_body": 2}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {"quadrant_1_label": true, "quadrant_2_label": true, "quadrant_3_label": true, "quadrant_4_label": true, "title": true} + }, + { + "position": "bottom-left", + "template_id": "__empty__", + "builder": null, + "slot_names": [], + "list_slot_counts": {}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {} + }, + { + "position": "bottom-right", + "template_id": "__empty__", + "builder": null, + "slot_names": [], + "list_slot_counts": {}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {} + } + ], + "05": [ + { + "position": "primary", + "template_id": "__empty__", + "builder": null, + "slot_names": [], + "list_slot_counts": {}, + "dict_slot_sub_counts": {}, + "string_slot_nonempty": {} + } + ] +} diff --git a/tests/integration/__snapshots__/structural.json b/tests/integration/__snapshots__/structural.json new file mode 100644 index 0000000..5e0a85f --- /dev/null +++ b/tests/integration/__snapshots__/structural.json @@ -0,0 +1,43 @@ +{ + "_doc": "IMP-#91 u3 structural snapshot — pins observed step20 overall + step09 per-zone selected_template_id per mdx in the 01-05 acceptance set. Each entry is fresh-run evidence (not aspirational). Update only when an intentional pipeline change moves the observed value; treat unexplained drift as regression. 
[[feedback_validation_first_for_closed_issues]] [[feedback_artifact_status_naming]]", + "01": { + "overall": "PASS", + "zone_count": 2, + "zones": [ + {"position": "top", "selected_template_id": "bim_dx_comparison_table"}, + {"position": "bottom", "selected_template_id": "construction_bim_three_usage"} + ] + }, + "02": { + "overall": "PASS", + "zone_count": 2, + "zones": [ + {"position": "top", "selected_template_id": "construction_goals_three_circle_intersection"}, + {"position": "bottom", "selected_template_id": "three_persona_benefits"} + ] + }, + "03": { + "overall": "PASS", + "zone_count": 2, + "zones": [ + {"position": "left", "selected_template_id": "three_parallel_requirements"}, + {"position": "right", "selected_template_id": "process_product_two_way"} + ] + }, + "04": { + "overall": "PASS", + "zone_count": 3, + "zones": [ + {"position": "top", "selected_template_id": "bim_issues_quadrant_four"}, + {"position": "bottom-left", "selected_template_id": "sw_dependency_four_problems"}, + {"position": "bottom-right", "selected_template_id": "pre_construction_model_info_stacked"} + ] + }, + "05": { + "overall": "EMPTY_SHELL_NO_CONTENT", + "zone_count": 1, + "zones": [ + {"position": "primary", "selected_template_id": "__empty__"} + ] + } +} diff --git a/tests/integration/__snapshots__/v4_ranking.json b/tests/integration/__snapshots__/v4_ranking.json new file mode 100644 index 0000000..bdd8f16 --- /dev/null +++ b/tests/integration/__snapshots__/v4_ranking.json @@ -0,0 +1,112 @@ +{ + "_doc": "IMP-91 u7 — F1 V4 ranking observed snapshot (step05_v4_evidence). Pins v4_source (POSIX-normalized), aligned_section_ids, and per-section {section_id, candidate_status, candidates: [{template_id, label, confidence}]}. confidence kept at current 4-decimal rounding. 
Sections appear in pipeline-emitted order.", + "01": { + "v4_source": "tests/matching/v4_full32_result.yaml", + "aligned_section_ids": ["01-1", "01-2"], + "sections": [ + { + "section_id": "01-1", + "candidate_status": "ok", + "candidates": [ + {"template_id": "construction_bim_three_usage", "label": "use_as_is", "confidence": 0.9101}, + {"template_id": "construction_goals_three_circle_intersection", "label": "light_edit", "confidence": 0.8261}, + {"template_id": "dx_sw_necessity_three_perspectives", "label": "light_edit", "confidence": 0.8168} + ] + }, + { + "section_id": "01-2", + "candidate_status": "ok", + "candidates": [ + {"template_id": "bim_dx_comparison_table", "label": "use_as_is", "confidence": 0.9459}, + {"template_id": "app_sw_package_vs_solution", "label": "restructure", "confidence": 0.6813} + ] + } + ] + }, + "02": { + "v4_source": "tests/matching/v4_full32_result.yaml", + "aligned_section_ids": ["02-1", "02-2-sub-1", "02-2-sub-2"], + "sections": [ + { + "section_id": "02-1", + "candidate_status": "ok", + "candidates": [ + {"template_id": "construction_goals_three_circle_intersection", "label": "use_as_is", "confidence": 0.914} + ] + }, + { + "section_id": "02-2-sub-1", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + }, + { + "section_id": "02-2-sub-2", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + } + ] + }, + "03": { + "v4_source": "tests/matching/v4_full32_result.yaml", + "aligned_section_ids": ["03-1", "03-2"], + "sections": [ + { + "section_id": "03-1", + "candidate_status": "ok", + "candidates": [ + {"template_id": "three_parallel_requirements", "label": "use_as_is", "confidence": 0.9268}, + {"template_id": "dx_sw_necessity_three_perspectives", "label": "light_edit", "confidence": 0.8413} + ] + }, + { + "section_id": "03-2", + "candidate_status": "ok", + "candidates": [ + {"template_id": "process_product_two_way", "label": "use_as_is", "confidence": 0.9198} + ] + } + ] + }, + "04": { + 
"v4_source": "tests/matching/v4_full32_result.yaml", + "aligned_section_ids": ["04-1", "04-2-sub-1", "04-2-sub-2"], + "sections": [ + { + "section_id": "04-1", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + }, + { + "section_id": "04-2-sub-1", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + }, + { + "section_id": "04-2-sub-2", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + } + ] + }, + "05": { + "v4_source": "tests/matching/v4_full32_result.yaml", + "aligned_section_ids": ["05-1", "05-2-sub-1", "05-2-sub-2"], + "sections": [ + { + "section_id": "05-1", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + }, + { + "section_id": "05-2-sub-1", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + }, + { + "section_id": "05-2-sub-2", + "candidate_status": "no_non_reject_v4_candidate", + "candidates": [] + } + ] + } +} diff --git a/tests/integration/__snapshots__/visual.json b/tests/integration/__snapshots__/visual.json new file mode 100644 index 0000000..c0a0139 --- /dev/null +++ b/tests/integration/__snapshots__/visual.json @@ -0,0 +1,48 @@ +{ + "_doc": "u4 — pin observed step14_visual_check overflow/clip per mdx 01-05. Fresh subprocess observation per [[feedback_validation_first_for_closed_issues]]; drift surfaces visual regression (overflow / clip) loudly per [[feedback_artifact_status_naming]] 3-axis honesty. 
Snapshot pinned to current-state, not to invented expectation (Stage 1 scope-lock).", + "01": { + "slide_overflowed": false, + "slide_body_overflowed": false, + "passed": true, + "zones": [ + {"position": "top", "template_id": "bim_dx_comparison_table", "overflowed": false, "clipped_inner_count": 0}, + {"position": "bottom", "template_id": "construction_bim_three_usage", "overflowed": false, "clipped_inner_count": 0} + ] + }, + "02": { + "slide_overflowed": false, + "slide_body_overflowed": false, + "passed": true, + "zones": [ + {"position": "top", "template_id": "construction_goals_three_circle_intersection", "overflowed": false, "clipped_inner_count": 0}, + {"position": "bottom", "template_id": "__empty__", "overflowed": false, "clipped_inner_count": 0} + ] + }, + "03": { + "slide_overflowed": false, + "slide_body_overflowed": false, + "passed": true, + "zones": [ + {"position": "left", "template_id": "three_parallel_requirements", "overflowed": false, "clipped_inner_count": 0}, + {"position": "right", "template_id": "process_product_two_way", "overflowed": false, "clipped_inner_count": 0} + ] + }, + "04": { + "slide_overflowed": false, + "slide_body_overflowed": false, + "passed": true, + "zones": [ + {"position": "top", "template_id": "bim_issues_quadrant_four", "overflowed": false, "clipped_inner_count": 0}, + {"position": "bottom-left", "template_id": "__empty__", "overflowed": false, "clipped_inner_count": 0}, + {"position": "bottom-right", "template_id": "__empty__", "overflowed": false, "clipped_inner_count": 0} + ] + }, + "05": { + "slide_overflowed": false, + "slide_body_overflowed": false, + "passed": true, + "zones": [ + {"position": "primary", "template_id": "__empty__", "overflowed": false, "clipped_inner_count": 0} + ] + } +} diff --git a/tests/integration/test_multi_mdx_regression.py b/tests/integration/test_multi_mdx_regression.py new file mode 100644 index 0000000..f20562e --- /dev/null +++ b/tests/integration/test_multi_mdx_regression.py @@ 
"""IMP-#91 u2 — multi-mdx regression CI scaffold (mdx 01-05 acceptance set).

Session-scoped subprocess cache that runs each MDX acceptance fixture
exactly once. u3-u11 extend this module with per-axis assertions
(structural / visual / coverage / F0-F5). u2 alone pins the cache
contract: each mdx in ``MDX_SET`` produces a run directory under
``data/runs/<run_id>/phase_z2/`` containing the step JSONs and
``final.html`` that downstream parametrized tests will read.

[[feedback_validation_first_for_closed_issues]] — fresh subprocess per
session, no frozen artifacts. [[feedback_artifact_status_naming]] — the
overall status (PASS / RENDERED_WITH_VISUAL_REGRESSION /
PARTIAL_COVERAGE / EMPTY_SHELL_NO_CONTENT) is asserted in u3-u5; u2
only pins the artifact-production contract.
"""
from __future__ import annotations

import json
import re
import subprocess
import sys
import uuid
from pathlib import Path
from typing import Dict, List, NamedTuple

import pytest

REPO_ROOT = Path(__file__).resolve().parents[2]
SAMPLES_DIR = REPO_ROOT / "samples" / "mdx_batch"
RUNS_DIR = REPO_ROOT / "data" / "runs"
SNAPSHOTS_DIR = Path(__file__).resolve().parent / "__snapshots__"
MDX_SET = ("01", "02", "03", "04", "05")


class PipelineRun(NamedTuple):
    """Captured outcome of one pipeline subprocess for a single mdx fixture."""

    mdx_id: str  # key from MDX_SET, e.g. "01"
    run_id: str  # unique id handed to the pipeline CLI as its second argument
    returncode: int  # subprocess exit status; recorded, not asserted by the fixture
    stdout: str  # captured stdout (text mode)
    stderr: str  # captured stderr (text mode); its tail is quoted in failure messages
    run_dir: Path  # RUNS_DIR / run_id / "phase_z2"; computed, not checked for existence


@pytest.fixture(scope="session")
def multi_mdx_runs() -> Dict[str, PipelineRun]:
    """Run the Phase Z pipeline once per mdx in ``MDX_SET`` (session-cached).

    Each fixture file ``SAMPLES_DIR/<mdx_id>.mdx`` is passed to
    ``python -m src.phase_z2_pipeline`` together with a fresh ``run_id``,
    using ``REPO_ROOT`` as the working directory. The exit status is only
    recorded on the ``PipelineRun``; judging the run is left to the tests
    that read its artifacts.

    Returns:
        Mapping of mdx id to the ``PipelineRun`` captured for it.

    Raises:
        subprocess.TimeoutExpired: if a single pipeline run exceeds the
            360-second timeout (not caught here).
    """
    cache: Dict[str, PipelineRun] = {}
    for mdx_id in MDX_SET:
        # Random 8-hex suffix makes the run_id, and therefore run_dir, unique.
        run_id = f"imp91_{mdx_id}_{uuid.uuid4().hex[:8]}"
        cp = subprocess.run(
            [
                sys.executable,
                "-m",
                "src.phase_z2_pipeline",
                str(SAMPLES_DIR / f"{mdx_id}.mdx"),
                run_id,
            ],
            capture_output=True,
            text=True,
            timeout=360,
            cwd=str(REPO_ROOT),
        )
        cache[mdx_id] = PipelineRun(
            mdx_id=mdx_id,
            run_id=run_id,
            returncode=cp.returncode,
            stdout=cp.stdout,
            stderr=cp.stderr,
            run_dir=RUNS_DIR / run_id / "phase_z2",
        )
    return cache


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_pipeline_run_produces_step20_status(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """Cache contract: every mdx subprocess produces step20_slide_status.json."""
    run = multi_mdx_runs[mdx_id]
    status_path = run.run_dir / "steps" / "step20_slide_status.json"
    # The message carries the return code and the last 800 chars of stderr so a
    # missing artifact can be diagnosed from the test report alone.
    assert status_path.is_file(), (
        f"{mdx_id}.mdx run {run.run_id} did not produce {status_path} "
        f"(returncode={run.returncode}); stderr tail: {run.stderr[-800:]}"
    )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_structural_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u3 — pin observed overall + per-zone selected_template_id against snapshot."""
    snapshot = json.loads((SNAPSHOTS_DIR / "structural.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    # ``overall`` comes from step20; the per-zone selection comes from step09.
    status = json.loads(
        (run.run_dir / "steps" / "step20_slide_status.json").read_text(encoding="utf-8")
    )["data"]
    frame_sel = json.loads(
        (run.run_dir / "steps" / "step09_frame_selection.json").read_text(encoding="utf-8")
    )["data"]
    zones = frame_sel.get("per_zone", [])
    # Project each zone onto the two keys the snapshot pins; list order is kept.
    actual_zones = [
        {"position": z.get("position"), "selected_template_id": z.get("selected_template_id")}
        for z in zones
    ]
    assert status.get("overall") == expected["overall"], (
        f"{mdx_id}.mdx overall drift: expected {expected['overall']!r}, "
        f"got {status.get('overall')!r}"
    )
    # Count is asserted before the full comparison so a zone-count change gets
    # its own, shorter failure message.
    assert len(actual_zones) == expected["zone_count"], (
        f"{mdx_id}.mdx zone_count drift: expected {expected['zone_count']}, "
        f"got {len(actual_zones)} (zones={actual_zones})"
    )
    assert actual_zones == expected["zones"], (
        f"{mdx_id}.mdx zone topology drift: expected {expected['zones']}, "
        f"got {actual_zones}"
    )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_visual_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u4 — pin observed step14 visual_check overflow/clip against snapshot."""
    snapshot = json.loads((SNAPSHOTS_DIR / "visual.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    visual = json.loads(
        (run.run_dir / "steps" / "step14_visual_check.json").read_text(encoding="utf-8")
    )["data"]
    slide_overflowed = visual.get("slide", {}).get("overflowed")
    slide_body_overflowed = visual.get("slide_body", {}).get("overflowed")
    visual_passed = visual.get("passed")
    # Only the number of clipped inner elements is pinned, not their identity;
    # a missing or null ``clipped_inner`` counts as 0.
    actual_zones = [
        {
            "position": z.get("position"),
            "template_id": z.get("template_id"),
            "overflowed": z.get("overflowed"),
            "clipped_inner_count": len(z.get("clipped_inner") or []),
        }
        for z in visual.get("zones", [])
    ]
    assert slide_overflowed == expected["slide_overflowed"], (
        f"{mdx_id}.mdx slide.overflowed drift: expected {expected['slide_overflowed']}, "
        f"got {slide_overflowed}"
    )
    assert slide_body_overflowed == expected["slide_body_overflowed"], (
        f"{mdx_id}.mdx slide_body.overflowed drift: expected {expected['slide_body_overflowed']}, "
        f"got {slide_body_overflowed}"
    )
    assert visual_passed == expected["passed"], (
        f"{mdx_id}.mdx visual_check.passed drift: expected {expected['passed']}, "
        f"got {visual_passed}"
    )
    assert actual_zones == expected["zones"], (
        f"{mdx_id}.mdx zone visual drift: expected {expected['zones']}, "
        f"got {actual_zones}"
    )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_coverage_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u5 — pin observed full_mdx_coverage + section_id parity against snapshot."""
    snapshot = json.loads((SNAPSHOTS_DIR / "coverage.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    status = json.loads(
        (run.run_dir / "steps" / "step20_slide_status.json").read_text(encoding="utf-8")
    )["data"]
    assert status.get("rendered") == expected["rendered"], (
        f"{mdx_id}.mdx rendered drift: expected {expected['rendered']}, "
        f"got {status.get('rendered')}"
    )
    assert status.get("visual_check_passed") == expected["visual_check_passed"], (
        f"{mdx_id}.mdx visual_check_passed drift: expected {expected['visual_check_passed']}, "
        f"got {status.get('visual_check_passed')}"
    )
    assert status.get("full_mdx_coverage") == expected["full_mdx_coverage"], (
        f"{mdx_id}.mdx full_mdx_coverage drift: expected {expected['full_mdx_coverage']}, "
        f"got {status.get('full_mdx_coverage')}"
    )
    # The three section-id lists are compared as sorted lists, i.e. order is
    # deliberately ignored here; a missing or null list is treated as empty.
    assert sorted(status.get("aligned_section_ids") or []) == sorted(expected["aligned_section_ids"]), (
        f"{mdx_id}.mdx aligned_section_ids drift: expected {expected['aligned_section_ids']}, "
        f"got {status.get('aligned_section_ids')}"
    )
    assert sorted(status.get("covered_section_ids") or []) == sorted(expected["covered_section_ids"]), (
        f"{mdx_id}.mdx covered_section_ids drift: expected {expected['covered_section_ids']}, "
        f"got {status.get('covered_section_ids')}"
    )
    assert sorted(status.get("filtered_section_ids") or []) == sorted(expected["filtered_section_ids"]), (
        f"{mdx_id}.mdx filtered_section_ids drift: expected {expected['filtered_section_ids']}, "
        f"got {status.get('filtered_section_ids')}"
    )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_normalize_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u6 — F0 normalize: pin observed step02_normalized shape per mdx."""
    snapshot = json.loads((SNAPSHOTS_DIR / "normalize.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    raw = json.loads(
        (run.run_dir / "steps" / "step02_normalized.json").read_text(encoding="utf-8")
    )
    d = raw["data"]
    # ``or {}`` also covers keys that are present but null.
    diag = d.get("stage0_adapter_diagnostics", {}) or {}
    assets = d.get("stage0_normalized_assets", {}) or {}
    actual = {
        "step_num": raw.get("step_num"),
        "step_status": raw.get("step_status"),
        "pipeline_path_connected": raw.get("pipeline_path_connected"),
        "sections_count": d.get("sections_count"),
        "section_ids": [s.get("section_id") for s in d.get("sections", [])],
        "orphans_count": len(d.get("orphans") or []),
        "details_count": len(d.get("details") or []),
        "adapter_enabled": diag.get("enabled"),
        "adapter_used": diag.get("used"),
        "assets_popups_count": len(assets.get("popups") or []),
        "assets_images_count": len(assets.get("images") or []),
        "assets_tables_count": len(assets.get("tables") or []),
        "slide_title_nonempty": bool(d.get("slide_title")),
        "slide_footer_nonempty": bool(d.get("slide_footer")),
    }
    # The snapshot drives the comparison: only keys it lists are checked, and a
    # snapshot key that is not derived above raises KeyError.
    for key, want in expected.items():
        got = actual[key]
        assert got == want, (
            f"{mdx_id}.mdx normalize.{key} drift: expected {want!r}, got {got!r}"
        )
    # Cross-check the actual list length against the pinned count, independent
    # of the ``sections_count`` value the step reports about itself.
    assert len(d.get("sections", [])) == expected["sections_count"], (
        f"{mdx_id}.mdx sections list length mismatch with sections_count: "
        f"sections_count={expected['sections_count']}, got len(sections)={len(d.get('sections', []))}"
    )
    for sect in d.get("sections", []):
        assert (sect.get("raw_content_length") or 0) > 0, (
            f"{mdx_id}.mdx section {sect.get('section_id')!r} has empty raw_content "
            f"(length={sect.get('raw_content_length')!r}) — normalize lost content"
        )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_v4_ranking_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u7 — F1 V4 ranking: pin observed step05_v4_evidence per mdx.

    Pins ``v4_source`` (POSIX-normalized for cross-platform stability),
    ``aligned_section_ids``, and per-section
    ``{section_id, candidate_status, candidates: [{template_id, label, confidence}]}``
    in pipeline-emitted order. Confidence stays at the current 4-decimal
    rounding emitted by the V4 yaml; drift on any axis fails loudly so a
    re-baseline is a conscious commit, not a silent shift.
    """
    snapshot = json.loads((SNAPSHOTS_DIR / "v4_ranking.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    raw = json.loads(
        (run.run_dir / "steps" / "step05_v4_evidence.json").read_text(encoding="utf-8")
    )
    data = raw["data"]
    # Backslashes become forward slashes so a Windows-style path compares equal
    # to the snapshot value.
    actual_v4_source = str(data.get("v4_source") or "").replace("\\", "/")
    actual_sections = [
        {
            "section_id": ev.get("section_id"),
            "candidate_status": ev.get("candidate_status"),
            "candidates": [
                {
                    "template_id": c.get("template_id"),
                    "label": c.get("label"),
                    "confidence": c.get("confidence"),
                }
                for c in (ev.get("v4_candidates") or [])
            ],
        }
        for ev in (data.get("evidence_per_section") or [])
    ]
    assert actual_v4_source == expected["v4_source"], (
        f"{mdx_id}.mdx v4_source drift: expected {expected['v4_source']!r}, "
        f"got {actual_v4_source!r}"
    )
    # Unlike the coverage test, this list comparison is order-sensitive.
    assert data.get("aligned_section_ids") == expected["aligned_section_ids"], (
        f"{mdx_id}.mdx aligned_section_ids drift: expected {expected['aligned_section_ids']}, "
        f"got {data.get('aligned_section_ids')}"
    )
    assert actual_sections == expected["sections"], (
        f"{mdx_id}.mdx V4 ranking drift: expected {expected['sections']}, "
        f"got {actual_sections}"
    )


def _slot_payload_zone_shape(zone: dict) -> dict:
    """Reduce a step12 per_zone entry to a content-agnostic structural shape.

    Pins builder + slot names + per-slot list cardinality + dict sub-list
    counts + string non-empty flags. MDX text edits don't drift this; a
    builder swap, slot rename, missing slot, or list-cardinality change
    does. For dict-valued slots, only the length of each list-valued
    sub-key is pinned — deeper field pinning would require a fresh
    u8'-axis snapshot.

    Slot values that are not a list, dict, or str contribute to
    ``slot_names`` only.
    """
    sp = zone.get("slot_payload") or {}
    # Sorted so the shape does not depend on key order in the payload.
    slot_names = sorted(sp.keys())
    list_slot_counts: dict = {}
    dict_slot_sub_counts: dict = {}
    string_slot_nonempty: dict = {}
    for name in slot_names:
        value = sp[name]
        if isinstance(value, list):
            list_slot_counts[name] = len(value)
        elif isinstance(value, dict):
            sub: dict = {}
            for sub_key, sub_val in value.items():
                if isinstance(sub_val, list):
                    sub[sub_key] = len(sub_val)
            # Recorded even when empty, i.e. when the dict has no list values.
            dict_slot_sub_counts[name] = sub
        elif isinstance(value, str):
            # Whitespace-only strings count as empty.
            string_slot_nonempty[name] = bool(value.strip())
    return {
        "position": zone.get("position"),
        "template_id": zone.get("template_id"),
        "builder": zone.get("builder"),
        "slot_names": slot_names,
        "list_slot_counts": list_slot_counts,
        "dict_slot_sub_counts": dict_slot_sub_counts,
        "string_slot_nonempty": string_slot_nonempty,
    }


# Keys projected from each step12_ai_repair ``per_unit`` entry; a key missing
# from an entry is projected as None.
_AI_UNIT_KEYS = (
    "source_section_ids", "label", "route_hint", "provisional",
    "ai_called", "skip_reason", "apply_status",
)


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_ai_classifier_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u9 — F3 classifier-only AI: pin step12/15/16/18 classifier signals.

    [[feedback_ai_isolation_contract]] / [[feedback_demo_env_toggle_policy]]
    central invariant: ``ai_called`` MUST stay False per unit by default;
    activation requires explicit .env toggle, never pipeline default.
    """
    snapshot = json.loads((SNAPSHOTS_DIR / "ai_classifier.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    steps = multi_mdx_runs[mdx_id].run_dir / "steps"
    ai = json.loads((steps / "step12_ai_repair.json").read_text(encoding="utf-8"))["data"]
    fit = json.loads((steps / "step15_fit_classification.json").read_text(encoding="utf-8"))["data"]
    router = json.loads((steps / "step16_router_decision.json").read_text(encoding="utf-8"))["data"]
    failure = json.loads((steps / "step18_failure_classification.json").read_text(encoding="utf-8"))["data"]
    units = [{k: u.get(k) for k in _AI_UNIT_KEYS} for u in (ai.get("per_unit") or [])]
    actual = {
        "units": units,
        "coverage_invariant_status": (ai.get("coverage_invariant") or {}).get("status"),
        "fit_visual_check_passed": fit.get("visual_check_passed"),
        "fit_classifications_count": len(fit.get("classifications") or []),
        "fit_categories_seen": fit.get("categories_seen") or [],
        "router_active": router.get("router_active"),
        "router_routed_count": router.get("routed_count"),
        "router_v4_fallback_used_count": (router.get("v4_fallback_summary") or {}).get("fallback_used_count"),
        "failure_type": failure.get("failure_type"),
    }
    # Snapshot-driven: only keys present in the snapshot entry are compared.
    for key, want in expected.items():
        assert actual[key] == want, (
            f"{mdx_id}.mdx ai_classifier.{key} drift: expected {want!r}, got {actual[key]!r}"
        )
    # Checked independently of the snapshot. ``is not False`` means a missing
    # ``ai_called`` (projected as None) is reported as a breach as well.
    breaches = [u for u in units if u["ai_called"] is not False]
    assert not breaches, (
        f"{mdx_id}.mdx F3 AI-isolation breach (ai_called must be False by default): {breaches}"
    )


def _layout_zone_shape(zone: dict) -> dict:
    """Reduce a step08 per_zone_plan entry to a content-agnostic F4 layout shape.

    ``sub_zones_planned`` is reduced to its length; a missing or null
    ``sub_zones_planned`` / ``region_layout_candidates`` is treated as empty.
    """
    sub_zones = zone.get("sub_zones_planned") or []
    return {
        "position": zone.get("position"),
        "min_height_px": zone.get("min_height_px"),
        "frame_cardinality_strict": zone.get("frame_cardinality_strict"),
        "sub_zones_count": len(sub_zones),
        "region_layout_candidates": zone.get("region_layout_candidates") or [],
    }


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_layout_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u10 — F4 layout: pin step07_layout + step08_zone_region_ratios per mdx.

    Pins the layout decision path (``layout_preset`` /
    ``auto_layout_preset`` / ``layout_override_applied`` /
    ``layout_candidates`` / ``computation``) + planning geometry
    (``heights_px`` / ``widths_px`` / ``ratios`` / ``width_ratios``) +
    per-zone planning shape (``position`` / ``min_height_px`` /
    ``frame_cardinality_strict`` / ``sub_zones_count`` /
    ``region_layout_candidates``). ``step_status='partial'`` is the
    Step 7/8 schema-lock marker (region-level ratio + count-based v0).
    mdx 03 is the only ``layout_override_applied=True`` case (vertical-2
    user override per project_mdx03_frame_lock 2026-05-15 lock); drift
    here flips F4 layer-A axis. mdx 04 ``top`` zone pins ``None`` for
    min_height_px + frame_cardinality_strict (no frame cardinality on
    the top zone — observed current state, not invented).
    """
    snapshot = json.loads((SNAPSHOTS_DIR / "layout.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    s7 = json.loads(
        (run.run_dir / "steps" / "step07_layout.json").read_text(encoding="utf-8")
    )
    s8 = json.loads(
        (run.run_dir / "steps" / "step08_zone_region_ratios.json").read_text(encoding="utf-8")
    )
    # Step-level fields are read from the envelope (s7/s8); the rest comes from
    # each step's ``data`` payload, tolerating a missing or null payload.
    d7 = s7.get("data") or {}
    d8 = s8.get("data") or {}
    css = d7.get("layout_css") or {}
    actual = {
        "step7_step_status": s7.get("step_status"),
        "step7_pipeline_path_connected": s7.get("pipeline_path_connected"),
        "layout_preset": d7.get("layout_preset"),
        "auto_layout_preset": d7.get("auto_layout_preset"),
        "layout_override_applied": d7.get("layout_override_applied"),
        "zones_count": d7.get("zones_count"),
        "unit_count": d7.get("unit_count"),
        "layout_candidates": d7.get("layout_candidates") or [],
        "computation": css.get("computation"),
        "dynamic_rows": css.get("dynamic_rows"),
        "dynamic_cols": css.get("dynamic_cols"),
        "heights_px": css.get("heights_px"),
        "widths_px": css.get("widths_px"),
        "ratios": css.get("ratios"),
        "width_ratios": css.get("width_ratios"),
        "step8_step_status": s8.get("step_status"),
        "step8_pipeline_path_connected": s8.get("pipeline_path_connected"),
        "zone_heights_px_planned": d8.get("zone_heights_px_planned"),
        "zone_widths_px_planned": d8.get("zone_widths_px_planned"),
        "zone_col_ratios_planned": d8.get("zone_col_ratios_planned"),
        "per_zone_layout_shape": [
            _layout_zone_shape(z) for z in (d8.get("per_zone_plan") or [])
        ],
    }
    # Snapshot-driven: a snapshot key that is not derived above raises KeyError.
    for key, want in expected.items():
        got = actual[key]
        assert got == want, (
            f"{mdx_id}.mdx layout.{key} drift: expected {want!r}, got {got!r}"
        )


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_slot_payload_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u8 — F2 slot_payload: pin observed step12_slot_payload per_zone shape per mdx.

    Snapshot pins content-agnostic structural shape (builder + slot
    names + list cardinality + dict sub-list counts + string non-empty
    flags), not literal payload text. MDX wording tweaks won't drift
    this; builder swap, slot rename, slot count drift, or __empty__
    transitions will. Empty zones must have ``builder is None`` and no
    slots — this is the IMP-87 empty_shell honesty contract surface for
    F2.
    """
    snapshot = json.loads((SNAPSHOTS_DIR / "slot_payload.json").read_text(encoding="utf-8"))
    expected = snapshot[mdx_id]
    run = multi_mdx_runs[mdx_id]
    raw = json.loads(
        (run.run_dir / "steps" / "step12_slot_payload.json").read_text(encoding="utf-8")
    )
    per_zone = raw["data"].get("per_zone") or []
    actual = [_slot_payload_zone_shape(z) for z in per_zone]
    # Lengths are asserted first, so the zip() below cannot silently drop zones.
    assert len(actual) == len(expected), (
        f"{mdx_id}.mdx step12 zone_count drift: expected {len(expected)}, "
        f"got {len(actual)} (positions={[z.get('position') for z in actual]})"
    )
    # Compared zone by zone so the failure names the first drifting zone.
    for idx, (act, exp) in enumerate(zip(actual, expected)):
        assert act == exp, (
            f"{mdx_id}.mdx step12 zone[{idx}] ({exp.get('position')!r}) shape drift: "
            f"expected {exp}, got {act}"
        )


# Matches an opening <div ...> tag that carries both markers and captures
# (zone position, template id). The pattern only matches when
# data-zone-position appears before data-template-id inside the same tag.
_ZONE_TAG_RE = re.compile(
    r'<div[^>]*\sdata-zone-position="([^"]+)"[^>]*\sdata-template-id="([^"]+)"',
    re.IGNORECASE,
)
# Case-sensitive; expects class="slide" immediately followed by data-page="1".
_SLIDE_ROOT_RE = re.compile(r'<div\s+class="slide"\s+data-page="1"')
# Captures the <title> text up to the next "<".
_TITLE_RE = re.compile(r'<title>([^<]*)', re.IGNORECASE)


def _extract_html_zone_topology(html: str) -> List[dict]:
    """Extract (position, template_id) pairs in document order from final.html.

    Returns one ``{"position", "template_id"}`` dict per ``_ZONE_TAG_RE``
    match; an input without matching tags yields an empty list.
    """
    return [
        {"position": m.group(1), "template_id": m.group(2)}
        for m in _ZONE_TAG_RE.finditer(html)
    ]


@pytest.mark.integration
@pytest.mark.parametrize("mdx_id", MDX_SET)
def test_final_html_snapshot_matches(
    mdx_id: str, multi_mdx_runs: Dict[str, PipelineRun]
) -> None:
    """u11 — F5 final.html extraction: pin step13_render metadata + on-disk HTML structure.
+ + Cross-snapshot parity gate: ``html_zone_topology`` (extracted from + final.html via ``data-zone-position`` / ``data-template-id`` markers) + MUST equal step12 slot_payload (u8) ``(position, template_id)`` + sequence — Jinja2 renders from step12, not step09, so this is the + correct upstream parity (step09 selection vs step12 ``__empty__`` + collapse is intentional per IMP-87 honesty gate and surfaces in u8). + Drift between final.html and slot_payload = render pipeline + disconnect. ``final.html`` on-disk size also MUST equal step13's + reported ``final_html_size_bytes`` — byte parity proves no + truncation / no double-write race. + """ + snapshot = json.loads((SNAPSHOTS_DIR / "final_html.json").read_text(encoding="utf-8")) + expected = snapshot[mdx_id] + run = multi_mdx_runs[mdx_id] + raw13 = json.loads( + (run.run_dir / "steps" / "step13_render.json").read_text(encoding="utf-8") + ) + d13 = raw13.get("data") or {} + ri = d13.get("render_inputs") or {} + final_path = run.run_dir / "final.html" + assert final_path.is_file(), f"{mdx_id}.mdx final.html missing at {final_path}" + html = final_path.read_text(encoding="utf-8") + title_match = _TITLE_RE.search(html) + html_title = title_match.group(1).strip() if title_match else "" + html_topology = _extract_html_zone_topology(html) + actual = { + "step13_status": raw13.get("step_status"), + "step13_pipeline_path_connected": raw13.get("pipeline_path_connected"), + "render_inputs_zones_count": ri.get("zones_count"), + "render_inputs_layout_preset": ri.get("layout_preset"), + "render_inputs_slide_title_nonempty": bool((ri.get("slide_title") or "").strip()), + "render_inputs_slide_footer_nonempty": bool((ri.get("slide_footer") or "").strip()), + "html_title_matches_render_input": html_title == (ri.get("slide_title") or "").strip(), + "html_slide_root_count": len(_SLIDE_ROOT_RE.findall(html)), + "html_slide_footer_present": '