feat(#72): IMP-43 u1~u8 --reuse-from incremental rerun (Step 0/1/2/5/6 reuse + Step 7+ re-execute)

u1 argparse --reuse-from PREV_RUN_ID + post-merge fail-closed guard (rejects layout/zone_geometry/zone_section/image override axes by name; only --override-frame is preserved). u2 src/phase_z2_reuse_snapshot.py — JSON-only Step 6 snapshot with mdx_sha256 integrity key and {value, source_path, upstream_step} provenance per axis (pickle forbidden per Stage 2 guardrail). u3 _write_reuse_snapshot at the Step 6 boundary; soft-fails to stderr without aborting the seed run. u4 prev_run_dir RO copy of step00/01/02/05/06 + _reuse_snapshot.json into new run_dir, state rehydration, reuse marker, frame-override application on restored units, Step 7+ resume. u4b fail-closed for missing prev_run_dir / missing/corrupt/invalid snapshot / mdx_sha256 mismatch / accidental new==prev write, with value+path+upstream diagnostics per axis. u5 reuse_from Optional[str] threaded through run_phase_z2_mvp1 signature and CLI dispatch; default None preserves byte-identical pre-IMP-43 behavior. u6 Front /api/run optional reuseFromRunId forwarding (vite.config.ts + designAgentApi.ts + run_pipeline_reuse_from.test.ts). u7a fast CI equivalence (1 mdx × 1 layout × 2 frames); step13 whitelist = run_id/timestamps/prev_run_id only. u7b 3 layouts × 3 mdx × 32 frames sweep gated by pytest.mark.sweep (registered in pyproject.toml; default CI must use -m 'not sweep'). u8 scripts/measure_reuse_savings.py argv-driven A/B/C harness with frame pin self-discovery + seed-time exclusion; status board §8 TBD anchor (issue-body 50-70% / 10-20s→3-8s claim explicitly unverified, not mirrored). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 22:44:27 +09:00
parent 8648a468d9
commit b4be6c1cd0
15 changed files with 5128 additions and 656 deletions
--- a/src/phase_z2_pipeline.py
+++ b/src/phase_z2_pipeline.py
--- a/src/phase_z2_reuse_snapshot.py
+++ b/src/phase_z2_reuse_snapshot.py
@@ -0,0 +1,301 @@
+"""IMP-43 (#72) u2 — Step 6 reuse snapshot schema (JSON-only).
+
+Stage 2 plan (locked) — ``--reuse-from PREV_RUN_ID`` reuses the
+Step 0 / 1 / 2 / 5 / 6 deterministic artifact subset plus the
+in-memory state that downstream steps need but that the existing
+``step02_normalized.json`` / ``step05_v4_evidence.json`` /
+``step06_composition_plan.json`` artifacts do not capture in a
+deserialize-ready form (e.g. ``CompositionUnit`` instances,
+``comp_debug``, ``v4_fallback_traces`` raw map, pre-override
+``layout_preset``). This module owns the schema for the additional
+``_reuse_snapshot.json`` sidecar written next to ``step06_composition_plan.json``.
+
+Scope (u2 only, Stage 2 unit split):
+    * Pure schema + serializers + validator. No file I/O.
+    * JSON-only — pickle is forbidden per Stage 2 guardrails.
+    * Provenance per top-level field: ``{value, source_path, upstream_step}``.
+    * ``mdx_sha256`` integrity key — ``--reuse-from`` must fail closed when
+      the prev run's MDX bytes don't match the current MDX bytes.
+    * ``schema_version`` — bumped on any non-additive shape change.
+
+Out of scope (deferred to later units):
+    * Writing the snapshot into the run_dir (u3).
+    * Copy / restore on ``--reuse-from`` (u4).
+    * Fail-closed snapshot/path errors at restore time (u4b).
+    * Threading ``reuse_from`` through ``run_phase_z2_mvp1`` (u5).
+"""
+from __future__ import annotations
+
+import json
+from typing import Any, Optional
+
+
+# Schema contract version written under the bare ``schema_version`` key;
+# ``validate_snapshot`` rejects a snapshot whose stored value differs.
+SNAPSHOT_VERSION = 1
+# File name of the sidecar described in the module docstring.
+SNAPSHOT_FILENAME = "_reuse_snapshot.json"
+
+
+# Keys every snapshot must carry at the top level (``validate_snapshot``
+# reports any that are absent, in this order). Two of them are bare
+# scalars with no provenance wrapper:
+#   - schema_version (contract key)
+#   - mdx_sha256 (integrity key)
+# Every other key holds a {value, source_path, upstream_step} wrapper.
+REQUIRED_TOP_LEVEL_KEYS: tuple[str, ...] = (
+    "schema_version",
+    "mdx_sha256",
+    "slide_title",
+    "slide_footer",
+    "sections",
+    "stage0_adapter_diagnostics",
+    "stage0_normalized_assets",
+    "v4_evidence",
+    "layout_preset_pre_override",
+    "units",
+    "comp_debug",
+    "v4_fallback_traces",
+    "ai_preflight",
+)
+
+# The bare (unwrapped) keys; ``validate_snapshot`` skips the wrapper-shape
+# check for exactly these names.
+_BARE_KEYS: frozenset[str] = frozenset({"schema_version", "mdx_sha256"})
+
+
+def _wrap(value: Any, *, source_path: str, upstream_step: str) -> dict[str, Any]:
+    """Attach provenance to ``value``.
+
+    Returns a new dict with the keys ``value``, ``source_path`` and
+    ``upstream_step`` (in that order). ``value`` is stored as-is, not
+    copied.
+    """
+    return dict(
+        value=value,
+        source_path=source_path,
+        upstream_step=upstream_step,
+    )
+
+
+def serialize_section(section: Any) -> dict[str, Any]:
+    """Turn an ``MdxSection``-like object into a plain JSON-ready dict.
+
+    The argument is duck-typed. ``section_id``, ``section_num``, ``title``
+    and ``raw_content`` are read directly, so an object lacking one of
+    them raises ``AttributeError``. ``heading_number``, ``v4_alias_keys``
+    and ``sub_sections`` are optional: a missing ``heading_number`` is
+    stored as ``None``, and the two sequence fields are copied into fresh
+    lists, with a missing or falsy value stored as ``[]``.
+    """
+    # Mandatory attributes first, in the order the snapshot lists them.
+    payload: dict[str, Any] = {
+        name: getattr(section, name)
+        for name in ("section_id", "section_num", "title", "raw_content")
+    }
+    payload["heading_number"] = getattr(section, "heading_number", None)
+    # Optional sequences: copy so the snapshot does not alias live state.
+    for name in ("v4_alias_keys", "sub_sections"):
+        items = getattr(section, name, [])
+        payload[name] = list(items) if items else []
+    return payload
+
+
+def serialize_unit(unit: Any) -> dict[str, Any]:
+    """Flatten a ``CompositionUnit``-like object into a JSON-ready dict.
+
+    Scalar attributes are copied through unchanged, except that
+    ``confidence`` / ``score`` are coerced with ``float`` and
+    ``auto_selectable`` / ``provisional`` with ``bool`` (``provisional``
+    defaults to ``False`` when the attribute is absent). ``rationale``,
+    ``filter_reasons`` and ``notes`` are shallow-copied, with a falsy
+    value stored as an empty container.
+
+    Each ``v4_candidates`` entry is duck-typed and reduced to six named
+    fields, so the snapshot does not depend on the candidate class's own
+    layout. ``v4_rank`` is one of those fields (``None`` when the
+    candidate lacks it): the rehydrated candidate on the reuse path must
+    expose the same rank the full-rerun path stamps, otherwise the
+    Step 9 payload and the Step 13 equivalence comparison (u7a) would
+    drift to ``None``.
+    """
+
+    def _candidate(match: Any) -> dict[str, Any]:
+        # One V4Match-shaped entry -> plain dict.
+        return {
+            "template_id": match.template_id,
+            "frame_id": match.frame_id,
+            "frame_number": match.frame_number,
+            "confidence": float(match.confidence),
+            "label": match.label,
+            "v4_rank": getattr(match, "v4_rank", None),
+        }
+
+    record: dict[str, Any] = {
+        "source_section_ids": list(unit.source_section_ids),
+    }
+    for attr in ("merge_type", "frame_template_id", "frame_id", "frame_number"):
+        record[attr] = getattr(unit, attr)
+    record["confidence"] = float(unit.confidence)
+    for attr in (
+        "label",
+        "phase_z_status",
+        "raw_content",
+        "title",
+        "v4_rank",
+        "selection_path",
+        "fallback_reason",
+    ):
+        record[attr] = getattr(unit, attr)
+    record["score"] = float(unit.score)
+    record["rationale"] = dict(unit.rationale or {})
+    record["auto_selectable"] = bool(unit.auto_selectable)
+    record["filter_reasons"] = list(unit.filter_reasons or [])
+    record["notes"] = list(unit.notes or [])
+    record["v4_candidates"] = [_candidate(c) for c in (unit.v4_candidates or [])]
+    record["provisional"] = bool(getattr(unit, "provisional", False))
+    return record
+
+
+def build_snapshot(
+    *,
+    mdx_sha256: str,
+    slide_title: Optional[str],
+    slide_footer: Optional[str],
+    sections: list,
+    stage0_adapter_diagnostics: Optional[dict],
+    stage0_normalized_assets: Optional[dict],
+    v4_evidence: list,
+    layout_preset_pre_override: Optional[str],
+    units: list,
+    comp_debug: Optional[dict],
+    v4_fallback_traces: Optional[dict],
+    ai_preflight: Optional[dict],
+) -> dict[str, Any]:
+    """Assemble the Step 6 reuse snapshot as a JSON-serializable dict.
+
+    ``schema_version`` (always ``SNAPSHOT_VERSION``) and ``mdx_sha256``
+    are stored bare. Every other argument lands under its own name as a
+    ``{value, source_path, upstream_step}`` wrapper built by ``_wrap``.
+    ``sections`` and ``units`` go through ``serialize_section`` /
+    ``serialize_unit``; the dict- and list-typed arguments are
+    shallow-copied, with ``None`` (or any falsy value) stored as an empty
+    container.
+
+    Raises:
+        TypeError: from the closing ``json.dumps`` probe when some value
+            is not JSON-serializable (set, Path, dataclass instance, ...),
+            so the problem surfaces at build time, not later at restore.
+    """
+
+    def tagged(value: Any, step: str, path: str) -> dict[str, Any]:
+        # Local shorthand so each entry reads as (value, step, path).
+        return _wrap(value, source_path=path, upstream_step=step)
+
+    # Bare contract / integrity keys come first.
+    snapshot: dict[str, Any] = {
+        "schema_version": SNAPSHOT_VERSION,
+        "mdx_sha256": mdx_sha256,
+    }
+
+    # Entries whose provenance is step02_normalized.json.
+    snapshot["slide_title"] = tagged(
+        slide_title, "step02", "steps/step02_normalized.json#/slide_title"
+    )
+    snapshot["slide_footer"] = tagged(
+        slide_footer, "step02", "steps/step02_normalized.json#/slide_footer"
+    )
+    snapshot["sections"] = tagged(
+        [serialize_section(s) for s in sections],
+        "step02",
+        "steps/step02_normalized.json#/sections",
+    )
+    snapshot["stage0_adapter_diagnostics"] = tagged(
+        dict(stage0_adapter_diagnostics or {}),
+        "step02",
+        "steps/step02_normalized.json#/stage0_adapter_diagnostics",
+    )
+    snapshot["stage0_normalized_assets"] = tagged(
+        dict(stage0_normalized_assets or {}),
+        "step02",
+        "steps/step02_normalized.json#/stage0_normalized_assets",
+    )
+
+    # Entry whose provenance is step05_v4_evidence.json.
+    snapshot["v4_evidence"] = tagged(
+        list(v4_evidence or []),
+        "step05",
+        "steps/step05_v4_evidence.json#/evidence_per_section",
+    )
+
+    # Entries attributed to Step 6.
+    snapshot["layout_preset_pre_override"] = tagged(
+        layout_preset_pre_override,
+        "step06",
+        "steps/step06_composition_plan.json#/layout_preset_decided",
+    )
+    snapshot["units"] = tagged(
+        [serialize_unit(u) for u in units],
+        "step06",
+        "steps/step06_composition_plan.json#/selected_units",
+    )
+    snapshot["comp_debug"] = tagged(
+        dict(comp_debug or {}),
+        "step06",
+        "steps/step06_composition_plan.json#/*",
+    )
+    # v4_fallback_traces is assembled inside run_phase_z2_mvp1 (see
+    # phase_z2_pipeline.py around the Step 5/6 boundary) and reaches
+    # step06_composition_plan.json only partially, via the
+    # v4_fallback_summary / imp48_resplit fields. The canonical
+    # untruncated source is the in-memory dict at the end of Step 6, which
+    # is what the reuse path needs — hence the non-file source_path.
+    snapshot["v4_fallback_traces"] = tagged(
+        dict(v4_fallback_traces or {}),
+        "step06",
+        "phase_z2_pipeline.run_phase_z2_mvp1::v4_fallback_traces",
+    )
+
+    # Entry whose provenance is step00_preconditions.json.
+    snapshot["ai_preflight"] = tagged(
+        dict(ai_preflight or {}),
+        "step00",
+        "steps/step00_preconditions.json#/ai_preflight",
+    )
+
+    # Result discarded on purpose: this is only a JSON-safety probe.
+    json.dumps(snapshot)
+    return snapshot
+
+
+class SnapshotValidationError(ValueError):
+    """Signals that a reuse snapshot cannot be trusted.
+
+    ``validate_snapshot`` raises this for a structurally unusable
+    snapshot or a failed ``mdx_sha256`` integrity check.
+
+    It derives from ``ValueError`` so that callers which already use
+    ``except ValueError`` keep catching it instead of letting it escape
+    to the outer CLI (u4b will add a tighter
+    ``except SnapshotValidationError``).
+    """
+
+
+def validate_snapshot(
+    snapshot: Any,
+    *,
+    expected_mdx_sha256: str,
+) -> None:
+    """Check a loaded snapshot and raise on the first problem (fail-closed).
+
+    Checks run in this order; each failure raises
+    ``SnapshotValidationError``:
+        1. ``snapshot`` is a dict.
+        2. ``schema_version`` is a real ``int`` (not ``bool`` / ``float``)
+           equal to ``SNAPSHOT_VERSION``.
+        3. ``mdx_sha256`` is a non-empty string equal to
+           ``expected_mdx_sha256``.
+        4. Every key in ``REQUIRED_TOP_LEVEL_KEYS`` is present.
+        5. Every top-level entry other than the bare keys — including
+           keys outside the required list — is a dict exposing ``value``,
+           ``source_path`` and ``upstream_step``.
+
+    Args:
+        snapshot: Object loaded from the snapshot file.
+        expected_mdx_sha256: Digest of the current MDX bytes that the
+            stored ``mdx_sha256`` must match.
+
+    Returns:
+        ``None`` on success.
+
+    Raises:
+        SnapshotValidationError: on the first failed check. Callers (u4b)
+            translate it into an exit-code-2 abort with the failing axis
+            surfaced as `value + path + upstream` (factual-verification
+            guardrail).
+    """
+    if not isinstance(snapshot, dict):
+        raise SnapshotValidationError(
+            f"snapshot is not a dict (got {type(snapshot).__name__})"
+        )
+
+    version = snapshot.get("schema_version")
+    # ``True == 1`` and ``1.0 == 1`` in Python, so a bare ``!=`` would accept
+    # JSON ``true`` / ``1.0`` as version 1. Require an actual non-bool int so
+    # the contract key stays fail-closed.
+    version_is_int = isinstance(version, int) and not isinstance(version, bool)
+    if not version_is_int or version != SNAPSHOT_VERSION:
+        raise SnapshotValidationError(
+            f"schema_version mismatch: expected {SNAPSHOT_VERSION!r}, got {version!r}"
+        )
+
+    actual_sha = snapshot.get("mdx_sha256")
+    if not isinstance(actual_sha, str) or not actual_sha:
+        raise SnapshotValidationError(
+            f"mdx_sha256 missing or non-string: got {actual_sha!r}"
+        )
+    if actual_sha != expected_mdx_sha256:
+        raise SnapshotValidationError(
+            f"mdx_sha256 mismatch: snapshot={actual_sha!r} "
+            f"expected={expected_mdx_sha256!r}"
+        )
+
+    missing = [k for k in REQUIRED_TOP_LEVEL_KEYS if k not in snapshot]
+    if missing:
+        raise SnapshotValidationError(
+            f"missing required keys: {missing!r}"
+        )
+
+    # Walk every key actually present, not just the required ones, so an
+    # unexpected unwrapped entry is rejected as well.
+    for key, entry in snapshot.items():
+        if key in _BARE_KEYS:
+            continue
+        if not isinstance(entry, dict):
+            raise SnapshotValidationError(
+                f"key {key!r}: expected wrapper dict, got {type(entry).__name__}"
+            )
+        for field_name in ("value", "source_path", "upstream_step"):
+            if field_name not in entry:
+                raise SnapshotValidationError(
+                    f"key {key!r}: wrapper missing {field_name!r}"
+                )