fix(orchestrator): P5b first-line agent header strict + supplement throttle

Bug discovered during #24 IMP-24 K6 Stage 2 (2026-05-20):
- Codex r1, r2, r3 started with '=== IMPLEMENTATION_UNITS ===' on first line
  (not '[Codex #N] ...'), so detect_agent (P0-1 strict, first-line only)
  returned None.
- For non-audit issues, the P5 supplement guard was audit-only gated →
  silent loop until Codex r4 happened to use correct format. 4 rounds wasted.

Verified that #21 Stage 4 had the same latent silent loop pattern
('## [Codex #1]' first line) — orchestrator looped through ~10 Claude rounds
before random recovery. P5b fix addresses this long-standing bug.

Patch (defensive parser-contract hardening; does not assume single root cause):
1. RULES global gets explicit "FIRST non-empty line MUST be [Claude #N] /
   [Codex #N]" rule that OVERRIDES any stage-specific "body MUST contain"
   constraint.
2. COMPACT_PLAN_RULE wording clarified: "body" begins AFTER the first-line
   agent header. The 'body MUST contain ONLY' set no longer accidentally
   permits '=== IMPLEMENTATION_UNITS ===' on line 1.
3. is_codex None supplement guard:
   - audit-only gate REMOVED → fires for all issues (#24 latent loop fixed)
   - Throttle: max 2 supplements per stage; on 3rd violation, orchestrator
     hard-stops the issue with explicit "user action required" message and
     exits run_stage cleanly
   - Supplement message names both Claude AND Codex (Claude's first-line
     violation also breaks downstream via Codex mimicry)
   - Body-head 80 chars logged on detection failure (debugging aid)
4. Regression tests (+5 cases in test_orchestrator_core.py):
   - TestDetectAgent: '=== IMPLEMENTATION_UNITS ===' first line → None
   - TestDetectAgent: [Codex #N] first line + units after → 'codex' OK
   - TestDetectAgent: '## ', '📌 **', '**' prefix all → None
   - TestRulesAndCompactPlanFirstLineContract: RULES wording has FIRST/OVERRIDES
   - TestRulesAndCompactPlanFirstLineContract: COMPACT_PLAN_RULE has carve-out

Cosmetic side effect (accepted):
Claude's '📌 **[Claude #N] ...**' or '## [Codex #N] ...' decoration prefixes
will fail detect_agent. Agents will drop decorations from line 1; line 2+ can
still use them.

Out of scope (NOT included to keep regression risk low):
- detect_agent function logic UNCHANGED (P0-1 strict preserved)
- consensus parser UNCHANGED
- stage loop structure UNCHANGED
- git/Gitea retrieval logic UNCHANGED
- audit-only mode P4/P4a guards UNCHANGED
- pre-post comment validation (future axis, larger refactor)

Total: 131/131 pytest pass (126 prior + 5 new).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 17:01:24 +09:00
parent 447e702520
commit 5d23b747ff
2 changed files with 133 additions and 13 deletions
--- a/tests/orchestrator_unit/test_orchestrator_core.py
+++ b/tests/orchestrator_unit/test_orchestrator_core.py
@@ -128,6 +128,76 @@ Addressing [Codex #2] findings ...
        )
        assert detect_agent(body_header_first) == "codex"

+    # P5b (2026-05-20) — Stage 2 compact-plan first-line conflict regression.
+    # #24 IMP-24 K6: Codex r1–r3 began with '=== IMPLEMENTATION_UNITS ===' on line 1 →
+    # detect_agent returned None → orchestrator silent loop. Fix path = strict comment
+    # format, NOT relaxing detect_agent (the P0-1 strictness is kept as-is).
+
+    def test_implementation_units_first_line_breaks_detection(self):
+        """=== IMPLEMENTATION_UNITS === 가 첫 줄이면 detect_agent None (P0-1 strict 정상 동작)."""
+        body = (
+            "=== IMPLEMENTATION_UNITS ===\n"
+            "- id: u1\n"
+            "  summary: ...\n"
+            "  files:\n"
+            "    - docs/architecture/PHASE-Q-AUDIT.md\n"
+            "  tests:\n"
+            "    - pytest -q tests\n"
+            "  estimate_lines: 1\n"
+            "\n"
+            "FINAL_CONSENSUS: YES\n"
+        )
+        assert detect_agent(body) is None, (
+            "=== IMPLEMENTATION_UNITS === as first line MUST cause detect_agent None "
+            "(P0-1 strict). Fix path: enforce agent header first-line in prompt, not relax detect_agent."
+        )
+
+    def test_compact_plan_with_header_first_works(self):
+        """올바른 Stage 2 compact format: [Codex #N] 첫 줄 → === IMPLEMENTATION_UNITS === 둘째 줄+."""
+        body = (
+            "[Codex #4] Stage 2 simulation-plan review - IMP-24 K6\n"
+            "\n"
+            "=== IMPLEMENTATION_UNITS ===\n"
+            "- id: u1\n"
+            "  summary: ...\n"
+            "  tests:\n"
+            "    - pytest -q tests\n"
+            "\n"
+            "FINAL_CONSENSUS: YES\n"
+        )
+        assert detect_agent(body) == "codex"
+
+    def test_markdown_prefix_breaks_detection(self):
+        """P5b — `## [Codex #N]` 같은 markdown header prefix 도 detect_agent None.
+        (#21 Stage 4 에서 관찰된 latent silent loop 원인.)"""
+        body_hash = "## [Codex #1] Stage 4 test-verify Round #1\n\nVerdict: PASS\n"
+        body_emoji = "📌 **[Claude #1] Stage 2 plan**\n\nbody\n"
+        body_bold = "**[Codex #1] Stage 4**\n\nbody\n"
+        assert detect_agent(body_hash) is None
+        assert detect_agent(body_emoji) is None
+        assert detect_agent(body_bold) is None
+
+
+class TestRulesAndCompactPlanFirstLineContract:
+    """P5b (2026-05-20) — RULES 와 COMPACT_PLAN_RULE 둘 다 first-line agent header
+    rule 을 명시해야 함. wording 검증."""
+
+    def test_rules_has_first_line_strict(self):
+        from orchestrator import RULES
+        # RULES 안에 first-line strict + 모든 stage 적용 명시 있어야 함.
+        assert "FIRST non-empty line" in RULES
+        assert "[Claude #N]" in RULES and "[Codex #N]" in RULES
+        # P5b OVERRIDES 키워드 — body rule 들이 first-line rule 보다 우선하지 않음을 강조
+        assert "OVERRIDES" in RULES or "overrides" in RULES.lower()
+
+    def test_compact_plan_rule_carves_out_first_line(self):
+        from orchestrator import COMPACT_PLAN_RULE
+        # "body" 는 first-line agent header 다음부터 시작한다고 명시
+        assert "FIRST non-empty line" in COMPACT_PLAN_RULE or "first-line agent header" in COMPACT_PLAN_RULE
+        # "after the first-line" 같은 carve-out wording 검증
+        body_lower = COMPACT_PLAN_RULE.lower()
+        assert "after the first" in body_lower or "after the agent header" in body_lower
+

 # ─────────────────────────────────────────────────────────────────
 # parse_consensus — YES/NO + rewind_target