From 5191acad85071a67b211f2ec77a409909123c0a4 Mon Sep 17 00:00:00 2001 From: kyeongmin Date: Fri, 15 May 2026 22:33:49 +0900 Subject: [PATCH] =?UTF-8?q?feat(IMP-08):=20U2=20=E2=80=94=20aligner=20cano?= =?UTF-8?q?nical=20sub-id=20+=20N-R5=20decimal=20alias=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit align_sections_to_v4_granularity now emits canonical sub-section ids of the form ${section_id}-sub-${ordinal} (e.g., "04-2-sub-1"), matching the frontend drag/drop schema. Each drilled sub-section populates heading_number (decimal "2.1" / integer "1" / None for undecorated) and v4_alias_keys for legacy V4 keys. N-R5 decimal-only alias guard : v4_alias_keys is populated only when heading_number matches re.fullmatch(r"\d+\.\d+", ...). Integer-only H3 headings (e.g., MDX 05's "### 1", "### 2") and bare H3 headings produce no alias to avoid sibling-parent V4 collisions (RULE 0 generalization — applies to all 32-frame MDX, not MDX 05-specific). The drill regex is broadened from r"^###\s+(\d+\.\d+)\s+..." to r"^###\s+(?:(\d+(?:\.\d+)?)\s+)?(.+?)$" so integer-only and bare H3 headings are now recognised as sub-sections; they previously failed the regex and were silently kept under the parent section. Tests : 7 new cases (MdxSection default 4-positional callers, V4 exact passthrough, decimal drill with alias, integer-only no-alias guard, bare H3 no-alias, no-H3 passthrough, end-to-end aligner -> resolver round-trip with legacy V4 alias). 15/15 in test_phase_z2_subsection_schema + 14 override + 8 fallback baseline = 37/37 PASS. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/phase_z2_pipeline.py | 47 +++++++++-- tests/test_phase_z2_subsection_schema.py | 100 ++++++++++++++++++++++- 2 files changed, 138 insertions(+), 9 deletions(-) diff --git a/src/phase_z2_pipeline.py b/src/phase_z2_pipeline.py index 5516902..e882b04 100644 --- a/src/phase_z2_pipeline.py +++ b/src/phase_z2_pipeline.py @@ -375,6 +375,16 @@ def load_v4_result() -> dict: def align_sections_to_v4_granularity(sections: list[MdxSection], v4: dict) -> list[MdxSection]: """V4 section granularity 에 맞춰 sections 조정. + IMP-08 B-3 : canonical sub-section id ``${section_id}-sub-${ordinal}`` + (예 : ``04-2-sub-1``) 를 emit 하고, legacy V4 키 (``04-2.1``) 는 + ``v4_alias_keys`` 로 보존하여 ``_resolve_v4_section_key`` 가 alias 경로로 + 매칭한다. canonical ordinal id 는 frontend drag/drop override 와 동일 + schema (`section_id-sub-N`). + + N-R5 alias guard : heading_number 가 decimal (``2.1``) 일 때만 alias + emit. integer-only (``1``) / non-numeric heading 은 alias 0 — sibling + parent V4 evidence 로 잘못 promote 되는 collision 방지 (RULE 0). + 각 section 에 대해 : - V4 에 section.section_id 키 있음 → 그대로 유지 (## level 매칭) - V4 에 키 없고 raw_content 에 ### sub-section 존재 → ### 로 drill @@ -388,31 +398,52 @@ def align_sections_to_v4_granularity(sections: list[MdxSection], v4: dict) -> li v4_keys = set(v4.get("mdx_sections", {}).keys()) aligned: list[MdxSection] = [] + # IMP-08 B-3 : capture optional heading-number prefix (decimal "2.1" or + # integer "1") + heading title. None group = bare "### Title". + sub_pattern = re.compile( + r"^###\s+(?:(\d+(?:\.\d+)?)\s+)?(.+?)$", re.MULTILINE + ) + decimal_re = re.compile(r"\d+\.\d+") + for section in sections: if section.section_id in v4_keys: aligned.append(section) continue - # ### drill 시도 - sub_pattern = re.compile(r"^###\s+(\d+\.\d+)\s+(.+?)$", re.MULTILINE) sub_matches = list(sub_pattern.finditer(section.raw_content)) if not sub_matches: aligned.append(section) # drill 불가, V4 lookup 에서 abort 됨 continue - # ### sub-section 추출 mdx_id = section.section_id.split("-")[0] # e.g., "04" - for i, m in enumerate(sub_matches): - subnum = m.group(1) # e.g., "2.1" + for ordinal, m in enumerate(sub_matches, start=1): + heading_number = m.group(1) # decimal "2.1" / integer "1" / None sub_title = m.group(2).strip() start = m.end() - end = sub_matches[i + 1].start() if i + 1 < len(sub_matches) else len(section.raw_content) + end = ( + sub_matches[ordinal].start() + if ordinal < len(sub_matches) + else len(section.raw_content) + ) raw = section.raw_content[start:end].strip() + + # N-R5 : alias only for decimal heading numbers. integer-only + # H3 (`### 1`) or undecorated H3 produce no alias to avoid + # sibling-parent V4 collisions (e.g., 05.mdx integer H3s). + alias_keys: list[str] = [] + if heading_number and decimal_re.fullmatch(heading_number): + alias_keys.append(f"{mdx_id}-{heading_number}") + + title = ( + f"{heading_number} {sub_title}" if heading_number else sub_title + ) aligned.append(MdxSection( - section_id=f"{mdx_id}-{subnum}", # e.g., "04-2.1" + section_id=f"{section.section_id}-sub-{ordinal}", section_num=section.section_num, - title=f"{subnum} {sub_title}", + title=title, raw_content=raw, + heading_number=heading_number, + v4_alias_keys=alias_keys, )) return aligned diff --git a/tests/test_phase_z2_subsection_schema.py b/tests/test_phase_z2_subsection_schema.py index 1de1e7c..0955b68 100644 --- a/tests/test_phase_z2_subsection_schema.py +++ b/tests/test_phase_z2_subsection_schema.py @@ -7,11 +7,17 @@ NO MDX-specific section ids beyond canonical id format. Locked scope (Stage 3 R8) : A. ``derive_parent_id`` canonical ordinal recognition + legacy decimal fallback. B. ``_resolve_v4_section_key`` exact > alias > None (no parent/sibling promotion). + C. ``align_sections_to_v4_granularity`` canonical ordinal id emit + N-R5 + decimal-only alias guard + MdxSection default-construction stability. """ from __future__ import annotations from src.phase_z2_composition import derive_parent_id -from src.phase_z2_pipeline import _resolve_v4_section_key +from src.phase_z2_pipeline import ( + MdxSection, + _resolve_v4_section_key, + align_sections_to_v4_granularity, +) # ─── A. derive_parent_id ──────────────────────────────────────────────────── @@ -80,3 +86,95 @@ def test_alias_resolver_miss_returns_none(): _resolve_v4_section_key(v4, "04-2-sub-1", alias_keys=["04-2.1"]) is None ) + + +# ─── C. align_sections_to_v4_granularity ──────────────────────────────────── + + +def _section(section_id, num, title, raw_content): + """Build an MdxSection with default sub-section schema fields.""" + return MdxSection( + section_id=section_id, + section_num=num, + title=title, + raw_content=raw_content, + ) + + +def test_mdx_section_default_construction_preserves_4_positional_callers(): + # IMP-08 B-3 : MdxSection still accepts the legacy 4-positional shape + # (defaults for heading_number / v4_alias_keys / sub_sections). + s = MdxSection("04-1", 1, "1. Top", "body") + assert s.heading_number is None + assert s.v4_alias_keys == [] + assert s.sub_sections == [] + + +def test_align_passthrough_when_v4_key_exact_match(): + # Section already aligned to V4 key — aligner keeps it untouched. + sections = [_section("04-1", 1, "1. Top", "body")] + v4 = {"mdx_sections": {"04-1": {"judgments_full32": []}}} + out = align_sections_to_v4_granularity(sections, v4) + assert len(out) == 1 + assert out[0].section_id == "04-1" + + +def test_align_drill_emits_canonical_ordinal_id_with_decimal_alias(): + # Decimal H3 headings -> canonical ordinal id + decimal alias (legacy V4 key). + raw = "### 2.1 First\nbody1\n### 2.2 Second\nbody2\n" + sections = [_section("04-2", 2, "2. Parent", raw)] + v4 = {"mdx_sections": {}} # forces drill (no exact key) + out = align_sections_to_v4_granularity(sections, v4) + assert [s.section_id for s in out] == ["04-2-sub-1", "04-2-sub-2"] + assert [s.heading_number for s in out] == ["2.1", "2.2"] + # N-R5 : decimal headings -> alias emitted. + assert out[0].v4_alias_keys == ["04-2.1"] + assert out[1].v4_alias_keys == ["04-2.2"] + + +def test_align_drill_integer_only_h3_emits_no_alias_n_r5_guard(): + # N-R5 : integer-only H3 (e.g., "### 1 Title") must NOT generate an alias, + # otherwise it would collide with sibling parent V4 entries (`{mdx_id}-1`). + raw = "### 1 Alpha\nbody1\n### 2 Beta\nbody2\n" + sections = [_section("05-2", 2, "2. Parent", raw)] + v4 = {"mdx_sections": {}} + out = align_sections_to_v4_granularity(sections, v4) + assert [s.section_id for s in out] == ["05-2-sub-1", "05-2-sub-2"] + assert [s.heading_number for s in out] == ["1", "2"] + assert out[0].v4_alias_keys == [] + assert out[1].v4_alias_keys == [] + + +def test_align_drill_undecorated_h3_emits_no_alias(): + # Plain `### Title` without numeric prefix -> heading_number=None, no alias. + raw = "### Alpha\nbody1\n### Beta\nbody2\n" + sections = [_section("03-3", 3, "3. Parent", raw)] + v4 = {"mdx_sections": {}} + out = align_sections_to_v4_granularity(sections, v4) + assert [s.section_id for s in out] == ["03-3-sub-1", "03-3-sub-2"] + assert [s.heading_number for s in out] == [None, None] + assert all(s.v4_alias_keys == [] for s in out) + + +def test_align_no_h3_passes_section_through_unchanged(): + # No H3 sub-headings in raw_content -> aligner keeps the section. + sections = [_section("04-1", 1, "1. Top", "no subheadings here\njust prose")] + v4 = {"mdx_sections": {}} + out = align_sections_to_v4_granularity(sections, v4) + assert len(out) == 1 + assert out[0].section_id == "04-1" + + +def test_align_resolver_round_trip_with_legacy_v4_alias(): + # End-to-end : aligner emits canonical id + alias keys; resolver finds the + # legacy decimal key in V4 via alias path (no parent promotion). + raw = "### 2.1 First\nbody1\n" + sections = [_section("04-2", 2, "2. Parent", raw)] + v4 = {"mdx_sections": {"04-2.1": {"judgments_full32": []}}} + out = align_sections_to_v4_granularity(sections, v4) + sub = out[0] + assert sub.section_id == "04-2-sub-1" + resolved = _resolve_v4_section_key( + v4, sub.section_id, alias_keys=sub.v4_alias_keys + ) + assert resolved == "04-2.1"