feat(IMP-08): U2 — aligner canonical sub-id + N-R5 decimal alias guard
align_sections_to_v4_granularity now emits canonical sub-section ids
of the form ${section_id}-sub-${ordinal} (e.g., "04-2-sub-1"), matching
the frontend drag/drop schema. Each drilled sub-section populates
heading_number (decimal "2.1" / integer "1" / None for undecorated)
and v4_alias_keys for legacy V4 keys.
N-R5 decimal-only alias guard: v4_alias_keys is populated only when
heading_number matches re.fullmatch(r"\d+\.\d+", ...). Integer-only
H3 headings (e.g., MDX 05's "### 1", "### 2") and bare H3 headings
produce no alias, which avoids sibling-parent V4 collisions (RULE 0
generalization — applies to all 32-frame MDX, not only to MDX 05).
The drill regex is broadened from r"^###\s+(\d+\.\d+)\s+(.+?)$" to
r"^###\s+(?:(\d+(?:\.\d+)?)\s+)?(.+?)$" so that integer-only and bare H3
headings are now recognised as sub-sections; previously they failed
the regex and were silently kept under the parent section.
Tests: 7 new cases (MdxSection default 4-positional callers, V4 exact
passthrough, decimal drill with alias, integer-only no-alias guard,
bare H3 no-alias, no-H3 passthrough, end-to-end aligner -> resolver
round-trip with legacy V4 alias). test_phase_z2_subsection_schema now
passes 15/15; with the 14 override and 8 fallback baseline tests, 37/37 PASS.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -375,6 +375,16 @@ def load_v4_result() -> dict:
|
|||||||
def align_sections_to_v4_granularity(sections: list[MdxSection], v4: dict) -> list[MdxSection]:
|
def align_sections_to_v4_granularity(sections: list[MdxSection], v4: dict) -> list[MdxSection]:
|
||||||
"""V4 section granularity 에 맞춰 sections 조정.
|
"""V4 section granularity 에 맞춰 sections 조정.
|
||||||
|
|
||||||
|
IMP-08 B-3 : canonical sub-section id ``${section_id}-sub-${ordinal}``
|
||||||
|
(예 : ``04-2-sub-1``) 를 emit 하고, legacy V4 키 (``04-2.1``) 는
|
||||||
|
``v4_alias_keys`` 로 보존하여 ``_resolve_v4_section_key`` 가 alias 경로로
|
||||||
|
매칭한다. canonical ordinal id 는 frontend drag/drop override 와 동일
|
||||||
|
schema (`section_id-sub-N`).
|
||||||
|
|
||||||
|
N-R5 alias guard : heading_number 가 decimal (``2.1``) 일 때만 alias
|
||||||
|
emit. integer-only (``1``) / non-numeric heading 은 alias 0 — sibling
|
||||||
|
parent V4 evidence 로 잘못 promote 되는 collision 방지 (RULE 0).
|
||||||
|
|
||||||
각 section 에 대해 :
|
각 section 에 대해 :
|
||||||
- V4 에 section.section_id 키 있음 → 그대로 유지 (## level 매칭)
|
- V4 에 section.section_id 키 있음 → 그대로 유지 (## level 매칭)
|
||||||
- V4 에 키 없고 raw_content 에 ### sub-section 존재 → ### 로 drill
|
- V4 에 키 없고 raw_content 에 ### sub-section 존재 → ### 로 drill
|
||||||
@@ -388,31 +398,52 @@ def align_sections_to_v4_granularity(sections: list[MdxSection], v4: dict) -> li
|
|||||||
v4_keys = set(v4.get("mdx_sections", {}).keys())
|
v4_keys = set(v4.get("mdx_sections", {}).keys())
|
||||||
aligned: list[MdxSection] = []
|
aligned: list[MdxSection] = []
|
||||||
|
|
||||||
|
# IMP-08 B-3 : capture optional heading-number prefix (decimal "2.1" or
|
||||||
|
# integer "1") + heading title. None group = bare "### Title".
|
||||||
|
sub_pattern = re.compile(
|
||||||
|
r"^###\s+(?:(\d+(?:\.\d+)?)\s+)?(.+?)$", re.MULTILINE
|
||||||
|
)
|
||||||
|
decimal_re = re.compile(r"\d+\.\d+")
|
||||||
|
|
||||||
for section in sections:
|
for section in sections:
|
||||||
if section.section_id in v4_keys:
|
if section.section_id in v4_keys:
|
||||||
aligned.append(section)
|
aligned.append(section)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ### drill 시도
|
|
||||||
sub_pattern = re.compile(r"^###\s+(\d+\.\d+)\s+(.+?)$", re.MULTILINE)
|
|
||||||
sub_matches = list(sub_pattern.finditer(section.raw_content))
|
sub_matches = list(sub_pattern.finditer(section.raw_content))
|
||||||
if not sub_matches:
|
if not sub_matches:
|
||||||
aligned.append(section) # drill 불가, V4 lookup 에서 abort 됨
|
aligned.append(section) # drill 불가, V4 lookup 에서 abort 됨
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# ### sub-section 추출
|
|
||||||
mdx_id = section.section_id.split("-")[0] # e.g., "04"
|
mdx_id = section.section_id.split("-")[0] # e.g., "04"
|
||||||
for i, m in enumerate(sub_matches):
|
for ordinal, m in enumerate(sub_matches, start=1):
|
||||||
subnum = m.group(1) # e.g., "2.1"
|
heading_number = m.group(1) # decimal "2.1" / integer "1" / None
|
||||||
sub_title = m.group(2).strip()
|
sub_title = m.group(2).strip()
|
||||||
start = m.end()
|
start = m.end()
|
||||||
end = sub_matches[i + 1].start() if i + 1 < len(sub_matches) else len(section.raw_content)
|
end = (
|
||||||
|
sub_matches[ordinal].start()
|
||||||
|
if ordinal < len(sub_matches)
|
||||||
|
else len(section.raw_content)
|
||||||
|
)
|
||||||
raw = section.raw_content[start:end].strip()
|
raw = section.raw_content[start:end].strip()
|
||||||
|
|
||||||
|
# N-R5 : alias only for decimal heading numbers. integer-only
|
||||||
|
# H3 (`### 1`) or undecorated H3 produce no alias to avoid
|
||||||
|
# sibling-parent V4 collisions (e.g., 05.mdx integer H3s).
|
||||||
|
alias_keys: list[str] = []
|
||||||
|
if heading_number and decimal_re.fullmatch(heading_number):
|
||||||
|
alias_keys.append(f"{mdx_id}-{heading_number}")
|
||||||
|
|
||||||
|
title = (
|
||||||
|
f"{heading_number} {sub_title}" if heading_number else sub_title
|
||||||
|
)
|
||||||
aligned.append(MdxSection(
|
aligned.append(MdxSection(
|
||||||
section_id=f"{mdx_id}-{subnum}", # e.g., "04-2.1"
|
section_id=f"{section.section_id}-sub-{ordinal}",
|
||||||
section_num=section.section_num,
|
section_num=section.section_num,
|
||||||
title=f"{subnum} {sub_title}",
|
title=title,
|
||||||
raw_content=raw,
|
raw_content=raw,
|
||||||
|
heading_number=heading_number,
|
||||||
|
v4_alias_keys=alias_keys,
|
||||||
))
|
))
|
||||||
|
|
||||||
return aligned
|
return aligned
|
||||||
|
|||||||
@@ -7,11 +7,17 @@ NO MDX-specific section ids beyond canonical id format.
|
|||||||
Locked scope (Stage 3 R8) :
|
Locked scope (Stage 3 R8) :
|
||||||
A. ``derive_parent_id`` canonical ordinal recognition + legacy decimal fallback.
|
A. ``derive_parent_id`` canonical ordinal recognition + legacy decimal fallback.
|
||||||
B. ``_resolve_v4_section_key`` exact > alias > None (no parent/sibling promotion).
|
B. ``_resolve_v4_section_key`` exact > alias > None (no parent/sibling promotion).
|
||||||
|
C. ``align_sections_to_v4_granularity`` canonical ordinal id emit + N-R5
|
||||||
|
decimal-only alias guard + MdxSection default-construction stability.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from src.phase_z2_composition import derive_parent_id
|
from src.phase_z2_composition import derive_parent_id
|
||||||
from src.phase_z2_pipeline import _resolve_v4_section_key
|
from src.phase_z2_pipeline import (
|
||||||
|
MdxSection,
|
||||||
|
_resolve_v4_section_key,
|
||||||
|
align_sections_to_v4_granularity,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ─── A. derive_parent_id ────────────────────────────────────────────────────
|
# ─── A. derive_parent_id ────────────────────────────────────────────────────
|
||||||
@@ -80,3 +86,95 @@ def test_alias_resolver_miss_returns_none():
|
|||||||
_resolve_v4_section_key(v4, "04-2-sub-1", alias_keys=["04-2.1"])
|
_resolve_v4_section_key(v4, "04-2-sub-1", alias_keys=["04-2.1"])
|
||||||
is None
|
is None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── C. align_sections_to_v4_granularity ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _section(section_id, num, title, raw_content):
|
||||||
|
"""Build an MdxSection with default sub-section schema fields."""
|
||||||
|
return MdxSection(
|
||||||
|
section_id=section_id,
|
||||||
|
section_num=num,
|
||||||
|
title=title,
|
||||||
|
raw_content=raw_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mdx_section_default_construction_preserves_4_positional_callers():
|
||||||
|
# IMP-08 B-3 : MdxSection still accepts the legacy 4-positional shape
|
||||||
|
# (defaults for heading_number / v4_alias_keys / sub_sections).
|
||||||
|
s = MdxSection("04-1", 1, "1. Top", "body")
|
||||||
|
assert s.heading_number is None
|
||||||
|
assert s.v4_alias_keys == []
|
||||||
|
assert s.sub_sections == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_passthrough_when_v4_key_exact_match():
|
||||||
|
# Section already aligned to V4 key — aligner keeps it untouched.
|
||||||
|
sections = [_section("04-1", 1, "1. Top", "body")]
|
||||||
|
v4 = {"mdx_sections": {"04-1": {"judgments_full32": []}}}
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
assert len(out) == 1
|
||||||
|
assert out[0].section_id == "04-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_drill_emits_canonical_ordinal_id_with_decimal_alias():
|
||||||
|
# Decimal H3 headings -> canonical ordinal id + decimal alias (legacy V4 key).
|
||||||
|
raw = "### 2.1 First\nbody1\n### 2.2 Second\nbody2\n"
|
||||||
|
sections = [_section("04-2", 2, "2. Parent", raw)]
|
||||||
|
v4 = {"mdx_sections": {}} # forces drill (no exact key)
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
assert [s.section_id for s in out] == ["04-2-sub-1", "04-2-sub-2"]
|
||||||
|
assert [s.heading_number for s in out] == ["2.1", "2.2"]
|
||||||
|
# N-R5 : decimal headings -> alias emitted.
|
||||||
|
assert out[0].v4_alias_keys == ["04-2.1"]
|
||||||
|
assert out[1].v4_alias_keys == ["04-2.2"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_drill_integer_only_h3_emits_no_alias_n_r5_guard():
|
||||||
|
# N-R5 : integer-only H3 (e.g., "### 1 Title") must NOT generate an alias,
|
||||||
|
# otherwise it would collide with sibling parent V4 entries (`{mdx_id}-1`).
|
||||||
|
raw = "### 1 Alpha\nbody1\n### 2 Beta\nbody2\n"
|
||||||
|
sections = [_section("05-2", 2, "2. Parent", raw)]
|
||||||
|
v4 = {"mdx_sections": {}}
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
assert [s.section_id for s in out] == ["05-2-sub-1", "05-2-sub-2"]
|
||||||
|
assert [s.heading_number for s in out] == ["1", "2"]
|
||||||
|
assert out[0].v4_alias_keys == []
|
||||||
|
assert out[1].v4_alias_keys == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_drill_undecorated_h3_emits_no_alias():
|
||||||
|
# Plain `### Title` without numeric prefix -> heading_number=None, no alias.
|
||||||
|
raw = "### Alpha\nbody1\n### Beta\nbody2\n"
|
||||||
|
sections = [_section("03-3", 3, "3. Parent", raw)]
|
||||||
|
v4 = {"mdx_sections": {}}
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
assert [s.section_id for s in out] == ["03-3-sub-1", "03-3-sub-2"]
|
||||||
|
assert [s.heading_number for s in out] == [None, None]
|
||||||
|
assert all(s.v4_alias_keys == [] for s in out)
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_no_h3_passes_section_through_unchanged():
|
||||||
|
# No H3 sub-headings in raw_content -> aligner keeps the section.
|
||||||
|
sections = [_section("04-1", 1, "1. Top", "no subheadings here\njust prose")]
|
||||||
|
v4 = {"mdx_sections": {}}
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
assert len(out) == 1
|
||||||
|
assert out[0].section_id == "04-1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_align_resolver_round_trip_with_legacy_v4_alias():
|
||||||
|
# End-to-end : aligner emits canonical id + alias keys; resolver finds the
|
||||||
|
# legacy decimal key in V4 via alias path (no parent promotion).
|
||||||
|
raw = "### 2.1 First\nbody1\n"
|
||||||
|
sections = [_section("04-2", 2, "2. Parent", raw)]
|
||||||
|
v4 = {"mdx_sections": {"04-2.1": {"judgments_full32": []}}}
|
||||||
|
out = align_sections_to_v4_granularity(sections, v4)
|
||||||
|
sub = out[0]
|
||||||
|
assert sub.section_id == "04-2-sub-1"
|
||||||
|
resolved = _resolve_v4_section_key(
|
||||||
|
v4, sub.section_id, alias_keys=sub.v4_alias_keys
|
||||||
|
)
|
||||||
|
assert resolved == "04-2.1"
|
||||||
|
|||||||
Reference in New Issue
Block a user