Add Phase Z Layer A planning scaffold

- add Internal Region model to Phase Z architecture docs and specs
- add frame contract content type and Frame Slot declarations
- add dormant content object extractor and internal region planner
2026-05-04 08:21:50 +09:00
parent e7848b602d
commit 2ec8fc5a77
7 changed files with 2604 additions and 0 deletions
--- a/src/phase_z2_content_extractor.py
+++ b/src/phase_z2_content_extractor.py
@@ -0,0 +1,323 @@
+"""Phase Z-2 Content Object extractor (B1 v0 — dormant module).
+
+SPEC v1 §1 의 typed content_object schema 만족하는 dedicated extractor.
+
+v0 minimal :
+  - 지원 type : text_block, transform_table 2 개 만 (table / image / diagram / details 제외)
+  - role : 모두 "summary" (v0 default — role 정밀화는 별 axis)
+  - dormant — runtime path 미연결 (pipeline / composition / mapper 미터치)
+  - mapper 미수정, 기존 helper move / promote / copy 없음
+  - transform_table 은 *arrow column 보존* 위해 B1 *local helper* 로 구현
+    (regex / parsing 일부가 mapper helper 와 유사 — 단 mapper helper 는 arrow 폐기.
+     향후 helper promote / 통합 refactor 는 별 axis)
+
+v0 흐름 :
+  section.raw_content
+    → 3-column markdown table 감지 (arrow glyph 포함) → transform_table
+    → 나머지 content → text_block (format / bullet_count / has_emphasis 분석)
+  → list[ContentObject]
+
+검증 :
+  - dormancy : MDX 03 final.html SHA = canonical 유지 (runtime path 미연결)
+  - correctness : __main__ self-test (text_block 1 case + transform_table 1 case)
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+
+# B1 v0 helper 처리 정직 기록 (기존 보고 정정 — 2026-04-30) :
+#   - `phase_z2_mapper` 미수정. 기존 mapper helper (`_extract_markdown_table` 등) move /
+#     promote / copy 없음.
+#   - 단 SPEC v1 §1.2 transform_table.rows = [{from, arrow, to}] schema 가
+#     mapper 의 helper 출력 (from/to 만, arrow 폐기) 와 호환 안 됨.
+#   - 따라서 *arrow column 보존* 이 필요한 transform_table 추출 부분은 본 module 의
+#     *layer-agnostic local helper* (`_capture_3col_transform_table`) 로 *별도 구현*.
+#   - mapper helper 와 regex / parsing 일부 유사 — 향후 *promote / 통합 refactor* 는
+#     별 axis (B1 안정 후 layer-agnostic helper module 통합 검토 가능).
+
+
+# ─── ContentObject schema (SPEC v1 §1.1) ────────────────────────
+
+
@dataclass
class ContentObject:
    """One typed content object, following the SPEC v1 §1.1 base schema.

    The v0 extractor in this module only produces two kinds of object:
    ``text_block`` and ``transform_table``.
    """

    # Identifier unique within a section, e.g. '03-2.transform-1' or '03-2.text-1'.
    id: str
    # Content kind: "text_block" or "transform_table".
    type: str
    # Always "summary" in v0; finer-grained roles are a separate work item.
    role: str
    # Source markdown for this object, kept without truncation or rewriting.
    raw_payload: str
    # Size hints that depend on the kind (line_count, rows, ...).
    size_estimate: dict = field(default_factory=dict)
    # Kind-specific details (SPEC v1 §1.2).
    type_specific: dict = field(default_factory=dict)
+
+
+# ─── Transform table extraction ─────────────────────────────────
+
+
+_ARROW_GLYPHS = ("➜", "➠", "→", "->", "=>")
+
+_TABLE_PATTERN = re.compile(
+    r"(^[ \t]*\|[^\n]+\|\n[ \t]*\|[\s\-:|]+\|\n(?:[ \t]*\|[^\n]+\|\n?)+)",
+    re.MULTILINE,
+)
+
+
+def _capture_3col_transform_table(content: str) -> tuple[dict | None, str]:
+    """3-column markdown table 에서 (from / arrow / to) 캡처 → transform_table.
+
+    본 함수 = B1 v0 의 *layer-agnostic extractor helper*. mapper 의
+    `_extract_markdown_table` 와 regex / parsing 의 일부가 유사하나, mapper helper 는
+    arrow column 을 폐기 (from/to 만 추출) — SPEC v1 §1.2 의
+    `transform_table.rows = [{from, arrow, to}]` schema 를 직접 만족 못 함.
+    따라서 arrow column 보존 필요해 본 module 안에 *별도 구현*. mapper 미수정 유지.
+
+    *향후 helper promote / 통합 refactor 는 별 axis* — B1 안정 후 mapper 와
+    *layer-agnostic helper module* 통합 검토 가능.
+
+    arrow column 에 arrow glyph 가 있어야 transform 으로 인정.
+
+    Returns :
+      ({"type_specific": ..., "raw_payload": <table markdown>}, content_without_table)
+      또는 (None, original_content) — transform 패턴 미감지 시
+    """
+    m = _TABLE_PATTERN.search(content)
+    if not m:
+        return None, content
+
+    raw_lines = [r.strip() for r in m.group(1).strip().splitlines() if r.strip()]
+    if len(raw_lines) < 3:                                # header + separator + ≥1 data row
+        return None, content
+
+    data_rows = raw_lines[2:]                             # skip header + separator
+    pairs: list[dict] = []
+    arrow_glyph = ""
+    for r in data_rows:
+        cells = [c.strip() for c in r.strip("|").split("|")]
+        if len(cells) < 3:
+            continue
+        f = re.sub(r"\*\*(.+?)\*\*", r"\1", cells[0])
+        a = re.sub(r"\*\*(.+?)\*\*", r"\1", cells[1])
+        t = re.sub(r"\*\*(.+?)\*\*", r"\1", cells[2])
+        if not arrow_glyph:
+            for g in _ARROW_GLYPHS:
+                if g in a:
+                    arrow_glyph = g
+                    break
+        pairs.append({"from": f, "arrow": a, "to": t})
+
+    if not pairs:
+        return None, content
+
+    # transform 인지 검증 — arrow glyph 가 *어느 row 든* 등장해야
+    has_arrow = any(any(g in p["arrow"] for g in _ARROW_GLYPHS) for p in pairs)
+    if not has_arrow:
+        return None, content
+
+    type_specific = {
+        "pair_count": len(pairs),
+        "arrow_glyph": arrow_glyph,
+        "rows": pairs,
+    }
+    raw_table = m.group(1)
+    remaining = content[: m.start()] + content[m.end() :]
+    return ({"type_specific": type_specific, "raw_payload": raw_table}, remaining)
+
+
+# ─── Text block extraction ──────────────────────────────────────
+
+
+def _detect_text_block_specific(content: str) -> tuple[dict, int]:
+    """text_block 의 type_specific + line_count 추출.
+
+    format 결정 :
+      - top bullet 0           → paragraph
+      - top bullet 있음, nested 0  → bullet_list
+      - top bullet + nested    → nested_list
+
+    Returns :
+      (type_specific dict, line_count)
+    """
+    lines = content.splitlines()
+
+    top_bullets = sum(1 for l in lines if re.match(r"^[\*\-]\s", l))
+    nested_bullets = sum(1 for l in lines if re.match(r"^\s+[\*\-]\s", l))
+
+    # max_indent_level (2-space indent 단위)
+    max_indent = 0
+    for l in lines:
+        mm = re.match(r"^( *)[\*\-]\s", l)
+        if mm:
+            level = len(mm.group(1)) // 2
+            max_indent = max(max_indent, level)
+
+    if top_bullets == 0:
+        fmt = "paragraph"
+    elif nested_bullets > 0:
+        fmt = "nested_list"
+    else:
+        fmt = "bullet_list"
+
+    has_emphasis = bool(
+        re.search(r"\*\*[^*\n]+\*\*", content)
+        or re.search(r"(?<!\*)\*[^*\n]+\*(?!\*)", content)
+    )
+
+    line_count = sum(1 for l in lines if l.strip())
+
+    type_specific = {
+        "format": fmt,
+        "bullet_count": top_bullets,
+        "max_indent_level": max_indent,
+        "has_emphasis": has_emphasis,
+    }
+    return type_specific, line_count
+
+
+# ─── Public entry ───────────────────────────────────────────────
+
+
def extract_content_objects(section) -> list[ContentObject]:
    """Turn a section's raw markdown into typed content objects (SPEC v1 §1).

    v0 behaviour:
      - a transform table, when one is captured, comes first as a
        ``transform_table`` object;
      - whatever text is left (or the whole content when no table was
        captured) becomes one ``text_block`` object, unless it is blank;
      - every object gets ``role="summary"``;
      - other content kinds are not recognised here.

    Args:
      section: object exposing ``raw_content`` and ``section_id`` attributes.

    Returns:
      A list of zero to two ``ContentObject`` items: empty for blank content,
      one for table-only or text-only content, two when both are present.
    """
    markdown = section.raw_content
    sid = section.section_id

    table, leftover = _capture_3col_transform_table(markdown)
    found: list[ContentObject] = []

    if table is not None:
        details = table["type_specific"]
        table_obj = ContentObject(
            id=f"{sid}.transform-1",
            type="transform_table",
            role="summary",
            raw_payload=table["raw_payload"],
            size_estimate={"rows": details["pair_count"]},
            type_specific=details,
        )
        found.append(table_obj)

    # Without a captured table the text block is built from the full content.
    body = markdown if table is None else leftover
    if not body.strip():
        return found

    text_details, n_lines = _detect_text_block_specific(body)
    text_obj = ContentObject(
        id=f"{sid}.text-1",
        type="text_block",
        role="summary",
        raw_payload=body.strip(),
        size_estimate={"line_count": n_lines},
        type_specific=text_details,
    )
    found.append(text_obj)
    return found
+
+
+# ─── Self-test (B1 v0 correctness 검증) ─────────────────────────
+
+
def _run_self_test():
    """Run the v0 correctness checks: one text_block case, one transform_table case.

    Both cases use fixed inline markdown. A failed check raises
    ``AssertionError``; each passing case prints a progress line.
    """

    class _FakeSection:
        """Minimal stand-in exposing the two attributes the extractor reads."""

        def __init__(self, section_id: str, raw_content: str):
            self.section_id = section_id
            self.raw_content = raw_content

    # --- Case 1: nested bullet list only -> a single text_block ---
    nested_md = "\n".join(
        [
            "* **기술 부족**",
            "  * 디지털 도구 미숙",
            "  * BIM 활용 제한",
            "* **인력 부족**",
            "  * 전문가 부재",
            "* **자연 환경**",
            "  * 지역적 제약",
        ]
    ) + "\n"
    text_only = extract_content_objects(_FakeSection("test-1", nested_md))
    assert len(text_only) == 1, f"text-only section → 1 obj 기대, got {len(text_only)}"
    text_obj = text_only[0]
    text_meta = text_obj.type_specific
    assert text_obj.type == "text_block", f"type=text_block 기대, got {text_obj.type}"
    assert text_obj.role == "summary"
    assert text_obj.id == "test-1.text-1"
    assert text_meta["format"] == "nested_list", f"format=nested_list 기대, got {text_meta['format']}"
    assert text_meta["bullet_count"] == 3, f"top bullet=3 기대, got {text_meta['bullet_count']}"
    assert text_meta["max_indent_level"] >= 1, "nested 가 있으니 max_indent ≥ 1"
    assert text_meta["has_emphasis"] is True, "**bold** 존재 → has_emphasis=True"
    assert text_obj.size_estimate["line_count"] >= 6
    assert "기술 부족" in text_obj.raw_payload, "원문 보존 — '기술 부족' 잔존 필요"
    print("[OK] Test 1 (text_block) passed.")

    # --- Case 2: 3-column arrow table surrounded by text -> table + text_block ---
    mixed_md = "\n".join(
        [
            "**프로세스 변환**",
            "",
            "| AS-IS | ➜ | TO-BE |",
            "|---|---|---|",
            "| 도면 중심 | ➜ | BIM 모델 중심 |",
            "| 단계별 분리 | ➜ | 통합 협업 |",
            "| 사후 검토 | ➜ | 실시간 검증 |",
            "",
            "추가 설명 : 위 변환이 핵심.",
        ]
    ) + "\n"
    mixed = extract_content_objects(_FakeSection("test-2", mixed_md))
    assert len(mixed) == 2, f"transform+text → 2 obj 기대, got {len(mixed)}"
    table_obj, rest_obj = mixed

    # The transform table comes first.
    table_meta = table_obj.type_specific
    assert table_obj.type == "transform_table", f"첫 obj=transform_table 기대, got {table_obj.type}"
    assert table_obj.role == "summary"
    assert table_obj.id == "test-2.transform-1"
    assert table_meta["pair_count"] == 3, f"pair_count=3 기대, got {table_meta['pair_count']}"
    assert table_meta["arrow_glyph"] == "➜", f"arrow_glyph=➜ 기대, got {table_meta['arrow_glyph']}"
    assert len(table_meta["rows"]) == 3
    first_row = table_meta["rows"][0]
    assert first_row["from"] == "도면 중심"
    assert first_row["to"] == "BIM 모델 중심"
    assert table_obj.size_estimate["rows"] == 3
    assert "도면 중심" in table_obj.raw_payload, "raw_payload 에 원본 table 보존"

    # The text on both sides of the table ends up in the text_block.
    assert rest_obj.type == "text_block", f"두번째 obj=text_block 기대, got {rest_obj.type}"
    assert rest_obj.id == "test-2.text-1"
    assert "프로세스 변환" in rest_obj.raw_payload, "transform 제거 후 surrounding text 보존 — '프로세스 변환'"
    assert "추가 설명" in rest_obj.raw_payload, "transform 뒤 잔여 text 보존 — '추가 설명'"
    print("[OK] Test 2 (transform_table + text_block) passed.")

    print("\n=== B1 v0 self-test PASS ===")


# Running the module as a script executes the self-test.
if __name__ == "__main__":
    _run_self_test()