Checkpoint Type B pipeline refinement for run-002 and run-003

2026-04-07 12:16:58 +09:00
parent f48dbe5227
commit 11e9165a8f
71 changed files with 1318 additions and 1051 deletions
--- a/scripts/raw_bootstrap.py
+++ b/scripts/raw_bootstrap.py
@@ -1,4 +1,4 @@
-from __future__ import annotations
+from __future__ import annotations

 import json
 import re
@@ -7,61 +7,65 @@ from typing import Any


 def _read_text(path: Path) -> str:
-    return path.read_text(encoding="utf-8-sig")
+    return path.read_text(encoding='utf-8-sig')


-def _write_json(path: Path, data: dict[str, Any]) -> None:
-    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+def _write_json(path: Path, data: dict[str, Any] | list[Any]) -> None:
+    path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8')


 def _write_text(path: Path, text: str) -> None:
-    path.write_text(text, encoding="utf-8")
+    path.write_text(text, encoding='utf-8')
+
+
+def _normalize_space(text: str) -> str:
+    return re.sub(r'\s+', ' ', text or '').strip()


 def _compact(text: str, max_len: int) -> str:
-    normalized = re.sub(r"\s+", " ", text).strip()
+    normalized = _normalize_space(text)
    if len(normalized) <= max_len:
        return normalized
-    cut = normalized[:max_len].rsplit(" ", 1)[0].strip()
-    return (cut or normalized[:max_len]).rstrip(" ,.;:") + "..."
+    cut = normalized[:max_len].rsplit(' ', 1)[0].strip()
+    return (cut or normalized[:max_len]).rstrip(' ,.;:') + '...'


 def _preserve_len(text: str, ratio: float = 0.85, floor: int = 180, ceiling: int = 900) -> int:
-    normalized = re.sub(r"\s+", " ", text).strip()
+    normalized = _normalize_space(text)
    if not normalized:
        return floor
    return max(floor, min(ceiling, int(len(normalized) * ratio)))


+def _normalize_title_key(text: str) -> str:
+    return re.sub(r'\s+', '', text or '').lower()
+
+
 def _strip_frontmatter_and_imports(raw: str) -> str:
-    text = raw.replace("\r\n", "\n")
-    if text.startswith("---\n"):
-        end = text.find("\n---", 4)
+    text = raw.replace('\r\n', '\n')
+    if text.startswith('---\n'):
+        end = text.find('\n---', 4)
        if end != -1:
            text = text[end + 4 :]
-    text = re.sub(r"^import\s+.+?$", "", text, flags=re.M)
+    text = re.sub(r'^import\s+.+?$', '', text, flags=re.M)
    return text.strip()


 def _dx_effect_lines(repo_root: Path) -> list[str]:
-    path = repo_root / "components" / "dx.astro"
+    path = repo_root / 'components' / 'dx.astro'
    if not path.exists():
        return []
    text = _read_text(path)
-    text = re.sub(r"<style.*?</style>", "", text, flags=re.S)
-    text = text.replace("<br />", " ")
-    text = re.sub(r"</?(div|table|thead|tbody|tr|td|th|colgroup|col|ul|strong)[^>]*>", "\n", text)
-    text = re.sub(r"<li[^>]*>", "- ", text)
-    text = re.sub(r"</li>", "\n", text)
-    text = re.sub(r"<[^>]+>", " ", text)
+    text = re.sub(r'<style.*?</style>', '', text, flags=re.S)
+    text = text.replace('<br />', ' ')
+    text = re.sub(r'</?(div|table|thead|tbody|tr|td|th|colgroup|col|ul|strong)[^>]*>', '\n', text)
+    text = re.sub(r'<li[^>]*>', '- ', text)
+    text = re.sub(r'</li>', '\n', text)
+    text = re.sub(r'<[^>]+>', ' ', text)
    lines: list[str] = []
    for raw in text.splitlines():
-        line = re.sub(r"\s+", " ", raw).strip()
-        if not line:
-            continue
-        if line.startswith("/*") or line.startswith("["):
-            continue
-        if len(line) < 6:
+        line = _normalize_space(raw)
+        if not line or line.startswith('/*') or line.startswith('[') or len(line) < 6:
            continue
        lines.append(line)
    deduped: list[str] = []
@@ -73,72 +77,29 @@ def _dx_effect_lines(repo_root: Path) -> list[str]:

 def _normalize_block_for_storage(text: str, repo_root: Path) -> str:
    dx_lines = _dx_effect_lines(repo_root)
-    if "<DxEffect" in text and dx_lines:
-        replacement = "\n".join(f"* {line}" for line in dx_lines)
-        text = re.sub(r"<DxEffect\s*/>", replacement, text)
-    text = re.sub(r"<summary[^>]*>(.*?)</summary>", lambda m: f"**{re.sub(r'<[^>]+>', ' ', m.group(1)).strip()}**", text, flags=re.S)
-    text = text.replace("<details>", "").replace("</details>", "")
-    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.I)
-    text = re.sub(r"</?div[^>]*>", "", text)
-    text = re.sub(r":::\s*note\[(.*?)\]", r"**\1**", text)
-    text = text.replace(":::", "")
-    text = re.sub(r"!\[([^\]]+)\]\(([^\)]+)\)", r"[???] \1", text)
-    text = re.sub(r"\n{3,}", "\n\n", text)
+    if '<DxEffect' in text and dx_lines:
+        replacement = '\n'.join(f'* {line}' for line in dx_lines)
+        text = re.sub(r'<DxEffect\s*/>', replacement, text)
+    text = re.sub(r'<summary[^>]*>(.*?)</summary>', lambda m: f"**{re.sub(r'<[^>]+>', ' ', m.group(1)).strip()}**", text, flags=re.S)
+    text = text.replace('<details>', '').replace('</details>', '')
+    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.I)
+    text = re.sub(r'</?div[^>]*>', '', text)
+    text = re.sub(r':::\s*note\[(.*?)\]', r'**\1**', text)
+    text = text.replace(':::', '')
+    text = re.sub(r'!\[([^\]]+)\]\(([^\)]+)\)', r'[image] \1', text)
+    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


-def _first_nonempty_lines(text: str, limit: int = 8) -> list[str]:
-    lines: list[str] = []
-    for raw in text.splitlines():
-        line = raw.strip()
-        if not line:
-            continue
-        if line.startswith("---"):
-            continue
-        lines.append(line)
-        if len(lines) >= limit:
-            break
-    return lines
-
-
-def _extract_detail_topics(block: str, start_id: int, repo_root: Path) -> tuple[list[dict[str, Any]], str, int]:
-    topics: list[dict[str, Any]] = []
-    next_id = start_id
-
-    def repl(match: re.Match[str]) -> str:
-        nonlocal next_id
-        inner = match.group(1)
-        summary_match = re.search(r"<summary[^>]*>(.*?)</summary>", inner, flags=re.S)
-        summary = re.sub(r"<[^>]+>", " ", summary_match.group(1)).strip() if summary_match else "?? ??"
-        detail_body = re.sub(r"<summary[^>]*>.*?</summary>", "", inner, flags=re.S)
-        detail_source = _normalize_block_for_storage(detail_body, repo_root)
-        if detail_source:
-            topics.append({
-                "id": next_id,
-                "title": summary,
-                "purpose": "?? ?? ??",
-                "role": "reference",
-                "layer": "supporting",
-                "source_hint": summary,
-                "summary": _compact(detail_source, _preserve_len(detail_source, floor=220, ceiling=560)),
-                "source_data": detail_source,
-            })
-            next_id += 1
-        return f"\n* **{summary}**\n"
-
-    stripped = re.sub(r"<details>(.*?)</details>", repl, block, flags=re.S)
-    return topics, stripped, next_id
-
-
 def _extract_title_from_intro(block: str) -> str:
-    m = re.search(r"\*\s+\*\*(.+?)\*\*", block)
+    m = re.search(r'\*\s+\*\*(.+?)\*\*', block)
    if m:
        return m.group(1).strip()
-    return "도입"
+    return '서론'


 def _section_chunks(text: str) -> list[tuple[str, str]]:
-    matches = list(re.finditer(r"^##\s+(.+)$", text, flags=re.M))
+    matches = list(re.finditer(r'^##\s+(.+)$', text, flags=re.M))
    chunks: list[tuple[str, str]] = []
    for idx, match in enumerate(matches):
        title = match.group(1).strip()
@@ -149,7 +110,7 @@ def _section_chunks(text: str) -> list[tuple[str, str]]:


 def _subsection_chunks(text: str) -> list[tuple[str, str]]:
-    matches = list(re.finditer(r"^###\s+(.+)$", text, flags=re.M))
+    matches = list(re.finditer(r'^###\s+(.+)$', text, flags=re.M))
    chunks: list[tuple[str, str]] = []
    for idx, match in enumerate(matches):
        title = match.group(1).strip()
@@ -159,94 +120,150 @@ def _subsection_chunks(text: str) -> list[tuple[str, str]]:
    return chunks


-def _classify(title: str, layer_hint: str = "core") -> tuple[str, str, str]:
+def _classify(title: str, layer_hint: str = 'core') -> tuple[str, str, str]:
    clean = title.strip()
-    if "혼용" in clean:
-        return "problem", "flow", "intro"
-    if "정의" in clean:
-        return "definition", "flow", "core"
-    if "상호관계" in clean or "관계" in clean:
-        return "hierarchy", "flow", "core"
-    if "구분" in clean or "비교" in clean:
-        return "comparison", "reference", "supporting"
-    if "사례" in clean:
-        return "evidence", "reference", "supporting"
-    if "궁극적 목표" in clean:
-        return "goal", "flow", "core"
-    if "기대효과" in clean:
-        return "stakeholder_effect", "flow", "core"
-    if "필수 요건" in clean:
-        return "requirements", "flow", "core"
-    if "Process" in clean or "과정" in clean:
-        return "process", "flow", "core"
-    if "Product" in clean or "결과" in clean:
-        return "product", "flow", "core"
-    if "핵심 요약" in clean or "결론" in clean:
-        return "conclusion", "flow", "conclusion"
-    if layer_hint == "supporting":
-        return "support", "reference", "supporting"
-    return "section", "flow", "core"
+    key = _normalize_title_key(clean)
+    if any(token in key for token in ['혼용', '실태', '현실']):
+        return 'problem', 'flow', 'intro'
+    if any(token in key for token in ['정의', '개념', '용어']):
+        return 'definition', 'flow', 'core'
+    if any(token in key for token in ['상호관계', '관계', '위치']):
+        return 'hierarchy', 'flow', 'core'
+    if any(token in key for token in ['구분', '비교']):
+        return 'comparison', 'reference', 'supporting'
+    if any(token in key for token in ['사례', '근거', '대표']):
+        return 'evidence', 'reference', 'supporting'
+    if any(token in key for token in ['궁극적목표', '시행목표', '목표']):
+        return 'goal', 'flow', 'core'
+    if any(token in key for token in ['기대효과', '주체별', '효과']):
+        return 'stakeholder_effect', 'flow', 'core'
+    if any(token in key for token in ['필수요건', '요건']):
+        return 'requirements', 'flow', 'core'
+    if 'process' in key or '과정' in clean:
+        return 'process', 'flow', 'core'
+    if 'product' in key or '결과' in clean:
+        return 'product', 'flow', 'core'
+    if any(token in key for token in ['핵심요약', '요약', '결론']):
+        return 'conclusion', 'flow', 'conclusion'
+    if layer_hint == 'supporting':
+        return 'support', 'reference', 'supporting'
+    return 'section', 'flow', 'core'
+
+
+def _extract_detail_topics(block: str, start_id: int, repo_root: Path) -> tuple[list[dict[str, Any]], str, int]:
+    topics: list[dict[str, Any]] = []
+    next_id = start_id
+
+    def repl(match: re.Match[str]) -> str:
+        nonlocal next_id
+        inner = match.group(1)
+        summary_match = re.search(r'<summary[^>]*>(.*?)</summary>', inner, flags=re.S)
+        summary = re.sub(r'<[^>]+>', ' ', summary_match.group(1)).strip() if summary_match else '상세 내용'
+        detail_body = re.sub(r'<summary[^>]*>.*?</summary>', '', inner, flags=re.S)
+        detail_source = _normalize_block_for_storage(detail_body, repo_root)
+        if detail_source:
+            topics.append({
+                'id': next_id,
+                'title': summary,
+                'purpose': '상세 근거 또는 부연 설명',
+                'role': 'reference',
+                'layer': 'supporting',
+                'relation_type': 'evidence',
+                'source_hint': summary,
+                'summary': _compact(detail_source, _preserve_len(detail_source, floor=220, ceiling=560)),
+                'source_data': detail_source,
+                'structured_text': detail_source,
+                'popup_candidate': True,
+            })
+            next_id += 1
+        return f'\n* **{summary}**\n'
+
+    stripped = re.sub(r'<details>(.*?)</details>', repl, block, flags=re.S)
+    return topics, stripped, next_id


 def _extract_conclusion(text: str, repo_root: Path) -> tuple[str, str]:
-    m = re.search(r":::\s*note\[(.*?)\](.*?):::", text, flags=re.S)
+    m = re.search(r':::\s*note\[(.*?)\](.*?):::', text, flags=re.S)
    if not m:
-        return text, ""
-    note_title = re.sub(r"\s+", " ", m.group(1)).strip() or "\ud575\uc2ec \uc694\uc57d"
+        return text, ''
+    note_title = _normalize_space(m.group(1)) or '핵심 요약'
    note_body = _normalize_block_for_storage(m.group(2), repo_root)
-    note_source = f"**{note_title}**\n{note_body}".strip()
+    note_source = f'**{note_title}**\n{note_body}'.strip()
    stripped = text[: m.start()] + text[m.end() :]
    return stripped.strip(), note_source


-def extract_topics_from_raw(raw: str, repo_root: Path) -> tuple[str, list[dict[str, Any]]]:
-    title_match = re.search(r"^title:\s*(.+)$", raw, flags=re.M)
-    doc_title = title_match.group(1).strip() if title_match else "Document"
+def _content_family(topics: list[dict[str, Any]]) -> str:
+    relation_types = {str(t.get('relation_type', '') or '') for t in topics}
+    if ('comparison' in relation_types or 'definition' in relation_types or 'hierarchy' in relation_types) and 'goal' not in relation_types:
+        return 'type-a-compare-define-relate'
+    if 'goal' in relation_types or 'stakeholder_effect' in relation_types:
+        return 'type-b-goal-effect'
+    if 'requirements' in relation_types or 'product' in relation_types or 'process' in relation_types:
+        return 'type-b-requirements-process-product'
+    return 'type-b-section-stack'
+
+
+def _popup_candidate(topic: dict[str, Any]) -> bool:
+    relation = str(topic.get('relation_type', '') or '')
+    source = _normalize_space(str(topic.get('source_data', '') or ''))
+    return relation in {'comparison', 'evidence'} or len(source) > 520
+
+
+def extract_topics_from_raw(raw: str, repo_root: Path) -> tuple[str, list[dict[str, Any]], str]:
+    title_match = re.search(r'^title:\s*(.+)$', raw, flags=re.M)
+    doc_title = title_match.group(1).strip() if title_match else 'Document'
    clean = _strip_frontmatter_and_imports(raw)
    clean, conclusion_source = _extract_conclusion(clean, repo_root)

    topics: list[dict[str, Any]] = []
    next_id = 1

-    first_section = re.search(r"^##\s+", clean, flags=re.M)
+    first_section = re.search(r'^##\s+', clean, flags=re.M)
    intro_block = clean[: first_section.start()].strip() if first_section else clean.strip()
    if intro_block:
        detail_topics, intro_stripped, _ = _extract_detail_topics(intro_block, next_id + 1, repo_root)
        intro_source = _normalize_block_for_storage(intro_stripped, repo_root)
        if intro_source:
            title = _extract_title_from_intro(intro_source)
-            relation, role, layer = _classify(title, "intro")
+            relation, role, layer = _classify(title, 'intro')
            topics.append({
-                "id": next_id,
-                "title": title,
-                "purpose": "?? ?? ?? ??",
-                "role": role,
-                "layer": layer,
-                "source_hint": title,
-                "summary": _compact(intro_source, _preserve_len(intro_source, floor=260, ceiling=760)),
-                "source_data": intro_source,
+                'id': next_id,
+                'title': title,
+                'purpose': '문서 도입 또는 문제 제기',
+                'role': role,
+                'layer': layer,
+                'relation_type': relation,
+                'source_hint': title,
+                'summary': _compact(intro_source, _preserve_len(intro_source, floor=260, ceiling=760)),
+                'source_data': intro_source,
+                'structured_text': intro_source,
+                'popup_candidate': False,
            })
            next_id += 1
        topics.extend(detail_topics)
-        next_id = max([t["id"] for t in topics], default=0) + 1
+        next_id = max([t['id'] for t in topics], default=0) + 1

    for section_title, section_body in _section_chunks(clean):
        detail_topics, section_stripped, next_id = _extract_detail_topics(section_body, next_id, repo_root)
        subsections = _subsection_chunks(section_stripped)
-        lead = re.split(r"^###\s+.+$", section_stripped, maxsplit=1, flags=re.M)[0].strip() if subsections else section_stripped
+        lead = re.split(r'^###\s+.+$', section_stripped, maxsplit=1, flags=re.M)[0].strip() if subsections else section_stripped
        if lead:
            source = _normalize_block_for_storage(lead, repo_root)
            if source:
                relation, role, layer = _classify(section_title)
                topics.append({
-                    "id": next_id,
-                    "title": section_title,
-                    "purpose": f"{section_title} ?? ??",
-                    "role": role,
-                    "layer": layer,
-                    "source_hint": section_title,
-                    "summary": _compact(source, _preserve_len(source, floor=240, ceiling=780)),
-                    "source_data": source,
+                    'id': next_id,
+                    'title': section_title,
+                    'purpose': f'{section_title}의 핵심 내용',
+                    'role': role,
+                    'layer': layer,
+                    'relation_type': relation,
+                    'source_hint': section_title,
+                    'summary': _compact(source, _preserve_len(source, floor=240, ceiling=780)),
+                    'source_data': source,
+                    'structured_text': source,
+                    'popup_candidate': False,
                })
                next_id += 1
        for sub_title, sub_body in subsections:
@@ -254,135 +271,181 @@ def extract_topics_from_raw(raw: str, repo_root: Path) -> tuple[str, list[dict[s
            if source:
                relation, role, layer = _classify(sub_title)
                topics.append({
-                    "id": next_id,
-                    "title": sub_title,
-                    "purpose": f"{sub_title} ?? ??",
-                    "role": role,
-                    "layer": layer,
-                    "source_hint": sub_title,
-                    "summary": _compact(source, _preserve_len(source, floor=220, ceiling=760)),
-                    "source_data": source,
+                    'id': next_id,
+                    'title': sub_title,
+                    'purpose': f'{sub_title}의 세부 내용',
+                    'role': role,
+                    'layer': layer,
+                    'relation_type': relation,
+                    'source_hint': sub_title,
+                    'summary': _compact(source, _preserve_len(source, floor=220, ceiling=760)),
+                    'source_data': source,
+                    'structured_text': source,
+                    'popup_candidate': False,
                })
                next_id += 1
        topics.extend(detail_topics)
-        next_id = max([t["id"] for t in topics], default=0) + 1
+        next_id = max([t['id'] for t in topics], default=0) + 1

    if conclusion_source:
        topics.append({
-            "id": next_id,
-            "title": "\ud575\uc2ec \uc694\uc57d",
-            "purpose": "?? ?? ??",
-            "role": "flow",
-            "layer": "conclusion",
-            "source_hint": "\ud575\uc2ec \uc694\uc57d",
-            "summary": _compact(conclusion_source, _preserve_len(conclusion_source, floor=140, ceiling=360)),
-            "source_data": conclusion_source,
+            'id': next_id,
+            'title': '핵심 요약',
+            'purpose': '결론 또는 핵심 메시지',
+            'role': 'flow',
+            'layer': 'conclusion',
+            'relation_type': 'conclusion',
+            'source_hint': '핵심 요약',
+            'summary': _compact(conclusion_source, _preserve_len(conclusion_source, floor=140, ceiling=360)),
+            'source_data': conclusion_source,
+            'structured_text': conclusion_source,
+            'popup_candidate': False,
        })

-    return doc_title, topics
+    for topic in topics:
+        topic['popup_candidate'] = _popup_candidate(topic)
+
+    return doc_title, topics, _content_family(topics)


-def _page_structure(topics: list[dict[str, Any]]) -> dict[str, Any]:
-    intro_ids = [t["id"] for t in topics if t["layer"] == "intro"]
-    core_ids = [t["id"] for t in topics if t["layer"] == "core"]
-    support_ids = [t["id"] for t in topics if t["layer"] == "supporting"]
-    conclusion_ids = [t["id"] for t in topics if t["layer"] == "conclusion"]
+def _page_structure(topics: list[dict[str, Any]], family: str) -> dict[str, Any]:
+    intro_ids = [t['id'] for t in topics if t['layer'] == 'intro']
+    core_ids = [t['id'] for t in topics if t['layer'] == 'core']
+    support_ids = [t['id'] for t in topics if t['layer'] == 'supporting']
+    conclusion_ids = [t['id'] for t in topics if t['layer'] == 'conclusion']
    structure: dict[str, Any] = {}
-    if intro_ids:
-        structure["background"] = {"topic_ids": intro_ids, "weight": 0.24}
-    if core_ids:
-        structure["body"] = {"topic_ids": core_ids, "weight": 0.48 if support_ids else 0.58}
-    if support_ids:
-        structure["support"] = {"topic_ids": support_ids, "weight": 0.18}
+    if family == 'type-a-compare-define-relate':
+        if intro_ids:
+            structure['background'] = {'topic_ids': intro_ids, 'weight': 0.22}
+        if core_ids:
+            structure['body'] = {'topic_ids': core_ids, 'weight': 0.50}
+        if support_ids:
+            structure['support'] = {'topic_ids': support_ids, 'weight': 0.18}
+    else:
+        top_ids = intro_ids + core_ids[:1]
+        body_ids = core_ids[1:] if len(core_ids) > 1 else core_ids[:1]
+        support_main = support_ids[:]
+        if top_ids:
+            structure['body'] = {'topic_ids': top_ids + body_ids, 'weight': 0.58 if support_main else 0.64}
+        if support_main:
+            structure['support'] = {'topic_ids': support_main, 'weight': 0.18}
    if conclusion_ids:
-        structure["key_message"] = {"topic_ids": conclusion_ids, "weight": 0.10}
+        structure['key_message'] = {'topic_ids': conclusion_ids, 'weight': 0.10}
    return structure


 def rebuild_run_from_raw(repo_root: Path, run_dir: Path, input_file: Path) -> dict[str, Any]:
    raw = _read_text(input_file)
-    doc_title, topics = extract_topics_from_raw(raw, repo_root)
-    core_topic = next((t for t in topics if t["layer"] == "conclusion"), topics[-1] if topics else {"source_data": ""})
+    doc_title, topics, family = extract_topics_from_raw(raw, repo_root)
+    core_topic = next((t for t in topics if t['layer'] == 'conclusion'), topics[-1] if topics else {'source_data': ''})
    stage1a = {
-        "analysis": {
-            "title": doc_title,
-            "core_message": re.sub(r"\s+", " ", str(core_topic.get("source_data", ""))).strip(),
-            "total_pages": 1,
+        'analysis': {
+            'title': doc_title,
+            'core_message': _normalize_space(str(core_topic.get('source_data', ''))),
+            'total_pages': 1,
+            'layout_template': ('A' if family == 'type-a-compare-define-relate' else ('B_GOAL' if family == 'type-b-goal-effect' else ('B_RPP' if family == 'type-b-requirements-process-product' else 'B_STACK'))),
+            'content_family': family,
        },
-        "page_structure": _page_structure(topics),
-        "topics": topics,
+        'page_structure': _page_structure(topics, family),
+        'topics': topics,
    }
    stage1b = {
-        "concepts": [
+        'concepts': [
            {
-                "topic_id": t["id"],
-                "relation_type": _classify(t["title"], t["layer"])[0],
-                "expression_hint": "?? ??? ??? ???. ??? ? ?? ??? ??? popup?? ???. visible ??? ?? ???? 85% ??? ?? ???.",
-                "summary": t["summary"],
+                'topic_id': t['id'],
+                'relation_type': t['relation_type'],
+                'expression_hint': (
+                    '원문 제목과 원문 bullet을 우선 유지한다. 긴 세부 설명이나 큰 표는 popup으로 이동하되, 본문에는 핵심 bullet과 진입 요약을 남긴다.'
+                    if t.get('popup_candidate') else
+                    '원문 제목과 원문 bullet을 visible block으로 유지하고, 임의 재서술을 최소화한다.'
+                ),
+                'summary': t['summary'],
            }
            for t in topics
        ]
    }

-    plan_dir = run_dir / "04-plan"
-    plan_dir.mkdir(parents=True, exist_ok=True)
-    _write_json(plan_dir / "stage-1a-topics.json", stage1a)
-    _write_json(plan_dir / "stage-1b-refined-concepts.json", stage1b)
+    input_dir = run_dir / '01-input'
+    interp_dir = run_dir / '02-kei-interpretation'
+    structure_dir = run_dir / '03-structure'
+    plan_dir = run_dir / '04-plan'
+    for d in (input_dir, interp_dir, structure_dir, plan_dir):
+        d.mkdir(parents=True, exist_ok=True)
+
+    _write_json(plan_dir / 'stage-1a-topics.json', stage1a)
+    _write_json(plan_dir / 'stage-1b-refined-concepts.json', stage1b)
+    _write_json(structure_dir / 'source-blocks.json', {
+        'title': doc_title,
+        'content_family': family,
+        'blocks': [
+            {
+                'id': t['id'],
+                'title': t['title'],
+                'layer': t['layer'],
+                'relation_type': t['relation_type'],
+                'popup_candidate': bool(t.get('popup_candidate')),
+                'source_data': t['source_data'],
+            }
+            for t in topics
+        ],
+    })

-    input_dir = run_dir / "01-input"
-    input_dir.mkdir(parents=True, exist_ok=True)
    input_lines = [
-        "# Input Review",
-        "",
-        f"- ?? ???: {input_file.name}",
-        f"- ?? ??: {doc_title}",
-        "- ?? ?? ??: ?? block? ???? ?? ???? ???.",
-        "- ?? ??: ???? ?? 85% ?? ????, ? ?/?? ??? popup ??? ???.",
-        "",
-        "## ?? ??",
+        '# Input Review',
+        '',
+        f'- 입력 파일: {input_file.name}',
+        f'- 문서 제목: {doc_title}',
+        f'- content family 후보: {family}',
+        '- 우선 목표: 원문 block과 원문 순서를 최대한 보존한다.',
+        '- popup 전략: 큰 표, 긴 사례, 긴 근거는 popup 후보로 분리하고 본문에는 제목과 핵심 bullet을 남긴다.',
+        '',
+        '## 원문 블록 식별',
    ]
    for topic in topics:
-        input_lines.append(f"- {topic['title']}: { _compact(re.sub(r'\s+', ' ', topic['source_data']), 160) }")
-    _write_text(input_dir / "input-review.md", "\n".join(input_lines) + "\n")
+        popup_mark = ' [popup]' if topic.get('popup_candidate') else ''
+        input_lines.append(f"- {topic['title']} ({topic['relation_type']}/{topic['layer']}){popup_mark}: {_compact(_normalize_space(topic['source_data']), 180)}")
+    _write_text(input_dir / 'input-review.md', '\n'.join(input_lines) + '\n')

-    interp_dir = run_dir / "02-kei-interpretation"
-    interp_dir.mkdir(parents=True, exist_ok=True)
    interp_lines = [
-        "# Interpretation",
-        "",
-        "- ?? ??: ????? ?? ??? ???.",
-        "- ?? ??: ?? ??? ????, ??/??/popup ???? ???.",
-        "- popup ??: ? ?, ?? ??, ? ??? ??? popup?? ?? ???.",
-        "",
-        "## Topic Classification",
+        '# Interpretation',
+        '',
+        f'- content family: {family}',
+        '- 해석 원칙: 원문 제목/순서/표현을 우선 보존하고, 임의 재서술은 최소화한다.',
+        '- grouping 원칙: 관계가 같은 block만 묶고, 내용이 길다고 해서 본문에서 제거하지 않는다.',
+        '- popup 원칙: 상세는 popup으로 보내되 본문에는 핵심 bullet과 진입 문장을 남긴다.',
+        '',
+        '## Topic Classification',
    ]
    for topic in topics:
-        interp_lines.append(f"- {topic['title']}: layer={topic['layer']} / role={topic['role']}")
-    _write_text(interp_dir / "kei-interpretation.md", "\n".join(interp_lines) + "\n")
+        interp_lines.append(
+            f"- {topic['title']}: relation={topic['relation_type']} / layer={topic['layer']} / popup_candidate={str(bool(topic.get('popup_candidate'))).lower()}"
+        )
+    _write_text(interp_dir / 'kei-interpretation.md', '\n'.join(interp_lines) + '\n')

-    structure_dir = run_dir / "03-structure"
-    structure_dir.mkdir(parents=True, exist_ok=True)
    structure_lines = [
-        "# Content Structure",
-        "",
-        "- ??? ??: ?? ?? ??? ???.",
-        "- ??? ??: ?? ? ???? ????, ?? ???? ????.",
-        "- popup ??: ??? ? ?? ??? ? ?/? ??? popup?? ???.",
-        "",
-        "## Ordered Blocks",
+        '# Content Structure',
+        '',
+        f'- content family: {family}',
+        '- visible block 원칙: 각 섹션 제목과 핵심 bullet은 본문에 남긴다.',
+        '- popup block 원칙: 큰 표, 긴 사례, 긴 상세 설명만 popup으로 보낸다.',
+        '- 결론 원칙: note/결론 문장은 footer 또는 결론 배너에 직접 노출한다.',
+        '',
+        '## Ordered Blocks',
    ]
    for idx, topic in enumerate(topics, start=1):
-        structure_lines.append(f"{idx}. {topic['title']} ({topic['layer']})")
-    _write_text(structure_dir / "content-structure.md", "\n".join(structure_lines) + "\n")
+        popup_mark = ' popup' if topic.get('popup_candidate') else ' visible'
+        structure_lines.append(f"{idx}. {topic['title']} ({topic['relation_type']} / {topic['layer']} /{popup_mark})")
+    _write_text(structure_dir / 'content-structure.md', '\n'.join(structure_lines) + '\n')

    plan_lines = [
-        "# Execution Plan",
-        "",
-        "- ??? raw mdx?? ?? ???? stage-1a/stage-1b? ???.",
-        "- ?? ??? ??? ???.",
-        "- ?? ??, ? ?, ??? ?? ??? popup?? ?? ???.",
-        "- visible ??? section title + ?? bullet + ?? ?? ???? ???.",
+        '# Execution Plan',
+        '',
+        f'- content family: {family}',
+        '- stage-1a/stage-1b는 raw MDX 기반 block 추출 결과를 그대로 사용한다.',
+        '- Type A는 비교/정의/관계형으로, Type B는 본문 중심형으로 렌더한다.',
+        '- popup 후보 block은 삭제하지 않고 popup overlay로 이동한다.',
+        '- visible 영역에는 섹션 제목과 핵심 bullet을 남겨 원문 85% 보존 목표를 유지한다.',
    ]
-    _write_text(plan_dir / "execution-plan.md", "\n".join(plan_lines) + "\n")
+    _write_text(plan_dir / 'execution-plan.md', '\n'.join(plan_lines) + '\n')

-    return {"title": doc_title, "topics": topics}
+    return {'title': doc_title, 'topics': topics, 'content_family': family}