03번 MDX sections 매핑 수정: 상단 level=2 합침 + 하단 대목차 정확히 찾기

- _assemble_type_b: 상단에 해당하는 모든 level=2 section을 합침 (03번처럼 기술/사람/자연이 별도 section으로 분리된 경우 대응)
- 하단 대목차: level=3 바로 앞의 level=2 section으로 정확히 찾기
- 03번 결과: 상단 카드(기술/사람/자연) + 하단(과정혁신/결과변화) 정상
- 02번 영향 없음

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 13:33:05 +09:00
parent 42d60e44a5
commit 4f0105926d
2 changed files with 46 additions and 14 deletions
--- a/scripts/assemble_stage2.py
+++ b/scripts/assemble_stage2.py
@@ -686,10 +686,23 @@ def _assemble_type_b(run: Path, ctx: dict):
    if top_role:
        rn, info = top_role
        tids = info.get("topic_ids", [])
-        # MDX 원본 sections에서 직접 가져오기 (Kei structured_text 대신)
-        top_section = norm_sections[0] if norm_sections else {}
-        all_text = top_section.get("content", "")
-        topic_title_from_section = top_section.get("title", "")
+        # MDX 원본 sections에서 직접 가져오기
+        # 상단: 첫 번째 level=2 section + 하단 대목차 전까지의 level=2 sections를 합침
+        # (03번처럼 기술/사람/자연이 별도 section으로 분리된 경우 대응)
+        topic_title_from_section = ""
+        top_contents = []
+        for s in norm_sections:
+            if s["level"] == 3:
+                break  # level=3(소목차) 나오면 상단 끝
+            if not topic_title_from_section and s.get("title"):
+                topic_title_from_section = s["title"]
+            content = s.get("content", "")
+            if content:
+                # section title도 소제목으로 포함 (첫 번째 제외)
+                if s["title"] and s["title"] != topic_title_from_section:
+                    top_contents.append(f"### {s['title']}")
+                top_contents.append(content)
+        all_text = "\n".join(top_contents)
        # 마크다운 bold → HTML
        all_text_clean = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', all_text)

@@ -814,13 +827,28 @@ def _assemble_type_b(run: Path, ctx: dict):
    # sections 구조: [level=2 상단, level=2 하단대목차, level=3 하단좌, level=3 하단우, ...]
    # 하단 대목차 = level=2 두 번째
    # 하단 소목차들 = level=3
+    # 하단: level=3이 존재하는 구역의 level=2가 대목차
    bottom_title = ""
    sub_sections_from_norm = []  # [(제목, content)]
-    for s in norm_sections[1:]:  # 상단 제외
-        if s["level"] == 2:
-            bottom_title = s.get("title", "")
-        elif s["level"] == 3:
+    found_level3 = False
+    for s in norm_sections:
+        if s["level"] == 3:
+            found_level3 = True
            sub_sections_from_norm.append((s.get("title", ""), s.get("content", "")))
+        elif s["level"] == 2 and not found_level3:
+            # level=3 전의 level=2는 상단에 속함 → 건너뜀
+            continue
+        elif s["level"] == 2 and found_level3:
+            # level=3 이후의 level=2 → 하단 대목차 후보 (이미 잡혔으면 무시)
+            pass
+    # 하단 대목차: level=3 바로 앞의 level=2
+    for s in norm_sections:
+        if s["level"] == 2:
+            # 이 section 다음에 level=3이 오면 이게 대목차
+            idx = norm_sections.index(s)
+            if idx + 1 < len(norm_sections) and norm_sections[idx + 1]["level"] == 3:
+                bottom_title = s.get("title", "")
+                break

    bl_indent = int(font_size * 1.2)

--- a/src/mdx_normalizer.py
+++ b/src/mdx_normalizer.py
@@ -145,10 +145,10 @@ def _process_mdx_patterns(text: str) -> tuple[str, list[dict]]:
        # 팝업 content 정화: JSX style 제거 + 마크다운 → HTML
        content = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", content)
        content = content.replace("</div>", "")
+        # 마크다운 테이블 → HTML 테이블 (br 치환보다 먼저 — 셀 내 <br/>로 행이 쪼개지는 것 방지)
+        content = _convert_md_table_to_html(content)
        content = re.sub(r"<br\s*/?>", "\n", content)
        content = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", content)
-        # 마크다운 테이블 → HTML 테이블
-        content = _convert_md_table_to_html(content)
        popups.append({"title": title, "content": content})
        return f"[팝업: {title}]"

@@ -288,10 +288,14 @@ def _extract_structure(text: str) -> dict[str, Any]:

        # 섹션 추출 (## 및 ### 기준 — 대목차/소목차 모두)
        if token.type == "heading_open" and token.tag in ("h2", "h3"):
-            _flush_section()
-            # 다음 토큰이 inline (제목 텍스트)
+            # 다음 토큰이 inline (제목 텍스트) — 무의미한 제목(<br/> 등)은 건너뜀
            if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
-                current_section_title = tokens[i + 1].content
+                heading_text = tokens[i + 1].content.strip()
+                # <br/>, 빈 문자열, 숫자만 등은 section 제목으로 부적합
+                clean_heading = re.sub(r'<br\s*/?>', '', heading_text).strip()
+                if clean_heading and len(clean_heading) > 1:
+                    _flush_section()
+                    current_section_title = clean_heading
                    current_section_level = 2 if token.tag == "h2" else 3
        elif current_section_title and token.type in ("paragraph_open", "bullet_list_open",
                                                       "ordered_list_open", "fence"):