03번 MDX sections 매핑 수정: 상단 level=2 합침 + 하단 대목차 정확히 찾기

- _assemble_type_b: 상단에 해당하는 모든 level=2 section을 합침 (03번처럼 기술/사람/자연이 별도 section으로 분리된 경우 대응)
- 하단 대목차: level=3 바로 앞의 level=2 section으로 정확히 찾기
- 03번 결과: 상단 카드(기술/사람/자연) + 하단(과정혁신/결과변화) 정상
- 02번 영향 없음

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 13:33:05 +09:00
parent 42d60e44a5
commit 4f0105926d
2 changed files with 46 additions and 14 deletions
--- a/src/mdx_normalizer.py
+++ b/src/mdx_normalizer.py
@@ -145,10 +145,10 @@ def _process_mdx_patterns(text: str) -> tuple[str, list[dict]]:
        # 팝업 content 정화: JSX style 제거 + 마크다운 → HTML
        content = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", content)
        content = content.replace("</div>", "")
+        # 마크다운 테이블 → HTML 테이블 (br 치환보다 먼저 — 셀 내 <br/>로 행이 쪼개지는 것 방지)
+        content = _convert_md_table_to_html(content)
        content = re.sub(r"<br\s*/?>", "\n", content)
        content = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", content)
-        # 마크다운 테이블 → HTML 테이블
-        content = _convert_md_table_to_html(content)
        popups.append({"title": title, "content": content})
        return f"[팝업: {title}]"

@@ -288,11 +288,15 @@ def _extract_structure(text: str) -> dict[str, Any]:

        # 섹션 추출 (## 및 ### 기준 — 대목차/소목차 모두)
        if token.type == "heading_open" and token.tag in ("h2", "h3"):
-            _flush_section()
-            # 다음 토큰이 inline (제목 텍스트)
+            # 다음 토큰이 inline (제목 텍스트) — 무의미한 제목(<br/> 등)은 건너뜀
            if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
-                current_section_title = tokens[i + 1].content
-                current_section_level = 2 if token.tag == "h2" else 3
+                heading_text = tokens[i + 1].content.strip()
+                # <br/>, 빈 문자열, 숫자만 등은 section 제목으로 부적합
+                clean_heading = re.sub(r'<br\s*/?>', '', heading_text).strip()
+                if clean_heading and len(clean_heading) > 1:
+                    _flush_section()
+                    current_section_title = clean_heading
+                    current_section_level = 2 if token.tag == "h2" else 3
        elif current_section_title and token.type in ("paragraph_open", "bullet_list_open",
                                                       "ordered_list_open", "fence"):
            # 섹션 내용 수집 — inline 토큰의 content만