Phase W + V' 완료: before→filled→after 파이프라인 + 조립 로직 수정

Phase W:
- weight 비율 초기 배정 (space_allocator header 높이 반영)
- block_assembler 공통 조립 함수 (filled/assembled 통합)
- filled → Selenium 측정 → context 저장
- sidebar overflow 확장 + body 재배분
- sub_layouts 사전 계산 (이미지 누락 해결)

Phase V':
- 팝업 링크 우측상단 배치 (인라인 → position:absolute)
- 표 내용 Kei 판단 (공란 크기 계산 → 행/열 산출 → Kei 요약)
- 출처 라벨 삭제 + 이미지 아래 캡션 배치
- after 공란 제거 (결론 바로 위까지 body/sidebar 채움)

추가:
- V-10 bold 키워드: 기계적 추출 → Kei 문맥 판단
- ** 마크다운 → <strong> 변환
- [이미지:] 마커 제거 (bold 변환 전 처리)
- grid-template-rows AFTER 크기 반영 (Sonnet final)
- assemble_stage2 CSS font-size override, white-space fix
- 하드코딩 전수 검토 완료
- 본심 여러 topic 텍스트 합침

Phase X 계획 문서 작성 (동적 역할 구조)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 05:00:52 +09:00
parent 24eb1bc5ad
commit 1f7579cf64
64 changed files with 13955 additions and 696 deletions
--- a/src/mdx_normalizer.py
+++ b/src/mdx_normalizer.py
@@ -0,0 +1,434 @@
+"""Phase T-1: MDX 4-Layer 파서.
+
+Stage 0에서 호출. 원본 MDX를 정규화하여 이후 모든 Stage에 깨끗한 입력 제공.
+
+Layer 1: python-frontmatter — YAML frontmatter 분리, title 추출
+Layer 2: regex — 코드블록 보호 + MDX 전용 패턴 (details, :::, JSX, import)
+Layer 3: markdown-it-py — AST 파싱 → 이미지/표/헤딩 구조 추출
+Layer 4: regex — 텍스트 정리, 빈 줄 정리, clean_text
+
+조사 결과 (T-1):
+- python-frontmatter: parse() → (dict, str). frontmatter 없으면 안전하게 {}
+- markdown-it-py: js-default 프리셋에 table 기본 포함. 한국어 정상
+- 코드블록 보호: backtick 10→3 순서 매칭. 중첩/inline 검증됨
+"""
+from __future__ import annotations
+
+import re
+import logging
+from typing import Any
+
+import frontmatter
+from markdown_it import MarkdownIt
+
+logger = logging.getLogger(__name__)
+
+
+# ══════════════════════════════════════
+# 코드블록 보호 (Layer 2 선행)
+# ══════════════════════════════════════
+
+class _CodeBlockProtector:
+    """Swap code spans for placeholders and put them back afterwards.
+
+    Fenced blocks are matched from the longest fence (10 backticks) down to
+    the shortest (3), so a shorter fence nested inside a longer one is
+    captured as part of the outer block. Inline code spans are handled last.
+    """
+
+    def __init__(self):
+        # placeholder key -> original source text it replaced
+        self._store: dict[str, str] = {}
+        # number of placeholders issued so far (also used to build the keys)
+        self._counter = 0
+
+    def _make_key(self) -> str:
+        """Return a fresh, unique placeholder token."""
+        self._counter += 1
+        return f"__CODEBLOCK_{self._counter}__"
+
+    def _stash(self, m: "re.Match[str]") -> str:
+        """Remember the matched text and return the placeholder replacing it."""
+        key = self._make_key()
+        self._store[key] = m.group(0)
+        return key
+
+    def protect(self, text: str) -> str:
+        """Return *text* with fenced and inline code replaced by placeholders.
+
+        Args:
+            text: Source text that may contain fenced or inline code.
+
+        Returns:
+            The text with every matched code span replaced by a
+            ``__CODEBLOCK_<n>__`` token.
+        """
+        # fenced code blocks, longest fence first
+        for n in range(10, 2, -1):
+            pattern = rf"^(`{{{n}}})([^\n]*)\n(.*?)\n\1\s*$"
+            text = re.sub(pattern, self._stash, text, flags=re.MULTILINE | re.DOTALL)
+
+        # inline code (single line, single-backtick delimited)
+        return re.sub(r"`[^`\n]+`", self._stash, text)
+
+    def restore(self, text: str) -> str:
+        """Return *text* with every placeholder replaced by its original code.
+
+        Placeholders are restored newest-first. A stored block can itself
+        contain an older placeholder (e.g. a 4-backtick fence sitting inside a
+        3-backtick fence), and that inner placeholder only becomes visible in
+        *text* once the outer block has been put back.
+        """
+        for key, original in reversed(list(self._store.items())):
+            text = text.replace(key, original)
+        return text
+
+
+# ══════════════════════════════════════
+# Layer 2: MDX 전용 패턴 처리
+# ══════════════════════════════════════
+
+def _convert_md_table_to_html(text: str) -> str:
+    """Replace every run of pipe-delimited lines in *text* with an HTML table.
+
+    A line belongs to a table when, after stripping surrounding whitespace, it
+    both starts and ends with ``|``. Consecutive table lines are rendered
+    together; all other lines are passed through untouched.
+    """
+    output = []
+    pending = []  # stripped table lines collected since the last non-table line
+
+    def _flush():
+        # Emit the buffered table (if any) as one rendered chunk.
+        if pending:
+            output.append(_render_md_table(list(pending)))
+            pending.clear()
+
+    for raw_line in text.split("\n"):
+        candidate = raw_line.strip()
+        if candidate.startswith("|") and candidate.endswith("|"):
+            pending.append(candidate)
+            continue
+        _flush()
+        output.append(raw_line)
+
+    # a table may run to the very end of the text
+    _flush()
+
+    return "\n".join(output)
+
+
+def _render_md_table(table_lines: list[str]) -> str:
+    """Render pipe-delimited Markdown table lines as an HTML ``<table>``.
+
+    Args:
+        table_lines: Consecutive table lines; the first one is the header row.
+
+    Returns:
+        The HTML table. With fewer than two lines the input is returned
+        unchanged, joined by newlines. Cell text is inserted verbatim (it is
+        not HTML-escaped), so inline markup already present in a cell survives.
+    """
+    if len(table_lines) < 2:
+        return "\n".join(table_lines)
+
+    def _parse_row(line: str) -> list[str]:
+        cells = [c.strip() for c in line.split("|")]
+        # "| a | b |" splits to ['', 'a', 'b', '']: drop the two edge cells
+        return cells[1:-1] if cells[0] == "" else cells
+
+    def _is_delimiter_row(line: str) -> bool:
+        # A delimiter row such as |---|:-:| has at least one non-blank cell and
+        # every non-blank cell is made of dashes/colons with at least one dash.
+        # Requiring a non-blank cell keeps an all-blank data row from being
+        # mistaken for the delimiter and dropped.
+        marks = [c.strip() for c in line.split("|") if c.strip()]
+        return bool(marks) and all(
+            set(mark) <= {"-", ":"} and "-" in mark for mark in marks
+        )
+
+    headers = _parse_row(table_lines[0])
+
+    # skip the delimiter row when present
+    data_start = 2 if _is_delimiter_row(table_lines[1]) else 1
+    rows = [_parse_row(line) for line in table_lines[data_start:]]
+
+    header_html = "".join(f"<th>{h}</th>" for h in headers)
+    rows_html = "".join(
+        "<tr>" + "".join(f"<td>{c}</td>" for c in row) + "</tr>\n"
+        for row in rows
+    )
+
+    return f"<table><thead><tr>{header_html}</tr></thead><tbody>{rows_html}</tbody></table>"
+
+
+def _process_mdx_patterns(text: str) -> tuple[str, list[dict]]:
+    """Strip or rewrite MDX-only constructs and pull out ``<details>`` popups.
+
+    The steps run in a fixed order: popup extraction, ``import``/``export``
+    line removal, ``<br>`` and JSX ``<div style={{...}}>`` removal, custom
+    component removal, ``:::directive[title]`` conversion, heading-number
+    removal, promotion of ``* **title**`` bullets that appear before the first
+    ``##`` heading, relabelling of stand-alone italic lines as sources, and
+    removal of ``---`` rules.
+
+    Args:
+        text: MDX body text.
+
+    Returns:
+        ``(processed_text, popups)`` where each popup is
+        ``{"title": str, "content": str}`` and its place in the text is taken
+        by a ``[팝업: <title>]`` marker.
+    """
+    popups = []
+
+    # <details><summary>title</summary>content</details> -> popup entry
+    def _extract_popup(m):
+        title = m.group(1).strip()
+        content = m.group(2).strip()
+        # sanitize popup content: drop JSX-styled divs, convert markdown to HTML
+        content = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", content)
+        content = content.replace("</div>", "")
+        content = re.sub(r"<br\s*/?>", "\n", content)
+        content = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", content)
+        # markdown table -> HTML table
+        content = _convert_md_table_to_html(content)
+        popups.append({"title": title, "content": content})
+        return f"[팝업: {title}]"
+
+    text = re.sub(
+        r"<details>\s*<summary[^>]*>(.+?)</summary>(.*?)</details>",
+        _extract_popup,
+        text,
+        flags=re.DOTALL,
+    )
+
+    # remove import / export statements
+    text = re.sub(r"^import\s+.+$", "", text, flags=re.MULTILINE)
+    text = re.sub(r"^export\s+.+$", "", text, flags=re.MULTILINE)
+
+    # remove <br/>
+    text = re.sub(r"<br\s*/?>", "", text)
+
+    # JSX div style: remove the tags only, keep the inner content
+    text = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", text)
+    text = text.replace("</div>", "")
+
+    # custom components (<Component />, <Component>...</Component>)
+    # Self-closing tags are removed first, attributes included. Otherwise
+    # `<Badge text="x" />` would be taken as an opening tag by the paired
+    # pattern below, which would then delete everything up to the next
+    # unrelated closing component tag.
+    text = re.sub(r"<[A-Z]\w+(?:\s[^<>]*)?/>", "", text)
+    text = re.sub(r"<[A-Z]\w+[^>]*>.*?</[A-Z]\w+>", "", text, flags=re.DOTALL)
+
+    # :::directive[title] -> key-summary marker for admonitions, else a ## heading
+    def _process_directive(m):
+        directive = m.group(1)
+        title = m.group(2)
+        if directive in ("note", "tip", "caution", "danger"):
+            return f"[핵심요약: {title}]"
+        return f"## {title}"
+
+    text = re.sub(r":::(\w+)\[(.+?)\]", _process_directive, text)
+    text = re.sub(r"^:::\s*$", "", text, flags=re.MULTILINE)
+
+    # "## N. title" -> "## title" (at least one space after the dot is required)
+    text = re.sub(r"^## \d+\.\s+", "## ", text, flags=re.MULTILINE)
+
+    # "### N.N title" -> "### title"
+    text = re.sub(r"^### \d+\.\d+\s+", "### ", text, flags=re.MULTILINE)
+
+    # "* **title**" -> "## title", only in the intro before the first ## heading
+    first_hash = text.find("\n## ")
+    if first_hash == -1:
+        first_hash = len(text)
+    intro = text[:first_hash]
+    rest = text[first_hash:]
+    intro = re.sub(r"^\* \*\*(.+?)\*\*\s*$", r"## \1", intro, flags=re.MULTILINE)
+    text = intro + rest
+
+    # a line consisting only of *italic text* is labelled as a source
+    text = re.sub(r"^\s*\*([^*\n]+)\*\s*$", r"출처: \1", text, flags=re.MULTILINE)
+
+    # remove decorative --- rules
+    text = re.sub(r"^---\s*$", "", text, flags=re.MULTILINE)
+
+    return text, popups
+
+
+# ══════════════════════════════════════
+# Layer 3: AST 파싱
+# ══════════════════════════════════════
+
+def _extract_structure(text: str) -> dict[str, Any]:
+    """Parse *text* with markdown-it-py and collect images, tables and sections.
+
+    Sections are opened by ``h2`` headings only. A section's content is the
+    newline-joined ``content`` of every inline token seen while the section is
+    open, except inline tokens that directly follow a ``heading_open`` token
+    (heading titles). Inline tokens before the first ``h2`` are not collected.
+
+    Returns:
+        {"images": [...], "tables": [...], "sections": [...]}
+    """
+    tokens = MarkdownIt("js-default").parse(text)
+
+    images = []
+    tables = []
+    sections = []
+
+    heading = ""
+    body_lines = []
+
+    def _close_section():
+        # Store the open section, if there is one, and start a new line buffer.
+        nonlocal body_lines
+        if not heading:
+            return
+        sections.append({
+            "level": 2,
+            "title": heading,
+            "content": "\n".join(body_lines).strip(),
+        })
+        body_lines = []
+
+    def _image_entry(node):
+        # Build the image record from an inline "image" child token.
+        attrs = node.attrs or {}
+        return {
+            "alt": node.content or attrs.get("alt", ""),
+            "path": attrs.get("src", ""),
+        }
+
+    def _read_table(start):
+        # Walk the tokens after a table_open up to the matching table_close.
+        parsed = {"headers": [], "rows": []}
+        inside_head = False
+        inside_body = False
+        cells = []
+        for tok in tokens[start:]:
+            kind = tok.type
+            if kind == "table_close":
+                break
+            if kind == "thead_open":
+                inside_head = True
+            elif kind == "thead_close":
+                inside_head = False
+                if cells:
+                    parsed["headers"] = cells
+                    cells = []
+            elif kind == "tbody_open":
+                inside_body = True
+            elif kind == "tbody_close":
+                inside_body = False
+            elif kind == "tr_close":
+                if inside_body and cells:
+                    parsed["rows"].append(cells)
+                elif inside_head and cells:
+                    parsed["headers"] = cells
+                cells = []
+            elif kind == "inline" and (inside_head or inside_body):
+                cells.append(tok.content)
+        return parsed
+
+    for idx, tok in enumerate(tokens):
+        is_inline = tok.type == "inline"
+
+        # images live among the children of inline tokens
+        if is_inline and tok.children:
+            images.extend(
+                _image_entry(child) for child in tok.children if child.type == "image"
+            )
+
+        # tables
+        if tok.type == "table_open":
+            parsed = _read_table(idx + 1)
+            if parsed["headers"] or parsed["rows"]:
+                tables.append(parsed)
+
+        # an h2 heading closes the previous section and opens a new one
+        if tok.type == "heading_open" and tok.tag == "h2":
+            _close_section()
+            following = tokens[idx + 1] if idx + 1 < len(tokens) else None
+            if following is not None and following.type == "inline":
+                heading = following.content
+
+        # section body: inline content that is not a heading title
+        if heading and is_inline and tok.tag == "":
+            previous_kind = tokens[idx - 1].type if idx > 0 else ""
+            if previous_kind != "heading_open":
+                body_lines.append(tok.content)
+
+    _close_section()
+
+    return {"images": images, "tables": tables, "sections": sections}
+
+
+# ══════════════════════════════════════
+# Layer 4: 텍스트 정리
+# ══════════════════════════════════════
+
+def _clean_text(text: str) -> str:
+    """Final text cleanup: image markers, leftover HTML removal, blank lines.
+
+    Args:
+        text: Text after MDX pattern processing.
+
+    Returns:
+        The text with Markdown images replaced by ``[이미지: <alt>]`` markers,
+        HTML comments and tags removed, runs of three or more newlines
+        collapsed to two, and surrounding whitespace stripped.
+    """
+    # keep a trace of image references (markdown syntax -> marker)
+    text = re.sub(r"!\[(.+?)\]\((.+?)\)", r"[이미지: \1]", text)
+
+    # remove HTML comments as a whole
+    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
+
+    # Remove leftover HTML/JSX tags. The match must start like a tag
+    # ("<" or "</" followed by a letter) and may not span another "<" or ">",
+    # so comparisons in prose such as "x < 5 ... y > 3" are left intact.
+    text = re.sub(r"</?[A-Za-z][^<>]*>", "", text)
+
+    # collapse runs of blank lines
+    text = re.sub(r"\n{3,}", "\n\n", text)
+
+    return text.strip()
+
+
+# ══════════════════════════════════════
+# 메인 함수
+# ══════════════════════════════════════
+
+def normalize_mdx_content(raw_mdx: str) -> dict[str, Any]:
+    """Normalize raw MDX through the four parser layers.
+
+    Layer 1 splits the frontmatter, Layer 2 protects code blocks and processes
+    MDX-only patterns, Layer 3 extracts images/tables/sections from the
+    Markdown AST, and Layer 4 produces the cleaned text.
+
+    Args:
+        raw_mdx: Complete MDX source, optionally starting with YAML frontmatter.
+
+    Returns:
+        {
+            "clean_text": str,
+            "title": str,
+            "images": [{"alt": str, "path": str}],
+            "popups": [{"title": str, "content": str}],
+            "tables": [{"headers": list, "rows": list}],
+            "sections": [{"level": int, "title": str, "content": str}],
+        }
+    """
+    # ── Layer 1: split frontmatter ──
+    metadata, body = frontmatter.parse(raw_mdx)
+    # YAML may give a non-string title (null, number, date); callers are
+    # promised a str, so coerce here and map a missing/null title to "".
+    raw_title = metadata.get("title")
+    title = "" if raw_title is None else str(raw_title)
+    logger.info(f"[Layer 1] title='{title}', metadata keys={list(metadata.keys())}")
+
+    # ── Layer 2: protect code blocks → process MDX patterns → restore ──
+    protector = _CodeBlockProtector()
+    protected = protector.protect(body)
+    processed, popups = _process_mdx_patterns(protected)
+    restored = protector.restore(processed)
+    logger.info(f"[Layer 2] popups={len(popups)}개, 코드블록={protector._counter}개 보호/복원")
+
+    # ── Layer 3: AST parsing → structure extraction ──
+    structure = _extract_structure(restored)
+    images = structure["images"]
+    tables = structure["tables"]
+    sections = structure["sections"]
+    logger.info(f"[Layer 3] images={len(images)}, tables={len(tables)}, sections={len(sections)}")
+
+    # ── Layer 4: text cleanup ──
+    clean_text = _clean_text(restored)
+    logger.info(f"[Layer 4] clean_text={len(clean_text)}자")
+
+    return {
+        "clean_text": clean_text,
+        "title": title,
+        "images": images,
+        "popups": popups,
+        "tables": tables,
+        "sections": sections,
+    }
+
+
+# ══════════════════════════════════════
+# Stage 0 검증
+# ══════════════════════════════════════
+
+def validate_stage0(result: dict, raw_mdx: str) -> list[dict]:
+    """Sanity-check the Stage 0 result against the raw MDX input.
+
+    Checks, in order: non-empty ``clean_text`` (the only check reported when it
+    fails), text preservation of at least 30% relative to the raw text with
+    tags, ``{...}`` expressions and ``---`` delimited blocks removed, and that
+    images / popups were extracted when the raw text contains any.
+
+    Returns:
+        A list of error dicts; an empty list means the result passed.
+    """
+    cleaned = result.get("clean_text", "")
+    if not cleaned.strip():
+        # nothing to compare against, so report this alone
+        return [{
+            "severity": "FATAL",
+            "field": "clean_text",
+            "localization": "clean_text가 비어있음",
+            "instruction": "원본 MDX를 확인하세요",
+        }]
+
+    findings = []
+
+    # preservation ratio versus the raw text (must be 30% or more)
+    raw_plain = re.sub(
+        r"<[^>]+>|\{[^}]+\}|---\n.*?\n---", "", raw_mdx, flags=re.DOTALL
+    ).strip()
+    raw_len = len(raw_plain)
+    if raw_len > 0:
+        kept_ratio = len(cleaned) / raw_len
+        if kept_ratio < 0.3:
+            findings.append({
+                "severity": "FATAL",
+                "field": "clean_text",
+                "localization": f"텍스트 보존율 {kept_ratio:.0%} < 30%",
+                "evidence": f"원본 {raw_len}자 → clean {len(cleaned)}자",
+                "instruction": "파서가 너무 많은 텍스트를 제거함",
+            })
+
+    # image count cross-check
+    images_in_raw = raw_mdx.count("![")
+    images_found = len(result.get("images", []))
+    if images_in_raw > 0 and images_found == 0:
+        findings.append({
+            "severity": "ADJUSTABLE",
+            "field": "images",
+            "localization": f"원본 이미지 {images_in_raw}개, 추출 0개",
+            "instruction": "이미지 추출 패턴 확인",
+        })
+
+    # popup count cross-check
+    details_in_raw = raw_mdx.count("<details>")
+    popups_found = len(result.get("popups", []))
+    if details_in_raw > 0 and popups_found == 0:
+        findings.append({
+            "severity": "ADJUSTABLE",
+            "field": "popups",
+            "localization": f"원본 details {details_in_raw}개, 추출 0개",
+            "instruction": "details 추출 패턴 확인",
+        })
+
+    return findings