# C.E.L_Slide_test2/src/mdx_normalizer.py

"""Phase T-1: MDX 4-Layer 파서.

Stage 0에서 호출. 원본 MDX를 정규화하여 이후 모든 Stage에 깨끗한 입력 제공.

Layer 1: python-frontmatter — YAML frontmatter 분리, title 추출
Layer 2: regex — 코드블록 보호 + MDX 전용 패턴 (details, :::, JSX, import)
Layer 3: markdown-it-py — AST 파싱 → 이미지/표/헤딩 구조 추출
Layer 4: regex — 텍스트 정리, 빈 줄 정리, clean_text

조사 결과 (T-1):
- python-frontmatter: parse() → (dict, str). frontmatter 없으면 안전하게 {}
- markdown-it-py: js-default 프리셋에 table 기본 포함. 한국어 정상
- 코드블록 보호: backtick 10→3 순서 매칭. 중첩/inline 검증됨
"""
from __future__ import annotations

import re
import logging
from typing import Any

import frontmatter
from markdown_it import MarkdownIt

logger = logging.getLogger(__name__)


# ══════════════════════════════════════
# 코드블록 보호 (Layer 2 선행)
# ══════════════════════════════════════

class _CodeBlockProtector:
    """코드블록을 placeholder로 보호하고 복원.

    backtick 개수가 많은 순서(10→3)로 매칭하여 중첩 코드블록 안전 처리.
    """

    def __init__(self):
        self._store: dict[str, str] = {}
        self._counter = 0

    def _make_key(self) -> str:
        self._counter += 1
        return f"__CODEBLOCK_{self._counter}__"

    def protect(self, text: str) -> str:
        # fenced code blocks (큰 backtick부터)
        for n in range(10, 2, -1):
            pattern = rf"^(`{{{n}}})([^\n]*)\n(.*?)\n\1\s*$"

            def _replacer(m, _n=n):
                key = self._make_key()
                self._store[key] = m.group(0)
                return key

            text = re.sub(pattern, _replacer, text, flags=re.MULTILINE | re.DOTALL)

        # inline code
        def _inline_replacer(m):
            key = self._make_key()
            self._store[key] = m.group(0)
            return key

        text = re.sub(r"`[^`\n]+`", _inline_replacer, text)
        return text

    def restore(self, text: str) -> str:
        for key, original in self._store.items():
            text = text.replace(key, original)
        return text


# ══════════════════════════════════════
# Layer 2: MDX 전용 패턴 처리
# ══════════════════════════════════════

def _convert_md_list_to_html(text: str) -> str:
    """마크다운 리스트(* item, - item)를 HTML <ul><li>로 변환.

    들여쓰기 수준(2-4칸)을 감지하여 중첩 <ul>을 생성한다.
    """
    lines = text.split("\n")
    result = []
    list_stack: list[int] = []  # 현재 열린 리스트의 들여쓰기 레벨들

    for line in lines:
        m = re.match(r"^(\s*)[*\-]\s+(.+)$", line)
        if m:
            indent = len(m.group(1))
            content = m.group(2)
            if not list_stack:
                result.append("<ul>")
                list_stack.append(indent)
            elif indent > list_stack[-1]:
                result.append("<ul>")
                list_stack.append(indent)
            else:
                while len(list_stack) > 1 and indent < list_stack[-1]:
                    result.append("</ul></li>")
                    list_stack.pop()
            result.append(f"<li>{content}")
        else:
            while list_stack:
                result.append("</li></ul>")
                list_stack.pop()
            result.append(line)

    while list_stack:
        result.append("</li></ul>")
        list_stack.pop()

    return "\n".join(result)


def _convert_md_table_to_html(text: str) -> str:
    """마크다운 테이블(| col | col |)을 HTML <table>로 변환.

    어떤 마크다운 테이블이든 동작. 하드코딩 없음.
    """
    lines = text.split("\n")
    result = []
    table_lines = []
    in_table = False

    for line in lines:
        stripped = line.strip()
        if stripped.startswith("|") and stripped.endswith("|"):
            table_lines.append(stripped)
            in_table = True
        else:
            if in_table and table_lines:
                result.append(_render_md_table(table_lines))
                table_lines = []
                in_table = False
            result.append(line)

    if table_lines:
        result.append(_render_md_table(table_lines))

    return "\n".join(result)


def _render_md_table(table_lines: list[str]) -> str:
    """마크다운 테이블 라인들을 HTML 테이블로."""
    if len(table_lines) < 2:
        return "\n".join(table_lines)

    def _parse_row(line):
        cells = [c.strip() for c in line.split("|")]
        # 앞뒤 빈 셀 제거 (| col1 | col2 | → ['', 'col1', 'col2', ''])
        return [c for c in cells if c or c == ""].__getitem__(slice(1, -1)) if cells[0] == "" else cells

    headers = _parse_row(table_lines[0])

    # 구분선(|---|---|) 스킵
    data_start = 1
    if len(table_lines) > 1 and all(c.strip().replace("-", "").replace(":", "") == "" for c in table_lines[1].split("|") if c.strip()):
        data_start = 2

    rows = [_parse_row(line) for line in table_lines[data_start:]]

    # HTML 생성 — 셀 내 <br/> → <br> 유지 (줄바꿈 역할)
    header_html = "".join(f"<th>{h}</th>" for h in headers)
    rows_html = ""
    for row in rows:
        cells = ""
        for c in row:
            c = re.sub(r"<br\s*/?>", "<br>", c)
            cells += f"<td>{c}</td>"
        rows_html += f"<tr>{cells}</tr>\n"

    return f"<table><thead><tr>{header_html}</tr></thead><tbody>{rows_html}</tbody></table>"


def _process_mdx_patterns(text: str) -> tuple[str, list[dict]]:
    """MDX 전용 패턴 처리. popups를 추출하고 텍스트에서 마커로 교체.

    Returns:
        (처리된 텍스트, popups 리스트)
    """
    popups = []

    # <details><summary>제목</summary>내용</details> → 팝업 추출
    def _extract_popup(m):
        title = m.group(1).strip()
        content = m.group(2).strip()
        # 팝업 content 정화: JSX style 제거 + 마크다운 → HTML
        content = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", content)
        content = content.replace("</div>", "")
        # 마크다운 테이블 → HTML 테이블 (br 치환보다 먼저 — 셀 내 <br/>로 행이 쪼개지는 것 방지)
        content = _convert_md_table_to_html(content)
        # 테이블 밖 <br/> → \n (테이블 안은 이미 <br>로 변환 완료)
        content = re.sub(r"<br\s*/>", "\n", content)
        content = re.sub(r"\*\*(.+?)\*\*", r"<strong>\1</strong>", content)
        # 마크다운 리스트(* item) → HTML <ul><li>
        content = _convert_md_list_to_html(content)
        popups.append({"title": title, "content": content})
        return f"[팝업: {title}]"

    text = re.sub(
        r"<details>\s*<summary[^>]*>(.+?)</summary>(.*?)</details>",
        _extract_popup,
        text,
        flags=re.DOTALL,
    )

    # import 문 제거
    text = re.sub(r"^import\s+.+$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^export\s+.+$", "", text, flags=re.MULTILINE)

    # <br/> 제거
    text = re.sub(r"<br\s*/?>", "", text)

    # JSX div style → 태그만 제거 (내용 유지)
    text = re.sub(r"<div\s+style=\{\{[^}]*\}\}\s*>", "", text)
    text = text.replace("</div>", "")

    # 커스텀 컴포넌트 (<Component />, <Component>...</Component>)
    text = re.sub(r"<[A-Z]\w+\s*/>", "", text)
    text = re.sub(r"<[A-Z]\w+[^>]*>.*?</[A-Z]\w+>", "", text, flags=re.DOTALL)

    # :::directive[제목] → ## 승격 + 핵심요약 마킹
    def _process_directive(m):
        directive = m.group(1)
        title = m.group(2)
        if directive in ("note", "tip", "caution", "danger"):
            return f"[핵심요약: {title}]"
        return f"## {title}"

    text = re.sub(r":::(\w+)\[(.+?)\]", _process_directive, text)
    text = re.sub(r"^:::\s*$", "", text, flags=re.MULTILINE)

    # ## N. 제목 → ## 제목 (번호 제거, 공백 1개 이상 필수 — T-1 조사 버그 수정)
    text = re.sub(r"^## \d+\.\s+", "## ", text, flags=re.MULTILINE)

    # ### N.N 제목 → ### 제목
    text = re.sub(r"^### \d+\.\d+\s+", "### ", text, flags=re.MULTILINE)

    # * **제목** → ## 승격 (## 전 도입부에서만)
    first_hash = text.find("\n## ")
    if first_hash == -1:
        first_hash = len(text)
    intro = text[:first_hash]
    rest = text[first_hash:]
    intro = re.sub(r"^\* \*\*(.+?)\*\*\s*$", r"## \1", intro, flags=re.MULTILINE)
    text = intro + rest

    # 이탤릭 출처 (단독 줄)
    text = re.sub(r"^\s*\*([^*\n]+)\*\s*$", r"출처: \1", text, flags=re.MULTILINE)

    # 장식용 --- 제거
    text = re.sub(r"^---\s*$", "", text, flags=re.MULTILINE)

    return text, popups


# ══════════════════════════════════════
# Layer 3: AST 파싱
# ══════════════════════════════════════

def _extract_structure(text: str) -> dict[str, Any]:
    """Parse ``text`` with markdown-it-py and collect images, tables, sections.

    The token stream of the ``js-default`` preset is walked once:

    * every ``image`` child of an inline token yields ``{"alt", "path"}``;
    * every ``table_open`` starts a scan up to the matching ``table_close``
      that gathers header cells and body rows;
    * every ``h2`` / ``h3`` heading whose text (after removing ``<br>``
      variants) is longer than one character starts a new section. Inline
      content that follows is appended to that section, prefixed with
      ``D<n>: `` when it sits inside ``n`` nested bullet lists.

    Returns:
        ``{"images": [...], "tables": [...], "sections": [...]}``
    """
    parser = MarkdownIt("js-default")
    tokens = parser.parse(text)
    total = len(tokens)

    images: list[dict] = []
    tables: list[dict] = []
    sections: list[dict] = []

    # State of the section currently being filled.
    title = ""
    level = 2
    body: list[str] = []
    depth = 0  # bullet-list nesting depth inside the current section

    def _flush() -> None:
        """Store the current section (if any) and reset its accumulators."""
        nonlocal body, level, depth
        if not title:
            return
        sections.append({
            "level": level,
            "title": title,
            "content": "\n".join(body).strip(),
        })
        body = []
        level = 2
        depth = 0

    def _read_table(start: int) -> dict:
        """Collect header cells and body rows from ``start`` to table_close."""
        table = {"headers": [], "rows": []}
        in_head = False
        in_body = False
        row: list[str] = []
        pos = start
        while pos < total and tokens[pos].type != "table_close":
            cell_tok = tokens[pos]
            cell_kind = cell_tok.type
            if cell_kind == "thead_open":
                in_head = True
            elif cell_kind == "thead_close":
                in_head = False
                # Header cells not yet committed by a tr_close.
                if row:
                    table["headers"] = row
                    row = []
            elif cell_kind == "tbody_open":
                in_body = True
            elif cell_kind == "tbody_close":
                in_body = False
            elif cell_kind == "tr_close":
                if in_body and row:
                    table["rows"].append(row)
                elif in_head and row:
                    table["headers"] = row
                row = []
            elif cell_kind == "inline" and (in_head or in_body):
                row.append(cell_tok.content)
            pos += 1
        return table

    for idx, tok in enumerate(tokens):
        kind = tok.type

        # Images live among the children of inline tokens.
        if kind == "inline" and tok.children:
            for node in tok.children:
                if node.type != "image":
                    continue
                node_attrs = node.attrs or {}
                images.append({
                    "alt": node.content or node_attrs.get("alt", ""),
                    "path": node_attrs.get("src", ""),
                })

        # Tables: scan ahead from the token after table_open.
        if kind == "table_open":
            parsed = _read_table(idx + 1)
            if parsed["headers"] or parsed["rows"]:
                tables.append(parsed)

        # Bullet nesting is tracked only while a section is open.
        if title:
            if kind == "bullet_list_open":
                depth += 1
            elif kind == "bullet_list_close":
                depth = max(0, depth - 1)

        # h2 / h3 headings open a new section when their text is meaningful.
        if kind == "heading_open" and tok.tag in ("h2", "h3"):
            follower = tokens[idx + 1] if idx + 1 < total else None
            if follower is not None and follower.type == "inline":
                candidate = re.sub(r'<br\s*/?>', '', follower.content.strip()).strip()
                # Empty or single-character titles are not usable as sections.
                if len(candidate) > 1:
                    _flush()
                    title = candidate
                    level = 2 if tok.tag == "h2" else 3

        # Section body: inline content, except the inline that directly
        # follows a heading_open (that one is a heading's own text).
        if title and kind == "inline" and tok.tag == "":
            follows_heading = idx > 0 and tokens[idx - 1].type == "heading_open"
            if not follows_heading:
                if depth > 0:
                    body.append(f"D{depth}: {tok.content}")
                else:
                    body.append(tok.content)

    _flush()

    return {"images": images, "tables": tables, "sections": sections}


# ══════════════════════════════════════
# Layer 4: 텍스트 정리
# ══════════════════════════════════════

def _clean_text(text: str) -> str:
    """최종 텍스트 정리: 남은 HTML 태그 제거, 빈 줄 정리."""
    # 이미지 참조 보존 (markdown 형식 → 마커)
    text = re.sub(r"!\[(.+?)\]\((.+?)\)", r"[이미지: \1]", text)

    # 남은 HTML 태그 제거 (self-closing)
    text = re.sub(r"<[^>]+/?>", "", text)

    # 연속 빈 줄 정리
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


# ══════════════════════════════════════
# 메인 함수
# ══════════════════════════════════════

def normalize_mdx_content(raw_mdx: str) -> dict[str, Any]:
    """Normalize raw MDX through the four parser layers.

    Layer 1 splits off the YAML frontmatter, layer 2 protects code and
    rewrites MDX-only patterns, layer 3 extracts images/tables/sections from
    the markdown AST, and layer 4 produces the cleaned plain text.

    Args:
        raw_mdx: Complete MDX document, frontmatter included.

    Returns:
        {
            "clean_text": str,
            "title": str,
            "images": [{"alt": str, "path": str}],
            "popups": [{"title": str, "content": str}],
            "tables": [{"headers": list, "rows": list}],
            "sections": [{"level": int, "title": str, "content": str}],
        }
    """
    # -- Layer 1: split off the frontmatter --
    metadata, body = frontmatter.parse(raw_mdx)
    # YAML may yield None (``title:`` with no value) or a non-string scalar;
    # the documented contract for "title" is str, so normalize it here.
    raw_title = metadata.get("title")
    title = "" if raw_title is None else str(raw_title)
    logger.info(f"[Layer 1] title='{title}', metadata keys={list(metadata.keys())}")

    # -- Layer 2: protect code, rewrite MDX patterns, restore code --
    protector = _CodeBlockProtector()
    protected = protector.protect(body)
    processed, popups = _process_mdx_patterns(protected)
    restored = protector.restore(processed)
    logger.info(f"[Layer 2] popups={len(popups)}개, 코드블록={protector._counter}개 보호/복원")

    # -- Layer 3: AST parsing -> structure --
    structure = _extract_structure(restored)
    images = structure["images"]
    tables = structure["tables"]
    sections = structure["sections"]
    logger.info(f"[Layer 3] images={len(images)}, tables={len(tables)}, sections={len(sections)}")

    # -- Layer 4: text cleanup --
    clean_text = _clean_text(restored)
    logger.info(f"[Layer 4] clean_text={len(clean_text)}자")

    return {
        "clean_text": clean_text,
        "title": title,
        "images": images,
        "popups": popups,
        "tables": tables,
        "sections": sections,
    }


# ══════════════════════════════════════
# Stage 0 검증
# ══════════════════════════════════════

def validate_stage0(result: dict, raw_mdx: str) -> list[dict]:
    """Sanity-check the normalizer output against the raw MDX.

    Checks performed:
      * ``clean_text`` must not be empty (FATAL, returned immediately).
      * ``clean_text`` must keep at least 30% of the raw text length, where
        the raw length excludes the leading frontmatter, ``<...>`` tags and
        ``{...}`` expressions (FATAL).
      * If the raw text contains ``![`` but no image was extracted
        (ADJUSTABLE).
      * If the raw text contains a ``<details>`` tag but no popup was
        extracted (ADJUSTABLE).

    Args:
        result: Dictionary with ``clean_text``, ``images`` and ``popups``
            keys, as returned by ``normalize_mdx_content``.
        raw_mdx: The original MDX document.

    Returns:
        A list of error dictionaries; an empty list means the check passed.
    """
    errors: list[dict] = []

    # Treat a missing or None clean_text the same as an empty one.
    clean_text = result.get("clean_text") or ""
    if not clean_text.strip():
        errors.append({
            "severity": "FATAL",
            "field": "clean_text",
            "localization": "clean_text가 비어있음",
            "instruction": "원본 MDX를 확인하세요",
        })
        return errors

    # Text preservation ratio versus the original (must be >= 30%).
    # Only a frontmatter block at the very start is discarded; an unanchored
    # "---...---" pattern would also swallow all body text between two
    # horizontal rules and make the ratio look better than it is.
    body_only = re.sub(
        r"\A\s*---[ \t]*\r?\n.*?\r?\n---[ \t]*$",
        "",
        raw_mdx,
        count=1,
        flags=re.DOTALL | re.MULTILINE,
    )
    raw_text_len = len(re.sub(r"<[^>]+>|\{[^}]+\}", "", body_only).strip())
    if raw_text_len > 0:
        preservation = len(clean_text) / raw_text_len
        if preservation < 0.3:
            errors.append({
                "severity": "FATAL",
                "field": "clean_text",
                "localization": f"텍스트 보존율 {preservation:.0%} < 30%",
                "evidence": f"원본 {raw_text_len}자 → clean {len(clean_text)}자",
                "instruction": "파서가 너무 많은 텍스트를 제거함",
            })

    # Image count cross-check.
    raw_img_count = len(re.findall(r"!\[", raw_mdx))
    result_img_count = len(result.get("images") or [])
    if raw_img_count > 0 and result_img_count == 0:
        errors.append({
            "severity": "ADJUSTABLE",
            "field": "images",
            "localization": f"원본 이미지 {raw_img_count}개, 추출 0개",
            "instruction": "이미지 추출 패턴 확인",
        })

    # Popup count cross-check; count <details> with or without attributes.
    raw_details_count = len(re.findall(r"<details(?=[\s>])", raw_mdx))
    result_popup_count = len(result.get("popups") or [])
    if raw_details_count > 0 and result_popup_count == 0:
        errors.append({
            "severity": "ADJUSTABLE",
            "field": "popups",
            "localization": f"원본 details {raw_details_count}개, 추출 0개",
            "instruction": "details 추출 패턴 확인",
        })

    return errors