"""Phase T-1: MDX 4-Layer 파서. Stage 0에서 호출. 원본 MDX를 정규화하여 이후 모든 Stage에 깨끗한 입력 제공. Layer 1: python-frontmatter — YAML frontmatter 분리, title 추출 Layer 2: regex — 코드블록 보호 + MDX 전용 패턴 (details, :::, JSX, import) Layer 3: markdown-it-py — AST 파싱 → 이미지/표/헤딩 구조 추출 Layer 4: regex — 텍스트 정리, 빈 줄 정리, clean_text 조사 결과 (T-1): - python-frontmatter: parse() → (dict, str). frontmatter 없으면 안전하게 {} - markdown-it-py: js-default 프리셋에 table 기본 포함. 한국어 정상 - 코드블록 보호: backtick 10→3 순서 매칭. 중첩/inline 검증됨 """ from __future__ import annotations import re import logging from typing import Any import frontmatter from markdown_it import MarkdownIt logger = logging.getLogger(__name__) # ══════════════════════════════════════ # 코드블록 보호 (Layer 2 선행) # ══════════════════════════════════════ class _CodeBlockProtector: """코드블록을 placeholder로 보호하고 복원. backtick 개수가 많은 순서(10→3)로 매칭하여 중첩 코드블록 안전 처리. """ def __init__(self): self._store: dict[str, str] = {} self._counter = 0 def _make_key(self) -> str: self._counter += 1 return f"__CODEBLOCK_{self._counter}__" def protect(self, text: str) -> str: # fenced code blocks (큰 backtick부터) for n in range(10, 2, -1): pattern = rf"^(`{{{n}}})([^\n]*)\n(.*?)\n\1\s*$" def _replacer(m, _n=n): key = self._make_key() self._store[key] = m.group(0) return key text = re.sub(pattern, _replacer, text, flags=re.MULTILINE | re.DOTALL) # inline code def _inline_replacer(m): key = self._make_key() self._store[key] = m.group(0) return key text = re.sub(r"`[^`\n]+`", _inline_replacer, text) return text def restore(self, text: str) -> str: for key, original in self._store.items(): text = text.replace(key, original) return text # ══════════════════════════════════════ # Layer 2: MDX 전용 패턴 처리 # ══════════════════════════════════════ def _convert_md_list_to_html(text: str) -> str: """마크다운 리스트(* item, - item)를 HTML

로 변환. 들여쓰기 수준(2-4칸)을 감지하여 중첩
") list_stack.pop() return "\n".join(result) def _convert_md_table_to_html(text: str) -> str: """마크다운 테이블(| col | col |)을 HTML 로 변환. 어떤 마크다운 테이블이든 동작. 하드코딩 없음. """ lines = text.split("\n") result = [] table_lines = [] in_table = False for line in lines: stripped = line.strip() if stripped.startswith("|") and stripped.endswith("|"): table_lines.append(stripped) in_table = True else: if in_table and table_lines: result.append(_render_md_table(table_lines)) table_lines = [] in_table = False result.append(line) if table_lines: result.append(_render_md_table(table_lines)) return "\n".join(result) def _render_md_table(table_lines: list[str]) -> str: """마크다운 테이블 라인들을 HTML 테이블로.""" if len(table_lines) < 2: return "\n".join(table_lines) def _parse_row(line): cells = [c.strip() for c in line.split("|")] # 앞뒤 빈 셀 제거 (| col1 | col2 | → ['', 'col1', 'col2', '']) return [c for c in cells if c or c == ""].__getitem__(slice(1, -1)) if cells[0] == "" else cells headers = _parse_row(table_lines[0]) # 구분선(|---|---|) 스킵 data_start = 1 if len(table_lines) > 1 and all(c.strip().replace("-", "").replace(":", "") == "" for c in table_lines[1].split("|") if c.strip()): data_start = 2 rows = [_parse_row(line) for line in table_lines[data_start:]] # HTML 생성 — 셀 내
→
유지 (줄바꿈 역할) header_html = "".join(f"" for h in headers) rows_html = "" for row in rows: cells = "" for c in row: c = re.sub(r"", "
", c) cells += f"" rows_html += f"{cells}\n" return f"
{h} {c}
{header_html}{rows_html}
" def _process_mdx_patterns(text: str) -> tuple[str, list[dict]]: """MDX 전용 패턴 처리. popups를 추출하고 텍스트에서 마커로 교체. Returns: (처리된 텍스트, popups 리스트) """ popups = [] #
제목
내용
→ 팝업 추출 def _extract_popup(m): title = m.group(1).strip() content = m.group(2).strip() # 팝업 content 정화: JSX style 제거 + 마크다운 → HTML content = re.sub(r"
", "", content) content = content.replace("", "") # 마크다운 테이블 → HTML 테이블 (br 치환보다 먼저 — 셀 내
로 행이 쪼개지는 것 방지) content = _convert_md_table_to_html(content) # 테이블 밖
→ \n (테이블 안은 이미
로 변환 완료) content = re.sub(r"
", "\n", content) content = re.sub(r"\*\*(.+?)\*\*", r"\1", content) # 마크다운 리스트(* item) → HTML
- content = _convert_md_list_to_html(content) popups.append({"title": title, "content": content}) return f"[팝업: {title}]" text = re.sub( r"
  \s*]*>(.+?)(.*?)
  ", _extract_popup, text, flags=re.DOTALL, ) # import 문 제거 text = re.sub(r"^import\s+.+$", "", text, flags=re.MULTILINE) text = re.sub(r"^export\s+.+$", "", text, flags=re.MULTILINE) #
  제거 text = re.sub(r"", "", text) # JSX div style → 태그만 제거 (내용 유지) text = re.sub(r"", "", text) text = text.replace("", "") # 커스텀 컴포넌트 (, ...) text = re.sub(r"<[A-Z]\w+\s*/>", "", text) text = re.sub(r"<[A-Z]\w+[^>]*>.*?", "", text, flags=re.DOTALL) # :::directive[제목] → ## 승격 + 핵심요약 마킹 def _process_directive(m): directive = m.group(1) title = m.group(2) if directive in ("note", "tip", "caution", "danger"): return f"[핵심요약: {title}]" return f"## {title}" text = re.sub(r":::(\w+)\[(.+?)\]", _process_directive, text) text = re.sub(r"^:::\s*$", "", text, flags=re.MULTILINE) # ## N. 제목 → ## 제목 (번호 제거, 공백 1개 이상 필수 — T-1 조사 버그 수정) text = re.sub(r"^## \d+\.\s+", "## ", text, flags=re.MULTILINE) # ### N.N 제목 → ### 제목 text = re.sub(r"^### \d+\.\d+\s+", "### ", text, flags=re.MULTILINE) # * **제목** → ## 승격 (## 전 도입부에서만) first_hash = text.find("\n## ") if first_hash == -1: first_hash = len(text) intro = text[:first_hash] rest = text[first_hash:] intro = re.sub(r"^\* \*\*(.+?)\*\*\s*$", r"## \1", intro, flags=re.MULTILINE) text = intro + rest # 이탤릭 출처 (단독 줄) text = re.sub(r"^\s*\*([^*\n]+)\*\s*$", r"출처: \1", text, flags=re.MULTILINE) # 장식용 --- 제거 text = re.sub(r"^---\s*$", "", text, flags=re.MULTILINE) return text, popups # ══════════════════════════════════════ # Layer 3: AST 파싱 # ══════════════════════════════════════ def _extract_structure(text: str) -> dict[str, Any]: """markdown-it-py AST 파싱으로 구조 추출. Returns: {"images": [...], "tables": [...], "sections": [...]} """ md = MarkdownIt("js-default") tokens = md.parse(text) images = [] tables = [] sections = [] current_section_title = "" current_section_lines = [] current_section_level = 2 bullet_depth = 0 # 불릿 중첩 깊이 추적 (bullet_list_open/close) def _flush_section(): nonlocal current_section_title, current_section_lines, current_section_level, bullet_depth if current_section_title: sections.append({ "level": current_section_level, "title": current_section_title, "content": "\n".join(current_section_lines).strip(), }) current_section_lines = [] current_section_level = 2 bullet_depth = 0 for i, token in enumerate(tokens): # 이미지 추출 (inline children) if token.type == "inline" and token.children: for child in token.children: if child.type == "image": attrs = child.attrs or {} images.append({ "alt": child.content or attrs.get("alt", ""), "path": attrs.get("src", ""), }) # 표 추출 if token.type == "table_open": table = {"headers": [], "rows": []} # 이후 토큰에서 thead/tbody 파싱 j = i + 1 in_thead = False in_tbody = False current_row = [] while j < len(tokens) and tokens[j].type != "table_close": t = tokens[j] if t.type == "thead_open": in_thead = True elif t.type == "thead_close": in_thead = False if current_row: table["headers"] = current_row current_row = [] elif t.type == "tbody_open": in_tbody = True elif t.type == "tbody_close": in_tbody = False elif t.type == "tr_close": if in_tbody and current_row: table["rows"].append(current_row) elif in_thead and current_row: table["headers"] = current_row current_row = [] elif t.type == "inline" and (in_thead or in_tbody): current_row.append(t.content) j += 1 if table["headers"] or table["rows"]: tables.append(table) # 불릿 depth 추적 (섹션 내용 수집 시 계층 보존) if current_section_title: if token.type == "bullet_list_open": bullet_depth += 1 elif token.type == "bullet_list_close": bullet_depth = max(0, bullet_depth - 1) # 섹션 추출 (## 및 ### 기준 — 대목차/소목차 모두) if token.type == "heading_open" and token.tag in ("h2", "h3"): # 다음 토큰이 inline (제목 텍스트) — 무의미한 제목(
  등)은 건너뜀 if i + 1 < len(tokens) and tokens[i + 1].type == "inline": heading_text = tokens[i + 1].content.strip() #
  , 빈 문자열, 숫자만 등은 section 제목으로 부적합 clean_heading = re.sub(r'', '', heading_text).strip() if clean_heading and len(clean_heading) > 1: _flush_section() current_section_title = clean_heading current_section_level = 2 if token.tag == "h2" else 3 elif current_section_title and token.type in ("paragraph_open", "bullet_list_open", "ordered_list_open", "fence"): # 섹션 내용 수집 — inline 토큰의 content만 pass if current_section_title and token.type == "inline" and token.tag == "": # heading의 inline은 제목이므로 건너뜀 (이미 current_section_title에 저장) parent_type = tokens[i - 1].type if i > 0 else "" if parent_type != "heading_open": # depth prefix 추가: D1=1단 불릿, D2=2단 불릿, D3=3단 불릿 depth = max(1, bullet_depth) if bullet_depth > 0 else 0 if depth > 0: current_section_lines.append(f"D{depth}: {token.content}") else: current_section_lines.append(token.content) _flush_section() return {"images": images, "tables": tables, "sections": sections} # ══════════════════════════════════════ # Layer 4: 텍스트 정리 # ══════════════════════════════════════ def _clean_text(text: str) -> str: """최종 텍스트 정리: 남은 HTML 태그 제거, 빈 줄 정리.""" # 이미지 참조 보존 (markdown 형식 → 마커) text = re.sub(r"!\[(.+?)\]$(.+?)$", r"[이미지: \1]", text) # 남은 HTML 태그 제거 (self-closing) text = re.sub(r"<[^>]+/?>", "", text) # 연속 빈 줄 정리 text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() # ══════════════════════════════════════ # 메인 함수 # ══════════════════════════════════════ def normalize_mdx_content(raw_mdx: str) -> dict[str, Any]: """MDX 원본을 4-Layer 파서로 정규화. Stage 0에서 호출. 결과는 PipelineContext.normalized에 저장. Returns: { "clean_text": str, "title": str, "images": [{"alt": str, "path": str}], "popups": [{"title": str, "content": str}], "tables": [{"headers": list, "rows": list}], "sections": [{"level": int, "title": str, "content": str}], } """ # ── Layer 1: frontmatter 분리 ── metadata, body = frontmatter.parse(raw_mdx) title = metadata.get("title", "") logger.info(f"[Layer 1] title='{title}', metadata keys={list(metadata.keys())}") # ── Layer 2: 코드블록 보호 → MDX 패턴 처리 ── protector = _CodeBlockProtector() protected = protector.protect(body) processed, popups = _process_mdx_patterns(protected) restored = protector.restore(processed) logger.info(f"[Layer 2] popups={len(popups)}개, 코드블록={protector._counter}개 보호/복원") # ── Layer 3: AST 파싱 → 구조 추출 ── structure = _extract_structure(restored) images = structure["images"] tables = structure["tables"] sections = structure["sections"] logger.info(f"[Layer 3] images={len(images)}, tables={len(tables)}, sections={len(sections)}") # ── Layer 4: 텍스트 정리 ── clean_text = _clean_text(restored) logger.info(f"[Layer 4] clean_text={len(clean_text)}자") return { "clean_text": clean_text, "title": title, "images": images, "popups": popups, "tables": tables, "sections": sections, } # ══════════════════════════════════════ # Stage 0 검증 # ══════════════════════════════════════ def validate_stage0(result: dict, raw_mdx: str) -> list[dict]: """Stage 0 출력 검증. Returns: 에러 리스트 (빈 리스트 = 통과) """ errors = [] clean_text = result.get("clean_text", "") if not clean_text.strip(): errors.append({ "severity": "FATAL", "field": "clean_text", "localization": "clean_text가 비어있음", "instruction": "원본 MDX를 확인하세요", }) return errors # 원본 대비 텍스트 보존율 (30% 이상) raw_text_len = len(re.sub(r"<[^>]+>|\{[^}]+\}|---\n.*?\n---", "", raw_mdx, flags=re.DOTALL).strip()) if raw_text_len > 0: preservation = len(clean_text) / raw_text_len if preservation < 0.3: errors.append({ "severity": "FATAL", "field": "clean_text", "localization": f"텍스트 보존율 {preservation:.0%} < 30%", "evidence": f"원본 {raw_text_len}자 → clean {len(clean_text)}자", "instruction": "파서가 너무 많은 텍스트를 제거함", }) # 이미지 수 대조 raw_img_count = len(re.findall(r"!\[", raw_mdx)) result_img_count = len(result.get("images", [])) if raw_img_count > 0 and result_img_count == 0: errors.append({ "severity": "ADJUSTABLE", "field": "images", "localization": f"원본 이미지 {raw_img_count}개, 추출 0개", "instruction": "이미지 추출 패턴 확인", }) # 팝업 수 대조 raw_details_count = raw_mdx.count("
  ") result_popup_count = len(result.get("popups", [])) if raw_details_count > 0 and result_popup_count == 0: errors.append({ "severity": "ADJUSTABLE", "field": "popups", "localization": f"원본 details {raw_details_count}개, 추출 0개", "instruction": "details 추출 패턴 확인", }) return errors