# C.E.L_Slide_test2/scripts/match_mdx_to_frames_tfidf.py

"""MDX → Figma Frame 매칭 (TF-IDF) — 대목차 / 중목차 / 소목차 3단계 모두 출력.

프레임은 data/figma_previews/index.json 의 번호(01~32)로 표기한다.
"""
from __future__ import annotations

import json
import re
import sys
from datetime import datetime
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.block_matcher_tfidf import TfidfBlockMatcher
from src.mdx_normalizer import normalize_mdx_content
from src.section_parser import extract_major_sections


# Number of top-ranked frames shown per query in the console and Markdown output.
TOP_K = 3
THRESHOLD = 0.15  # pipeline_v2 direct-fit cutoff (printed at startup; not used to label results)

# Relative paths: they resolve against the current working directory.
PREVIEW_DIR = Path("data/figma_previews")
INDEX_PATH = PREVIEW_DIR / "index.json"

# Load index.json: {"01": {"frame_id": "1171281172", ...}, ...}
# NOTE: this read happens at import time, so importing the module fails if the
# index file cannot be read from the current working directory.
_INDEX: dict[str, dict] = json.loads(INDEX_PATH.read_text(encoding="utf-8"))
# Lookup tables in both directions between preview number and frame id.
FRAME_TO_NUM: dict[str, str] = {v["frame_id"]: k for k, v in _INDEX.items()}
NUM_TO_FRAME: dict[str, str] = {k: v["frame_id"] for k, v in _INDEX.items()}

# (mdx_id, path) pairs evaluated by main(); paths that do not exist are skipped there.
MDX_FILES = [
    ("01", Path("samples/mdx/01. 건설산업 DX의 올바른 이해(0127).mdx")),
    ("02", Path("samples/mdx/02. DX의 시행 목표 및 기대효과.mdx")),
    ("03", Path("samples/mdx/03. DX 시행을 위한 필수 요건 및 혁신 방안.mdx")),
]


def num_of(frame_id: str) -> str:
    """Return the preview number for *frame_id*.

    Falls back to the marker ``?(<frame_id>)`` when the id is not present in
    ``FRAME_TO_NUM``.
    """
    try:
        return FRAME_TO_NUM[frame_id]
    except KeyError:
        return f"?({frame_id})"


def extract_d1_items(content: str) -> list[str]:
    """Collect the text following every line-initial ``D1:`` marker in *content*.

    For each match, runs of ``*`` are removed and surrounding whitespace is
    stripped. Because ``\\s*`` in the pattern also matches newlines, a ``D1:``
    marker with nothing after it on its own line captures the text of the
    next non-blank line instead.
    """
    items: list[str] = []
    for found in re.finditer(r"^D1:\s*(.*)", content, re.MULTILINE):
        without_stars = re.sub(r"\*+", "", found.group(1))
        items.append(without_stars.strip())
    return items


def frame_title(matcher: TfidfBlockMatcher, fid: str) -> str:
    """Return a single-line, 60-character-max title for the frame with id *fid*.

    Looks up the first entry in ``matcher.frames`` whose ``"frame_id"`` equals
    *fid*, replaces newlines in its ``"title_text"`` with spaces and truncates
    the result to 60 characters. Returns ``""`` when no frame matches or the
    title is missing/empty.
    """
    hit = next((frame for frame in matcher.frames if frame["frame_id"] == fid), None)
    if hit is None:
        return ""
    raw_title = hit.get("title_text") or ""
    return raw_title.replace("\n", " ")[:60]


def print_ranking(label: str, top: list[dict], matcher: TfidfBlockMatcher, indent: str = "  "):
    """Print up to ``TOP_K`` ranked matches from *top* to stdout.

    *top* is read in order; printing stops at the first entry whose score is
    ``<= 0``. If *top* is empty or its first entry has a score ``<= 0``, a
    single "no match" line is printed instead. Every line is prefixed with
    *indent*. *label* is accepted for call-site readability but is not used.
    """
    if top and not (top[0]["score"] <= 0):
        for position, entry in enumerate(top[:TOP_K], start=1):
            score = entry["score"]
            if score <= 0:
                break
            frame_id = entry["frame_id"]
            number = num_of(frame_id)
            title = frame_title(matcher, frame_id)
            percent = score * 100
            print(
                f"{indent}  {position}. #{number}  score={percent:5.1f}%  "
                f"| frame {frame_id} | {title}"
            )
    else:
        print(f"{indent}(매칭 없음, score=0)")


def md_ranking_table(top: list[dict], matcher: TfidfBlockMatcher) -> list[str]:
    """Render the leading matches in *top* as the lines of a Markdown table.

    Args:
        top: Match results in ranking order; each entry provides ``"score"``
            and ``"frame_id"``.
        matcher: Matcher whose ``frames`` are used to look up frame titles.

    Returns:
        The table header and separator followed by at most ``TOP_K`` rows.
        Rows stop at the first entry whose score is ``<= 0``; if no row was
        produced, a single placeholder "no match" row is appended.
    """
    # Preview images are referenced relative to the run directory that holds the report.
    rel = Path("..") / ".." / ".." / PREVIEW_DIR
    lines = [
        "| rank | # | preview | score | frame_id | title_text |",
        "|---|---|---|---|---|---|",
    ]
    header_len = len(lines)
    for rank, r in enumerate(top[:TOP_K], start=1):
        if r["score"] <= 0:
            break
        num = num_of(r["frame_id"])
        preview = f"![]({(rel / (num + '.png')).as_posix()})"
        # A literal "|" inside a cell is parsed as a column separator and would
        # break the row, so escape it before interpolating the title.
        title = frame_title(matcher, r["frame_id"]).replace("|", "\\|")
        lines.append(
            f"| {rank} | **#{num}** | {preview} | **{r['score']*100:.1f}%** | "
            f"`{r['frame_id']}` | {title} |"
        )
    if len(lines) == header_len:
        lines.append("| — | — | — | 0% | — | (매칭 없음) |")
    return lines


def evaluate_mdx(
    matcher: TfidfBlockMatcher,
    mdx_id: str,
    mdx_path: Path,
    md_lines: list[str],
) -> None:
    """Match one MDX document against the indexed frames at three granularities.

    Rankings are printed to stdout and the equivalent Markdown is appended to
    *md_lines* (mutated in place) for:

    * L1 - the whole document: document title plus every zone title and sub-title.
    * L2 - each zone: zone title, its sub-titles and the D1 items found in its content.
    * L3 - each flat section with non-blank content: section title and its D1 items.

    Args:
        matcher: Matcher used for every query; all frames are requested and the
            ranking helpers trim the output to the top entries.
        mdx_id: Short identifier shown in headings.
        mdx_path: Path of the MDX file, read as UTF-8.
        md_lines: Accumulator for the Markdown report lines.
    """
    normalized = normalize_mdx_content(mdx_path.read_text(encoding="utf-8"))
    flat_sections = normalized.get("sections", [])
    zones = extract_major_sections(flat_sections)
    doc_title = normalized.get("title") or mdx_path.stem
    banner = "=" * 100

    def append_table(ranking: list[dict]) -> None:
        # Every ranking table in the report is framed by one blank line on each side.
        md_lines.append("")
        md_lines.extend(md_ranking_table(ranking, matcher))
        md_lines.append("")

    print("\n" + banner)
    print(f"MDX {mdx_id}: {doc_title}   ({mdx_path.name})")
    print(f"flat sections: {len(flat_sections)}  |  zones(중목차): {len(zones)}")
    print(banner)

    md_lines.append(f"\n## MDX {mdx_id} — {doc_title}\n")
    md_lines.append(
        f"파일: `{mdx_path.as_posix()}`  ·  "
        f"평면 section {len(flat_sections)}개  ·  zone(중목차) {len(zones)}개\n"
    )

    # ----- L1: top-level heading (the whole MDX document) -----
    zone_titles = [zone["title"] for zone in zones]
    l1_query_subs = list(zone_titles)
    for zone in zones:
        l1_query_subs.extend(zone.get("sub_titles", []))
    frame_total = len(matcher.frames)
    l1_ranking = matcher.match(doc_title, l1_query_subs, d1_items=None, top_k=frame_total)

    print(f"\n┌─ L1 대목차 [전체 MDX]  '{doc_title}'")
    print(f"│  zones: {zone_titles}")
    print_ranking("L1", l1_ranking, matcher, indent="│  ")

    md_lines.append("### 🟦 L1 — 대목차 (전체 MDX)\n")
    md_lines.append(f"- 쿼리: `{doc_title}` + 모든 zone/sub title")
    md_lines.append(f"- zone 목록: {zone_titles}")
    append_table(l1_ranking)

    # ----- L2: mid-level heading (one query per zone) -----
    print("\n┌─ L2 중목차 [zone 단위]")
    md_lines.append("### 🟩 L2 — 중목차 (zone 단위)\n")

    for zone_no, zone in enumerate(zones, start=1):
        zone_title = zone["title"]
        zone_subs = zone.get("sub_titles", [])
        zone_d1 = extract_d1_items(zone.get("content", ""))
        zone_ranking = matcher.match(zone_title, zone_subs, zone_d1, top_k=frame_total)

        print(f"│\n│  [zone {zone_no}] {zone_title}")
        print(f"│    sub_titles: {zone_subs}")
        print(f"│    d1_items: {len(zone_d1)}개")
        print_ranking("L2", zone_ranking, matcher, indent="│    ")

        md_lines.append(f"\n#### zone {zone_no}: **{zone_title}**")
        md_lines.append(f"- sub_titles: {zone_subs}")
        md_lines.append(f"- d1_items: {len(zone_d1)}개")
        append_table(zone_ranking)

    # ----- L3: low-level heading (each flat section on its own) -----
    # Only normalized sections that carry content count as real sub-sections.
    sub_sections = [sec for sec in flat_sections if sec.get("content", "").strip()]
    print(f"\n┌─ L3 소목차 [개별 sub-section, {len(sub_sections)}개]")
    md_lines.append("### 🟨 L3 — 소목차 (개별 sub-section)\n")

    for sub_no, section in enumerate(sub_sections, start=1):
        section_title = section.get("title", "")
        section_d1 = extract_d1_items(section.get("content", ""))
        section_ranking = matcher.match(
            section_title, sub_titles=None, d1_items=section_d1, top_k=frame_total
        )

        # Find the first zone that lists this section among its sub-titles.
        parent_zone = next(
            (
                zone["title"]
                for zone in zones
                if section_title in zone.get("sub_titles", [])
            ),
            "—",
        )

        print(f"│\n│  [sub {sub_no}] {section_title}   (zone: {parent_zone})")
        print(f"│    d1_items: {len(section_d1)}개")
        print_ranking("L3", section_ranking, matcher, indent="│    ")

        md_lines.append(f"\n#### sub {sub_no}: **{section_title}**  _(zone: {parent_zone})_")
        md_lines.append(f"- d1_items: {len(section_d1)}개")
        append_table(section_ranking)


def build_frame_legend(matcher: TfidfBlockMatcher, md_lines: list[str]) -> None:
    """Append a Markdown table listing every indexed frame to *md_lines*.

    One row is emitted per key of ``_INDEX`` in sorted key order, showing the
    preview image, the frame id and the frame title. The heading text names
    the range "01 ~ 32" literally; it is not derived from the index contents.

    Args:
        matcher: Matcher whose ``frames`` are used to look up frame titles.
        md_lines: Accumulator for the Markdown report lines (mutated in place).
    """
    md_lines.append("\n## 프레임 번호 전체 색인 (01 ~ 32)\n")
    md_lines.append("| # | preview | frame_id | title_text |")
    md_lines.append("|---|---|---|---|")
    # Preview images are referenced relative to the run directory that holds the report.
    rel = Path("..") / ".." / ".." / PREVIEW_DIR
    for num in sorted(_INDEX):
        entry = _INDEX[num]
        preview = f"![]({(rel / (num + '.png')).as_posix()})"
        # A literal "|" inside a cell is parsed as a column separator and would
        # break the row, so escape it before interpolating the title.
        title = frame_title(matcher, entry["frame_id"]).replace("|", "\\|")
        md_lines.append(
            f"| **#{num}** | {preview} | `{entry['frame_id']}` | "
            f"{title} |"
        )
    md_lines.append("")


def main() -> int:
    """Build the matching report for every configured MDX file and save it.

    Creates the matcher, evaluates each existing path in ``MDX_FILES`` (missing
    ones are reported and skipped), appends the frame legend, and writes the
    Markdown to ``data/runs/<timestamp>_mdx_match/match_report.md``.

    Returns:
        Always ``0``.
    """
    print("[init] TF-IDF 인덱스 로딩...")
    matcher = TfidfBlockMatcher()
    frame_count = len(matcher.frames)
    print(f"[init] 프레임 {frame_count}개 인덱싱 완료")
    print(f"[init] direct-fit 임계값 = {THRESHOLD*100:.0f}%")

    # Report preamble: title, legend of the three levels, and scoring notes.
    report: list[str] = [
        "# MDX ↔ Figma Frame 매칭 (TF-IDF 순수 점수) — L1/L2/L3 3단계",
        "",
        "프레임은 `data/figma_previews/{번호}.png` 의 번호로 표기. 하단에 번호-프레임 색인.",
        "",
        "| 단계 | 입도 | 쿼리 구성 |",
        "|---|---|---|",
        "| 🟦 L1 대목차 | MDX 전체 1개 | doc title + 모든 zone/sub title |",
        "| 🟩 L2 중목차 | zone 단위 | zone title + sub_titles + d1_items |",
        "| 🟨 L3 소목차 | 개별 sub-section 각각 | sub title + 자기 content의 d1_items |",
        "",
        f"- 인덱싱된 프레임: {frame_count}개",
        "- **각 표는 순수 TF-IDF cosine similarity × 100 을 %로 표시한 점수 랭킹.**",
        "- 판정/분기(recipe/direct-fit) 라벨은 출력하지 않음. 점수만 그대로 본다.",
    ]

    for mdx_id, mdx_path in MDX_FILES:
        if not mdx_path.exists():
            print(f"[skip] 없음: {mdx_path}")
            continue
        evaluate_mdx(matcher, mdx_id, mdx_path, report)

    build_frame_legend(matcher, report)

    # Each run gets its own timestamped directory.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = Path("data/runs") / f"{stamp}_mdx_match"
    run_dir.mkdir(parents=True, exist_ok=True)
    report_path = run_dir / "match_report.md"
    report_path.write_text("\n".join(report), encoding="utf-8")
    print(f"\n[saved] {report_path}")
    return 0


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())