docs + V4 catalog + samples + Phase Q legacy 보존

전체 26 files (20 추가 + 6 수정), 10507 insertions. Phase Z 문서 : - docs/architecture/PHASE-Z-CHANGE-LOG.md (신설) — axis-by-axis 의사결정 history (newest-on-top). Step 7-A 부터 6 entry 박힘 + 2026-05-08 / 2026-05-08 #2 (compat 매트릭스 폐기 / 6-B 폐기 / F14 표현 정정 / label gate policy 분리). - docs/architecture/PHASE-Z-PIPELINE-OVERVIEW.md (수정) — Step 5/6/9 Gap note append (구조 무변, append-only). 6-B 폐기 사실 + Refinement F. - docs/architecture/PHASE-Z-PIPELINE-STATUS-BOARD.md (수정) — snapshot date 2026-05-08 갱신. §3 핵심 missing item 5 (Step 5/6/9 boundary axis breakdown + 폐기 기록). §6 한 줄 갱신 — 다음 axis 후보 A~F. Project root docs : - PLAN.md / PROGRESS.md / README.md (수정) — 토큰 체계 / 폴더 구조 / 설계 문서 / 역할 분리 반영. - IMPROVEMENT-REDESIGN.md (신설) — Phase Z 설계 핵심 문서. - PROCESS_OVERVIEW.html (신설) — 파이프라인 개요 시각. - docs/tasks/* (신설) — Phase Z task 문서. V4 catalog (Phase Z runtime 필수 의존성) : - tests/matching/v4_full32_result.yaml (신설, 4888 줄) — V4 매칭 결과 32 frame × 10 MDX section. lookup_v4_match() / lookup_v4_candidates() 가 본 파일 read. Phase Z runtime 이 *없으면 즉시 abort* — clone 후 즉시 동작 가능 보장. Samples : - samples/mdx_batch/04.mdx (신설) — MDX04 기본 sample. - samples/mdx/04. DX 지연 요인.mdx (신설) — MDX04 원본. Phase Q legacy 보존 (별 axis "Phase Q audit & salvage" 영역) : - src/block_matcher_tfidf.py / catalog_blocks.py / frame_extractor.py / pipeline_v2.py — Phase Q (옛 파이프라인) src 신규 untracked 파일들. Phase Z runtime 와 의존성 0. Phase Q audit axis 에서 검토 예정. - scripts/eval_block_matcher.py / fetch_all_frame_screenshots.py / match_17_units_my_matcher.py / match_mdx_strict.py / match_mdx_to_frames_tfidf.py / ocr_augment_texts.py / run_pipeline_v2.py / previews/ — Phase Q 작업 시 사용한 옛 script. 같이 보존. - run_mdx03_pipeline.py (수정) — Phase Q 진입점 (no flag) + Phase Z 진입점 (--phase-z2 flag) 동시 wrapper. Phase Z 만 사용 시 `python -m src.phase_z2_pipeline samples/mdx_batch/03.mdx <run_id>` 직접 호출. 비-scope : - tests/matching/ (v4_full32_result.yaml 외 ~63MB) — V4 진화 history / reports / DECK / ATTACH. Phase Q audit axis 에서 검토. 
- tests/pipeline/ (~15MB) — pipeline data. Phase Q audit 영역. - templates/catalog/blocks.yaml — 옛 block catalog. Phase Q audit. - templates/phase_z2/frames/ — 옛 frame partial 위치. Phase Q audit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 09:47:58 +09:00
parent ec83405770
commit 85c680f02a
26 changed files with 10507 additions and 46 deletions
--- a/src/block_matcher_tfidf.py
+++ b/src/block_matcher_tfidf.py
@@ -0,0 +1,185 @@
+"""TF-IDF 기반 블록 매칭 엔진.
+
+texts.md의 원본 텍스트를 직접 사용 — keywords 수동 생성 불필요.
+frame_extractor가 텍스트를 추출하고, 여기서 TF-IDF 유사도를 계산.
+
+사용법:
+    matcher = TfidfBlockMatcher()
+    result = matcher.match("DX 시행을 위한 필수 요건", ["기술(디지털)", "사람(역량)"])
+"""
+from __future__ import annotations
+
+import logging
+import math
+import re
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+from src.frame_extractor import extract_all_frames
+
+logger = logging.getLogger(__name__)
+
+
+class TfidfBlockMatcher:
+    """TF-IDF block matcher that indexes the raw frame text directly.
+
+    Every frame returned by ``extract_all_frames`` becomes one document: its
+    ``all_text`` plus, when a catalog entry references the frame through
+    ``source_frame``, that entry's ``when`` and ``description`` fields.
+    Queries are ranked by cosine similarity between whitespace-tokenised
+    TF-IDF vectors.
+    """
+
+    # Bracket and separator characters that are replaced by a single space.
+    _SYMBOLS_RE = re.compile(r"[()\[\]·•→←↔×+/]")
+    _WHITESPACE_RE = re.compile(r"\s+")
+
+    def __init__(
+        self,
+        blocks_dir: str | Path = "figma_to_html_agent/blocks",
+        catalog_path: str | Path = "templates/catalog/blocks.yaml",
+    ):
+        """Load the frames and the optional catalog, then build the index.
+
+        Args:
+            blocks_dir: Directory handed to ``extract_all_frames``.
+            catalog_path: YAML block catalog. A missing or unreadable file
+                is treated as an empty catalog.
+        """
+        self.frames: list[dict] = extract_all_frames(blocks_dir)
+        self.catalog = self._load_catalog(catalog_path)
+
+        # One document (and one id) per frame, in frame order.
+        self.doc_texts: list[str] = []
+        self.doc_ids: list[str] = []
+        for frame in self.frames:
+            doc_id, text = self._frame_document(frame)
+            self.doc_ids.append(doc_id)
+            self.doc_texts.append(text)
+
+        # Pre-compute the IDF table once for the whole index.
+        self.idf = self._compute_idf(self.doc_texts)
+        logger.info(
+            "[tfidf] %s개 프레임 인덱싱 완료 (texts.md 직접 사용)",
+            len(self.frames),
+        )
+
+    @staticmethod
+    def _as_text(value: Any) -> str:
+        """Return *value* as text, mapping ``None`` (YAML null) to ``""``."""
+        return "" if value is None else str(value)
+
+    def _frame_document(self, frame: dict) -> tuple[str, str]:
+        """Build the ``(doc_id, text)`` pair indexed for one frame.
+
+        The text is the frame's ``all_text``; when a catalog entry points at
+        the frame, its ``when`` and ``description`` are appended (space
+        separated) and the entry's ``id`` is used as the document id.
+        Missing or null fields contribute an empty string instead of raising.
+        """
+        text = self._as_text(frame.get("all_text"))
+        cat_entry = self._find_catalog_entry(frame["frame_id"])
+        if not cat_entry:
+            return frame["frame_id"], text
+        when = self._as_text(cat_entry.get("when"))
+        description = self._as_text(cat_entry.get("description"))
+        doc_id = cat_entry.get("id", frame["frame_id"])
+        return doc_id, f"{text} {when} {description}"
+
+    @staticmethod
+    def _normalize_catalog(data: Any) -> list[dict]:
+        """Reduce parsed YAML to a list of dict entries.
+
+        Accepts either a top-level list or a mapping with a ``blocks`` list.
+        Anything else (empty file, null ``blocks``, scalars) yields ``[]``,
+        and non-dict entries are dropped so later ``.get`` calls cannot fail.
+        """
+        if isinstance(data, dict):
+            data = data.get("blocks")
+        if not isinstance(data, list):
+            return []
+        return [entry for entry in data if isinstance(entry, dict)]
+
+    def _load_catalog(self, path: Path | str) -> list[dict]:
+        """Load the block catalog if it exists.
+
+        Returns:
+            The catalog entries, or ``[]`` when the file is missing, cannot
+            be read or parsed, or has an unexpected shape. Loading stays
+            best-effort, but failures are logged instead of being hidden.
+        """
+        path = Path(path)
+        if not path.exists():
+            return []
+        import yaml
+        try:
+            data = yaml.safe_load(path.read_text(encoding="utf-8"))
+        except Exception as exc:  # best-effort: the catalog is optional
+            logger.warning("[tfidf] failed to load catalog %s: %s", path, exc)
+            return []
+        return self._normalize_catalog(data)
+
+    def _find_catalog_entry(self, frame_id: str) -> dict | None:
+        """Return the first catalog entry whose ``source_frame`` is *frame_id*."""
+        return next(
+            (e for e in self.catalog if e.get("source_frame") == frame_id),
+            None,
+        )
+
+    def _compute_idf(self, documents: list[str]) -> dict[str, float]:
+        """Compute ``log(N / (df + 1))`` for every whitespace token.
+
+        NOTE(review): with this smoothing a token present in N-1 documents
+        gets weight 0 and one present in all N gets a slightly negative
+        weight. Cosine scoring only ever uses the squared weight, so the
+        sign is harmless, but very small indexes (N <= 2) lose most of
+        their discriminative power. The formula is kept as-is because the
+        score threshold used by callers is calibrated against it.
+        """
+        n_docs = len(documents)
+        doc_freq = Counter(w for doc in documents for w in set(doc.split()))
+        return {w: math.log(n_docs / (freq + 1)) for w, freq in doc_freq.items()}
+
+    def _tfidf_vec(self, text: str) -> dict[str, float]:
+        """Convert *text* into a sparse ``{token: tf * idf}`` vector.
+
+        Tokens absent from the index fall back to ``log(N + 1)``, which is
+        larger than any indexed token's IDF.
+        """
+        words = text.split()
+        if not words:
+            return {}
+        unseen_idf = math.log(len(self.doc_texts) + 1)
+        total = len(words)
+        return {
+            w: (count / total) * self.idf.get(w, unseen_idf)
+            for w, count in Counter(words).items()
+        }
+
+    def _doc_vectors(self) -> list[dict[str, float]]:
+        """Return the TF-IDF vectors of all indexed documents, cached.
+
+        The vectors depend only on ``doc_texts`` and ``idf``, so they are
+        built on first use and reused by later queries. The cache is rebuilt
+        when the number of documents changes; callers that edit ``doc_texts``
+        or ``idf`` in place should delete ``_doc_vec_cache`` themselves.
+        """
+        cached = getattr(self, "_doc_vec_cache", None)
+        if cached is None or len(cached) != len(self.doc_texts):
+            cached = [self._tfidf_vec(text) for text in self.doc_texts]
+            self._doc_vec_cache = cached
+        return cached
+
+    def _cosine(self, a: dict, b: dict) -> float:
+        """Return the cosine similarity of two sparse vectors (0.0 if either is zero)."""
+        if not a or not b:
+            return 0.0
+        # Only shared keys contribute to the dot product; walk the smaller dict.
+        small, large = (a, b) if len(a) <= len(b) else (b, a)
+        dot = sum(v * large[k] for k, v in small.items() if k in large)
+        mag_a = math.sqrt(sum(v ** 2 for v in a.values()))
+        mag_b = math.sqrt(sum(v ** 2 for v in b.values()))
+        if mag_a == 0 or mag_b == 0:
+            return 0.0
+        return dot / (mag_a * mag_b)
+
+    def _preprocess_query(self, text: str) -> str:
+        """Normalise MDX query text before tokenising.
+
+        Expands a few abbreviations with their Korean terms, turns brackets
+        and separator symbols into spaces, and collapses whitespace.
+        (The original comment states these are the same rules applied to the
+        frame text; that code is not visible here.)
+        """
+        # Abbreviations must be expanded before "/" is stripped below.
+        text = text.replace("S/W", "SW 소프트웨어")
+        text = text.replace("H/W", "HW 하드웨어")
+        text = re.sub(r'\bDX\b', 'DX 디지털전환', text)
+        text = re.sub(r'\bBIM\b', 'BIM 건설정보모델링', text)
+        text = self._SYMBOLS_RE.sub(" ", text)
+        return self._WHITESPACE_RE.sub(" ", text).strip()
+
+    def match(
+        self,
+        zone_title: str,
+        sub_titles: list[str] | None = None,
+        d1_items: list[str] | None = None,
+        top_k: int = 3,
+    ) -> list[dict]:
+        """Rank frames against a section title and its optional sub-texts.
+
+        Args:
+            zone_title: Section title; empty/None parts are skipped.
+            sub_titles: Optional sub-section titles added to the query.
+            d1_items: Optional first-level items added to the query.
+            top_k: Maximum number of candidates considered (negative
+                values are treated as 0).
+
+        Returns:
+            Up to ``top_k`` dicts, best first, containing only candidates
+            with a positive score:
+            [{"block_id": str, "frame_id": str, "score": float,
+              "method": "tfidf", "title_text": str,
+              "rough_structure": str, "item_count": int}]
+        """
+        if not self.doc_texts:
+            return []
+
+        # Build the query from every non-empty part.
+        parts = [zone_title, *(sub_titles or []), *(d1_items or [])]
+        query = self._preprocess_query(" ".join(p for p in parts if p))
+
+        # Score the query against every indexed document.
+        query_vec = self._tfidf_vec(query)
+        scores = [self._cosine(query_vec, vec) for vec in self._doc_vectors()]
+
+        # Stable sort: equal scores keep frame order.
+        order = sorted(range(len(scores)), key=lambda i: -scores[i])
+        results = []
+        for idx in order[:max(top_k, 0)]:
+            score = scores[idx]
+            if score <= 0:
+                continue
+            frame = self.frames[idx]
+            results.append({
+                "block_id": self.doc_ids[idx],
+                "frame_id": frame["frame_id"],
+                "score": round(score, 4),
+                "method": "tfidf",
+                "title_text": frame.get("title_text", ""),
+                "rough_structure": frame.get("rough_structure", ""),
+                "item_count": frame.get("item_count", 0),
+            })
+
+        if results:
+            best = results[0]
+            logger.info(
+                "[tfidf] '%s' → top: %s (score=%s, frame=%s)",
+                zone_title, best["block_id"], best["score"], best["frame_id"],
+            )
+
+        return results
+
+    def match_with_threshold(
+        self,
+        zone_title: str,
+        sub_titles: list[str] | None = None,
+        d1_items: list[str] | None = None,
+        threshold: float = 0.10,
+    ) -> dict | None:
+        """Return the best match if its score reaches *threshold*, else None.
+
+        ``None`` signals the caller to fall back to the recipe path.
+        """
+        results = self.match(zone_title, sub_titles, d1_items, top_k=1)
+        if results and results[0]["score"] >= threshold:
+            return results[0]
+        return None