analyze.md - 텍스트 비교 방식으로 분석

2026-02-26 17:49:23 +09:00
parent af9d27bee8
commit feb7cb9004
6 changed files with 1725 additions and 69 deletions
--- a/analyze.py
+++ b/analyze.py
@@ -0,0 +1,92 @@
+import os
+import re
+import unicodedata
+from pypdf import PdfReader
+# Optional OCR stack: OCR_AVAILABLE ends up False when any of the OCR
+# packages below cannot be imported.
+try:
+    import pytesseract
+    from pdf2image import convert_from_path
+    from PIL import Image
+    # Machine-specific install locations. The TESSERACT_PATH / POPPLER_PATH
+    # environment variables override them; when unset the original defaults apply.
+    TESSERACT_PATH = os.environ.get(
+        "TESSERACT_PATH",
+        r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
+    )
+    POPPLER_PATH = os.environ.get(
+        "POPPLER_PATH",
+        r'D:\이태훈\00크롬다운로드\poppler-25.12.0\Library\bin',
+    )
+    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+
+def analyze_file_content(filename: str) -> dict:
+    """Suggest a filing path for a file under ``sample/`` via keyword matching.
+
+    Three text layers are collected and merged into one lower-cased,
+    whitespace-free pool: the file name, the text extracted from the first
+    five pages when the name ends in ``.pdf``, and OCR output for the first
+    two pages (only when the optional OCR stack imported successfully and the
+    configured Tesseract binary exists). Fixed Korean keywords are then
+    searched in that pool to pick a suggestion.
+
+    Args:
+        filename: Name of the file, resolved relative to the ``sample``
+            directory.
+
+    Returns:
+        ``{"error": "File not found"}`` when no regular file exists at the
+        resolved path; otherwise a dict with the keys ``suggested_path``,
+        ``confidence``, ``log_steps``, ``raw_text`` and ``reason``.
+    """
+    # NOTE(review): filename is joined without sanitising. If it can come from
+    # an untrusted caller, confirm that ".." / absolute paths are rejected upstream.
+    file_path = os.path.join("sample", filename)
+    # isfile() rather than exists(): an empty name resolves to the directory itself.
+    if not os.path.isfile(file_path):
+        return {"error": "File not found"}
+
+    log_steps = []
+
+    # Layer 1: file title (quick).
+    log_steps.append("1. 레이어: 파일 제목(Title) 스캔 중...")
+    # NFC-normalise like the other two layers; a decomposed (NFD) Hangul file
+    # name would otherwise never match the composed keywords used below.
+    title_text = unicodedata.normalize('NFC', filename)
+
+    # Layer 2: embedded PDF text (fast).
+    log_steps.append("2. 레이어: PDF 텍스트 엔진(Extraction) 가동...")
+    page_texts = []
+    extract_error = None
+    try:
+        if filename.lower().endswith(".pdf"):
+            reader = PdfReader(file_path)
+            # Only the leading pages are read, not the whole document.
+            for page in reader.pages[:5]:
+                page_txt = page.extract_text()
+                if page_txt:
+                    page_texts.append(page_txt + "\n")
+    except Exception as e:  # best effort: a broken PDF must not abort the analysis
+        extract_error = e
+    # Pages read before a failure are kept, and normalised as well.
+    text_content = unicodedata.normalize('NFC', "".join(page_texts))
+    if extract_error is None:
+        log_steps.append(f"   - 텍스트 데이터 확보 완료 ({len(text_content)}자)")
+    else:
+        log_steps.append(f"   - 텍스트 추출 실패: {str(extract_error)[:20]}")
+
+    # Layer 3: OCR (deep).
+    log_steps.append("3. 레이어: OCR 이미지 스캔(Vision) 강제 실행...")
+    ocr_parts = []
+    if OCR_AVAILABLE and os.path.exists(TESSERACT_PATH):
+        try:
+            # Only the first two pages are rasterised (speed vs. accuracy trade-off).
+            images = convert_from_path(file_path, first_page=1, last_page=2, poppler_path=POPPLER_PATH)
+            for img in images:
+                page_ocr = pytesseract.image_to_string(img, lang='kor+eng')
+                ocr_parts.append(unicodedata.normalize('NFC', page_ocr) + "\n")
+            log_steps.append(f"   - OCR 스캔 완료 ({sum(map(len, ocr_parts))}자)")
+        except Exception as e:
+            log_steps.append(f"   - OCR 오류: {str(e)[:20]}")
+    ocr_content = "".join(ocr_parts)
+
+    # Merge the three layers. ALL whitespace is dropped (tabs and carriage
+    # returns too, not just spaces and newlines) so that a keyword broken up
+    # by layout whitespace still matches.
+    merged = f"{title_text} | {text_content} | {ocr_content}".lower()
+    full_pool = "".join(merged.split())
+
+    # Default result, used when no keyword matches.
+    result = {
+        "suggested_path": "분석실패",
+        "confidence": "Low",
+        "log_steps": log_steps,
+        "raw_text": f"--- TITLE ---\n{filename}\n\n--- TEXT ---\n{text_content[:1000]}\n\n--- OCR ---\n{ocr_content[:1000]}",
+        "reason": "학습된 키워드 일치 항목 없음"
+    }
+
+    # Final recommendation: project keywords cross-check the document type.
+    is_eocheon = any(k in full_pool for k in ("어천", "공주"))
+
+    # "실정보고" contains "실정", so one containment test covers both keywords.
+    if "실정" in full_pool:
+        if is_eocheon:
+            if "품질" in full_pool:
+                result["suggested_path"] = "설계변경 > 실정보고(어천~공주) > 품질관리"
+                result["reason"] = "3중 레이어 분석: 실정보고+어천공주+품질관리 키워드 통합 검출"
+            elif any(k in full_pool for k in ("토지", "임대")):
+                result["suggested_path"] = "설계변경 > 실정보고(어천~공주) > 기타"
+                result["reason"] = "3중 레이어 분석: 토지임대 관련 실정보고(어천-공주) 확인"
+            else:
+                result["suggested_path"] = "설계변경 > 실정보고(어천~공주) > 기타"
+                result["reason"] = "3중 레이어 분석: 실정보고(어천-공주) 문서 판정"
+            result["confidence"] = "100%"
+        else:
+            # Fallback: document type found but the project name was not confirmed.
+            result["suggested_path"] = "설계변경 > 실정보고(어천~공주) > 기타"
+            result["confidence"] = "80%"
+            result["reason"] = "실정보고 키워드는 발견되었으나 프로젝트명 교차 검증 실패 (기본값 제안)"
+
+    elif "품질" in full_pool:
+        result["suggested_path"] = "공사관리 > 품질 관리 > 품질시험계획서"
+        result["confidence"] = "90%"
+        result["reason"] = "텍스트/OCR 레이어에서 품질 관리 지표 다수 식별"
+
+    return result