From 604c29403f6cbbe6d616793c73498788e0072db9 Mon Sep 17 00:00:00 2001
From: Taehoon <thlee3@samaneng.com>
Date: Thu, 26 Feb 2026 17:54:51 +0900
Subject: [PATCH] =?UTF-8?q?analyze.py=20-=20=ED=85=8D=EC=8A=A4=ED=8A=B8=20?=
 =?UTF-8?q?=EB=B9=84=EA=B5=90=20=EB=B0=A9=EC=8B=9D=EC=9C=BC=EB=A1=9C=20?=
 =?UTF-8?q?=EB=B6=84=EC=84=9D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 analyze.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 analyze.py

diff --git a/analyze.py b/analyze.py
new file mode 100644
index 0000000..abf2233
--- /dev/null
+++ b/analyze.py
@@ -0,0 +1,68 @@
+import os
+import re
+import unicodedata
+from pypdf import PdfReader
+try:  # Optional OCR stack: if any of these imports fails, OCR_AVAILABLE is False.
+    import pytesseract
+    from pdf2image import convert_from_path
+    from PIL import Image
+    TESSERACT_PATH = r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'  # raw string: "\t" must stay a backslash + t, not a TAB
+    POPPLER_PATH = r'D:\이태훈\00크롬다운로드\poppler-25.12.0\Library\bin'  # machine-specific location passed to pdf2image
+    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+
+def analyze_file_content(filename: str):
+    """Suggest a filing path for ``sample/<filename>`` by keyword matching.
+
+    PDFs: pypdf text of the first 5 pages, plus Tesseract OCR of the first 2
+    pages when OCR is usable; other files: first 5000 chars read as UTF-8.
+    Returns the result dict built below, or {"error": "File not found"}.
+    """
+    file_path = os.path.join("sample", filename)
+    if not os.path.exists(file_path):
+        return {"error": "File not found"}
+
+    content_parts = []
+    log_steps = []
+    try:
+        if filename.lower().endswith(".pdf"):
+            log_steps.append("1. 레이어: 파일 제목 분석 중...")
+            log_steps.append("2. 레이어: PDF 텍스트 엔진 가동...")
+            page_texts = (page.extract_text() for page in PdfReader(file_path).pages[:5])
+            text_extracted = "".join(text + "\n" for text in page_texts if text)
+            if text_extracted.strip():
+                content_parts.append(unicodedata.normalize('NFC', text_extracted))
+
+            log_steps.append("3. 레이어: OCR 이미지 스캔 강제 실행...")
+            if OCR_AVAILABLE and os.path.exists(TESSERACT_PATH):
+                images = convert_from_path(file_path, first_page=1, last_page=2, poppler_path=POPPLER_PATH)
+                for img in images:
+                    page_text = pytesseract.image_to_string(img, lang='kor+eng')
+                    content_parts.append(unicodedata.normalize('NFC', page_text))
+        else:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                content_parts.append(unicodedata.normalize('NFC', f.read(5000)))
+    except Exception as exc:
+        # Best effort: keep whatever was extracted so far, but record the failure
+        # in the returned log instead of silently discarding it.
+        log_steps.append(f"오류: 내용 추출 실패 ({type(exc).__name__}: {exc})")
+
+    full_content = "\n".join(content_parts)
+    # The file name is searched too; spaces and newlines are removed before matching.
+    search_pool = (full_content + " " + filename).lower().replace(" ", "").replace("\n", "")
+
+    result = {
+        "suggested_path": "분석실패",
+        "confidence": "Low",
+        "log_steps": log_steps,
+        "raw_text": full_content,
+        "reason": "일치 키워드 없음"
+    }
+    if "실정보고" in search_pool and any(k in search_pool for k in ("어천", "공주")):
+        sub_folder = "품질관리" if "품질" in search_pool else "기타"
+        result["suggested_path"] = f"설계변경 > 실정보고(어천~공주) > {sub_folder}"
+        result["confidence"] = "100%"
+        result["reason"] = "3중 레이어 분석: 실정보고+어천공주 키워드 통합 검출"
+    return result