"""Classify a document under ``sample/`` by keyword matching on its text.

Origin: commit 604c29403f6cbbe6d616793c73498788e0072db9 (Taehoon,
2026-02-26), subject "analyze.py - analysis by text comparison".

For PDFs the text is collected from up to three sources ("layers"): the file
name, the embedded PDF text (pypdf) and OCR of the first pages
(pdf2image + pytesseract).  Any other file is read as UTF-8 text.
"""
import logging
import os
import re  # noqa: F401  (imported by the original module; kept for compatibility)
import unicodedata

try:
    from pypdf import PdfReader
except ImportError:
    # Without pypdf the PDF text layer is skipped; other files still work.
    PdfReader = None

# Machine-specific tool locations.  The original values stay the defaults;
# the environment variables allow running on another machine.
TESSERACT_PATH = os.environ.get(
    "TESSERACT_PATH",
    r'C:\Users\User\AppData\Local\Programs\Tesseract-OCR\tesseract.exe',
)
POPPLER_PATH = os.environ.get(
    "POPPLER_PATH",
    r'D:\이태훈\00크롬다운로드\poppler-25.12.0\Library\bin',
)

try:
    import pytesseract
    from pdf2image import convert_from_path
    from PIL import Image  # noqa: F401  (imported by the original module)
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False

logger = logging.getLogger(__name__)

_PDF_TEXT_PAGES = 5    # pages read through the embedded-text layer
_OCR_LAST_PAGE = 2     # OCR covers pages 1.._OCR_LAST_PAGE
_TEXT_READ_CHARS = 5000  # characters read from a non-PDF file


def _nfc(text):
    """Return *text* in Unicode NFC form (composed Hangul syllables)."""
    return unicodedata.normalize('NFC', text)


def _extract_pdf_text(file_path):
    """Return the embedded text of the first pages, one page per line."""
    reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in reader.pages[:_PDF_TEXT_PAGES])
    return "".join(text + "\n" for text in page_texts if text)


def _iter_ocr_text(file_path):
    """Yield the NFC-normalized OCR text of each rasterized page."""
    images = convert_from_path(
        file_path, first_page=1, last_page=_OCR_LAST_PAGE, poppler_path=POPPLER_PATH
    )
    for img in images:
        yield _nfc(pytesseract.image_to_string(img, lang='kor+eng'))


def _classify(search_pool):
    """Return ``(suggested_path, reason)`` for a keyword match, else ``None``."""
    if "실정보고" not in search_pool:
        return None
    if not any(k in search_pool for k in ["어천", "공주"]):
        return None
    if "품질" in search_pool:
        suggested_path = "설계변경 > 실정보고(어천~공주) > 품질관리"
    else:
        suggested_path = "설계변경 > 실정보고(어천~공주) > 기타"
    return suggested_path, "3중 레이어 분석: 실정보고+어천공주 키워드 통합 검출"


def analyze_file_content(filename: str):
    """Suggest a filing path for ``sample/<filename>`` from its text.

    Args:
        filename: Name of the file, relative to the ``sample`` directory.

    Returns:
        ``{"error": "File not found"}`` when the file does not exist.
        Otherwise a dict with the keys ``suggested_path``, ``confidence``,
        ``log_steps`` (progress messages, only filled for PDFs),
        ``raw_text`` (the extracted text) and ``reason``.

    Extraction is best-effort: a failing layer is logged and skipped, and the
    remaining layers plus the file name are still used for matching.
    """
    # NOTE(review): filename is joined into the path unchecked; if it can come
    # from an untrusted caller, confirm that "../" escapes are rejected upstream.
    file_path = os.path.join("sample", filename)
    if not os.path.exists(file_path):
        return {"error": "File not found"}

    content_parts = []
    log_steps = []

    if filename.lower().endswith(".pdf"):
        log_steps.append("1. 레이어: 파일 제목 분석 중...")
        log_steps.append("2. 레이어: PDF 텍스트 엔진 가동...")
        if PdfReader is None:
            logger.warning("pypdf is not installed; skipping PDF text layer for %s", file_path)
        else:
            try:
                text_extracted = _extract_pdf_text(file_path)
            except Exception:
                # Best-effort: a broken text layer must not prevent the OCR layer.
                logger.exception("PDF text extraction failed for %s", file_path)
            else:
                if text_extracted.strip():
                    content_parts.append(_nfc(text_extracted))

        log_steps.append("3. 레이어: OCR 이미지 스캔 강제 실행...")
        if OCR_AVAILABLE and os.path.exists(TESSERACT_PATH):
            try:
                # Append page by page so earlier pages survive a later failure.
                for page_text in _iter_ocr_text(file_path):
                    content_parts.append(page_text)
            except Exception:
                logger.exception("OCR failed for %s", file_path)
    else:
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content_parts.append(_nfc(f.read(_TEXT_READ_CHARS)))
        except OSError:
            logger.exception("Could not read %s", file_path)

    full_content = "\n".join(content_parts)
    # Normalize the whole pool: file names may arrive decomposed (NFD, e.g.
    # from macOS) and would otherwise never match the NFC keywords below.
    search_pool = (
        _nfc(full_content + " " + filename).lower().replace(" ", "").replace("\n", "")
    )

    result = {
        "suggested_path": "분석실패",
        "confidence": "Low",
        "log_steps": log_steps,
        "raw_text": full_content,
        "reason": "일치 키워드 없음",
    }

    match = _classify(search_pool)
    if match is not None:
        result["suggested_path"], result["reason"] = match
        result["confidence"] = "100%"

    return result