first commit

2025-10-27 09:18:24 +09:00
commit a5e23e8da5
20 changed files with 1283 additions and 0 deletions
--- a/utils/file_handler.py
+++ b/utils/file_handler.py
@@ -0,0 +1,99 @@
+import asyncio
+import logging
+import os
+import re
+
+import docx
+import fitz
+from pdf2image import convert_from_path
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+async def process_file(file_path, ocr_model):
+    """
+    파일 경로를 기반으로 파일 유형을 확인하고 적절한 처리를 수행합니다.
+    - PDF, 이미지는 OCR을 위해 이미지 객체 리스트를 반환합니다.
+    - DOCX는 직접 텍스트를 추출하여 반환합니다.
+    - 지원하지 않는 형식은 ValueError를 발생시킵니다.
+    """
+    ext = os.path.splitext(file_path)[-1].lower()
+    images = []
+    text_only = None
+    needs_ocr = False
+
+    # Upstage는 원본 파일 업로드 → 변환 불필요
+    if ocr_model == "upstage":
+        # if ext == ".pdf":
+        #     text_only = await asyncio.to_thread(extract_text_from_pdf_direct, file_path)
+        #     if text_only.strip():  # 텍스트가 충분히 추출되었다면 OCR 생략
+        #         logger.info(f"[UTILS-TEXT] {ocr_model}: PDF 텍스트 충분 → OCR 생략")
+        #         needs_ocr = False
+        #         return images, text_only, needs_ocr
+        #     else:  # 텍스트가 충분하지 않다면 OCR 필요
+        #         logger.info(f"[FILE-HANDLER] {ocr_model}: PDF 텍스트 부족 → OCR 필요")
+        #         needs_ocr = True
+        #         return images, text_only, needs_ocr
+        # else:
+        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF 외 파일은 OCR 필요 (파일 변환 불필요) ")
+        needs_ocr = True
+        return images, text_only, needs_ocr
+
+    # Upstage가 아닌 경우 파일 형식에 따라 처리
+    if ext == ".pdf":
+        # text_only = await asyncio.to_thread(extract_text_from_pdf_direct, file_path)
+        # if text_only.strip():  # 텍스트가 충분히 추출되었다면 OCR 생략
+        #     logger.info(f"[UTILS-TEXT] {ocr_model}: PDF 텍스트 충분 → OCR 생략")
+        #     needs_ocr = False
+        #     return images, text_only, needs_ocr
+
+        images = await asyncio.to_thread(convert_from_path, file_path, dpi=400)
+        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF → 이미지 변환 완료 ({len(images)} 페이지)")
+        needs_ocr = True
+
+    elif ext in [".jpg", ".jpeg", ".png"]:
+        img = await asyncio.to_thread(Image.open, file_path)
+        images = [img]
+        logger.info(f"[FILE-HANDLER] {ocr_model}: 이미지 파일 로딩 완료")
+        needs_ocr = True
+
+    elif ext == ".docx":
+        text_only = await asyncio.to_thread(extract_text_from_docx, file_path)
+        logger.info(f"[FILE-HANDLER] {ocr_model}: Word 문서 텍스트 추출 완료")
+        needs_ocr = False
+
+    else:
+        logger.error(f"[ERROR] 지원하지 않는 파일 형식: {ext}")
+        raise ValueError("지원하지 않는 파일 형식입니다. (PDF, JPG, JPEG, PNG, DOCX)")
+
+    return images, text_only, needs_ocr
+
+
+def extract_text_from_pdf_direct(pdf_path):
+    text = ""
+    try:
+        with fitz.open(pdf_path) as doc:
+            for page in doc:
+                text += page.get_text()
+                valid_chars = re.findall(r"[가-힣a-zA-Z]", text)
+                logger.info(f"len(valid_chars): {len(valid_chars)}")
+                if len(valid_chars) < 10:
+                    return text  # 텍스트가 충분하지 않으면 바로 반환
+                else:
+                    text += page.get_text()
+    except Exception as e:
+        logger.info("[ERROR] PDF 텍스트 추출 실패:", e)
+    return text
+
+
+def extract_text_from_docx(docx_path):
+    """DOCX 파일에서 텍스트를 추출합니다."""
+    text = ""
+    try:
+        doc = docx.Document(docx_path)
+        for para in doc.paragraphs:
+            text += para.text + "\n"
+    except Exception as e:
+        logger.error(f"[ERROR] DOCX 텍스트 추출 실패: {e}")
+    return text