# ocr_gateway_test/utils/file_handler.py

import asyncio
import logging
import os
import re

import docx
import fitz
from pdf2image import convert_from_path
from PIL import Image

# Module-level logger, named after this module's import path via __name__.
logger = logging.getLogger(__name__)


def _open_image_loaded(file_path):
    """Open an image file and read its pixel data immediately.

    ``Image.open`` is lazy: it only identifies the file and leaves the actual
    decode for the first access. Calling ``load()`` here means the decode
    happens in whichever thread runs this helper rather than later in the
    caller, and lets Pillow release the underlying file handle (Pillow
    documents this for single-frame images).
    """
    img = Image.open(file_path)
    try:
        img.load()
    except Exception:
        # Do not leak the open file handle when the image data is unreadable.
        img.close()
        raise
    return img


async def process_file(file_path, ocr_model):
    """Inspect the file extension and prepare the file for the OCR pipeline.

    Args:
        file_path: Path of the file to process. Only its extension is used to
            choose the handling branch (compared case-insensitively).
        ocr_model: Name of the OCR backend. The value ``"upstage"`` skips all
            local conversion; any other value goes through the per-extension
            handling below.

    Returns:
        A tuple ``(images, text_only, needs_ocr)``:
        - ``images``: list of image objects to OCR (empty for "upstage" and
          for DOCX).
        - ``text_only``: text extracted directly from a DOCX file, otherwise
          ``None``.
        - ``needs_ocr``: ``True`` when OCR still has to run, ``False`` when
          ``text_only`` already holds the result.

    Raises:
        ValueError: If ``ocr_model`` is not "upstage" and the extension is not
            one of .pdf, .jpg, .jpeg, .png, .docx. The "upstage" branch does
            not check the extension at all.
    """
    ext = os.path.splitext(file_path)[1].lower()

    # Upstage receives the original file, so no local conversion is done and
    # every file is reported as needing OCR.
    # NOTE(review): the log text below still says "files other than PDF",
    # but this branch currently applies to every file, PDFs included.
    if ocr_model == "upstage":
        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF 외 파일은 OCR 필요 (파일 변환 불필요) ")
        return [], None, True

    if ext == ".pdf":
        # Page rasterization is blocking work, so it runs in a worker thread.
        images = await asyncio.to_thread(convert_from_path, file_path, dpi=400)
        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF → 이미지 변환 완료 ({len(images)} 페이지)")
        return images, None, True

    if ext in (".jpg", ".jpeg", ".png"):
        # Decode inside the worker thread; a bare Image.open would defer the
        # real read until the caller first touches the pixels.
        img = await asyncio.to_thread(_open_image_loaded, file_path)
        logger.info(f"[FILE-HANDLER] {ocr_model}: 이미지 파일 로딩 완료")
        return [img], None, True

    if ext == ".docx":
        # DOCX text is read directly, so no OCR pass is required.
        text_only = await asyncio.to_thread(extract_text_from_docx, file_path)
        logger.info(f"[FILE-HANDLER] {ocr_model}: Word 문서 텍스트 추출 완료")
        return [], text_only, False

    logger.error(f"[ERROR] 지원하지 않는 파일 형식: {ext}")
    raise ValueError("지원하지 않는 파일 형식입니다. (PDF, JPG, JPEG, PNG, DOCX)")


def extract_text_from_pdf_direct(pdf_path, min_valid_chars=10):
    """Extract the embedded text layer of a PDF, page by page.

    After each page the number of Hangul syllables and ASCII letters seen so
    far is compared with ``min_valid_chars``. If the running total is still
    below the threshold, extraction stops and the text gathered up to that
    point is returned. Because the total only grows, this can only trigger
    on the first page unless an earlier page already failed the check.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.
        min_valid_chars: Minimum cumulative count of Hangul/Latin letters
            required to keep reading further pages. Defaults to 10.

    Returns:
        The concatenated page text. On any extraction error the failure is
        logged and whatever text was collected before the error is returned
        (an empty string if nothing was read).
    """
    page_texts = []
    valid_count = 0
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Read each page exactly once; appending it a second time
                # would duplicate every page in the result.
                page_text = page.get_text()
                page_texts.append(page_text)
                # Count only the new page and keep a running total instead
                # of re-scanning the whole accumulated text every iteration.
                valid_count += len(re.findall(r"[가-힣a-zA-Z]", page_text))
                logger.info(f"len(valid_chars): {valid_count}")
                if valid_count < min_valid_chars:
                    break  # not enough real text; stop and return what we have
    except Exception as e:
        # Best-effort extraction: log the failure and fall through to return
        # the partial text. The exception is formatted into the message; a
        # bare extra argument without a placeholder cannot be formatted by
        # the logging module.
        logger.error(f"[ERROR] PDF 텍스트 추출 실패: {e}")
    return "".join(page_texts)


def extract_text_from_docx(docx_path):
    """Extract text from a DOCX file.

    Each paragraph's text is emitted followed by a newline. If reading fails,
    the error is logged and the text collected before the failure is returned
    (an empty string if nothing was read).
    """
    lines = []
    try:
        document = docx.Document(docx_path)
        # Explicit loop so paragraphs read before a mid-iteration failure are
        # still kept in the result.
        for paragraph in document.paragraphs:
            lines.append(paragraph.text + "\n")
    except Exception as err:
        logger.error(f"[ERROR] DOCX 텍스트 추출 실패: {err}")
    return "".join(lines)