first commit

2025-10-27 09:18:24 +09:00
commit a5e23e8da5
20 changed files with 1283 additions and 0 deletions
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/utils/celery_utils.py
+++ b/utils/celery_utils.py
@@ -0,0 +1,13 @@
+# utils/celery_utils.py
+from celery import Celery
+from config.setting import CELERY_BROKER_URL, CELERY_RESULT_BACKEND
+
+# Define and export the single Celery app instance.
+# "ocr_tasks" is passed as the app's main name; the broker URL and the result
+# backend URL are both read from config.setting.
+celery_app = Celery(
+    "ocr_tasks", broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND
+)
+
+
+@celery_app.task(name="health_check")
+def health_check():
+    """Return a fixed ``{"status": "ok"}`` payload; registered as the "health_check" task."""
+    return dict(status="ok")
--- a/utils/checking_keys.py
+++ b/utils/checking_keys.py
@@ -0,0 +1,14 @@
+import logging
+import threading
+import time
+
+from dotenv import load_dotenv
+from snowflake import SnowflakeGenerator
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Runs at import time. NOTE(review): presumably loads variables from a .env file
+# (python-dotenv); nothing in this module reads them directly — confirm it is needed here.
+load_dotenv()
+
+# One generator is kept per node so that consecutive calls share a single
+# sequence counter; the lock serializes access to that mutable state.
+_key_generators = {}
+_key_generators_lock = threading.Lock()
+
+
+def create_key(node: int = 1) -> str:
+    """
+    Create a unique key (used as a request_id) with the Snowflake algorithm.
+
+    A generator is created once per ``node`` and reused. Building a new
+    generator on every call restarts its sequence counter, so two calls made
+    within the same millisecond could return the same id.
+
+    Args:
+        node: Snowflake instance (node) number passed to ``SnowflakeGenerator``.
+
+    Returns:
+        The generated id as a decimal string.
+
+    Raises:
+        RuntimeError: if the generator keeps yielding no id after repeated
+            retries.
+
+    Note:
+        Uniqueness is only guaranteed within one process; separate processes
+        must be given different ``node`` values.
+    """
+    with _key_generators_lock:
+        generator = _key_generators.get(node)
+        if generator is None:
+            generator = _key_generators[node] = SnowflakeGenerator(node)
+
+        # NOTE(review): snowflake-id's generator is type-hinted to yield None
+        # (sequence exhausted within one millisecond, or the clock moved
+        # backwards) — confirm against the installed version. Waiting one
+        # millisecond and retrying avoids returning the string "None".
+        for _ in range(1000):
+            key = next(generator)
+            if key is not None:
+                return str(key)
+            time.sleep(0.001)
+
+    raise RuntimeError(f"Snowflake key generation failed for node {node}")
--- a/utils/file_handler.py
+++ b/utils/file_handler.py
@@ -0,0 +1,99 @@
+import asyncio
+import logging
+import os
+import re
+
+import docx
+import fitz
+from pdf2image import convert_from_path
+from PIL import Image
+
+logger = logging.getLogger(__name__)
+
+
+async def process_file(file_path, ocr_model):
+    """
+    Inspect the file extension and prepare the input for the chosen OCR model.
+
+    - "upstage": nothing is converted or loaded here and OCR is always
+      requested (the extension is not validated on this path).
+    - PDF: the pages are converted to images at dpi=400 for OCR.
+    - JPG / JPEG / PNG: the file is opened as a single image for OCR.
+    - DOCX: the text is extracted directly and OCR is not needed.
+    - Any other extension raises ValueError.
+
+    Returns:
+        A tuple ``(images, text_only, needs_ocr)``: the list of images to OCR
+        (empty when none were loaded), the directly extracted text (``None``
+        unless the file is a DOCX), and whether OCR still has to run.
+
+    Raises:
+        ValueError: if the extension is unsupported and the model is not "upstage".
+    """
+    ext = os.path.splitext(file_path)[-1].lower()
+
+    # Upstage receives the original file, so no conversion is done here.
+    # NOTE: direct PDF text extraction (extract_text_from_pdf_direct) is
+    # currently not wired in; PDFs always go through OCR.
+    if ocr_model == "upstage":
+        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF 외 파일은 OCR 필요 (파일 변환 불필요) ")
+        return [], None, True
+
+    if ext == ".pdf":
+        pages = await asyncio.to_thread(convert_from_path, file_path, dpi=400)
+        logger.info(f"[FILE-HANDLER] {ocr_model}: PDF → 이미지 변환 완료 ({len(pages)} 페이지)")
+        return pages, None, True
+
+    if ext in (".jpg", ".jpeg", ".png"):
+        picture = await asyncio.to_thread(Image.open, file_path)
+        logger.info(f"[FILE-HANDLER] {ocr_model}: 이미지 파일 로딩 완료")
+        return [picture], None, True
+
+    if ext == ".docx":
+        document_text = await asyncio.to_thread(extract_text_from_docx, file_path)
+        logger.info(f"[FILE-HANDLER] {ocr_model}: Word 문서 텍스트 추출 완료")
+        return [], document_text, False
+
+    logger.error(f"[ERROR] 지원하지 않는 파일 형식: {ext}")
+    raise ValueError("지원하지 않는 파일 형식입니다. (PDF, JPG, JPEG, PNG, DOCX)")
+
+
+def extract_text_from_pdf_direct(pdf_path):
+    """
+    Extract the embedded text of a PDF without OCR.
+
+    Pages are read in order and each page's text is added exactly once.
+    After each page the Hangul/Latin letters collected so far are counted;
+    if fewer than 10 have been found, extraction stops and the text
+    gathered up to that point is returned, so the caller can treat the PDF
+    as lacking a usable text layer.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+
+    Returns:
+        The concatenated page text. If opening or reading fails, the error
+        is logged and whatever was collected so far (possibly "") is returned.
+    """
+    page_texts = []
+    valid_count = 0
+    try:
+        with fitz.open(pdf_path) as doc:
+            for page in doc:
+                page_text = page.get_text()
+                page_texts.append(page_text)
+                # Counting per page and summing equals counting over the
+                # accumulated text, because the pattern matches single characters.
+                valid_count += len(re.findall(r"[가-힣a-zA-Z]", page_text))
+                logger.info(f"len(valid_chars): {valid_count}")
+                if valid_count < 10:
+                    break  # not enough text: stop and return what we have
+    except Exception as e:
+        # Best-effort: log the failure and return what has been collected.
+        logger.error(f"[ERROR] PDF 텍스트 추출 실패: {e}")
+    return "".join(page_texts)
+
+
+def extract_text_from_docx(docx_path):
+    """
+    Extract text from a DOCX file.
+
+    Only ``doc.paragraphs`` is read; each paragraph's text is followed by a
+    newline. On failure the error is logged and the text collected so far
+    (possibly "") is returned.
+    """
+    lines = []
+    try:
+        for paragraph in docx.Document(docx_path).paragraphs:
+            lines.append(f"{paragraph.text}\n")
+    except Exception as e:
+        logger.error(f"[ERROR] DOCX 텍스트 추출 실패: {e}")
+    return "".join(lines)
--- a/utils/ocr_processor.py
+++ b/utils/ocr_processor.py
@@ -0,0 +1,14 @@
+def ocr_process(filename, ocr_model, coord, text, start_time, end_time):
+    """
+    Assemble the OCR result payload.
+
+    Returns a dict with the file name, the model name, timing information
+    (``duration_sec`` is ``end_time - start_time`` formatted with two
+    decimals, as a string), the coordinate entries under "fields" and the
+    extracted text under "parsed".
+    """
+    elapsed = end_time - start_time
+    timing = {
+        "duration_sec": f"{elapsed:.2f}",
+        "started_at": start_time,
+        "ended_at": end_time,
+    }
+    model_info = {"ocr_model": ocr_model}
+
+    return {
+        "filename": filename,
+        "model": model_info,
+        "time": timing,
+        "fields": coord,
+        "parsed": text,
+    }
--- a/utils/preprocessor.py
+++ b/utils/preprocessor.py
@@ -0,0 +1,61 @@
+import cv2
+import numpy as np
+from PIL import Image
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def to_rgb_uint8(img_np: np.ndarray) -> np.ndarray:
+    """
+    Normalize an image array to 3-channel uint8.
+
+    Accepted shapes are HxW, HxWx1, HxWx3 and HxWx4. Non-uint8 input is
+    converted to float32, multiplied by 255 when its maximum is <= 1.0,
+    clipped to [0, 255] and cast to uint8. A uint8 HxWx3 array is returned
+    unchanged (same object).
+
+    Raises:
+        ValueError: if the input is None, is not 2-D/3-D, or has a channel
+            count other than 1, 3 or 4.
+    """
+    if img_np is None:
+        raise ValueError("Input image is None")
+
+    # Normalize dtype and value range.
+    if img_np.dtype != np.uint8:
+        scaled = img_np.astype(np.float32)
+        if scaled.max() <= 1.0:  # looks like [0, 1] data: scale up
+            scaled *= 255.0
+        img_np = np.clip(scaled, 0, 255).astype(np.uint8)
+
+    # Normalize the channel layout.
+    if img_np.ndim == 2:  # HxW
+        return cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB)
+    if img_np.ndim != 3:
+        raise ValueError(f"Unsupported ndim: {img_np.ndim}")
+
+    channels = img_np.shape[2]
+    if channels == 3:
+        return img_np  # used as is
+    if channels == 1:
+        return cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB)
+    if channels == 4:
+        return cv2.cvtColor(img_np, cv2.COLOR_RGBA2RGB)
+    raise ValueError(f"Unsupported channel count: {channels}")
+
+# Preprocessing for Tesseract
+def tess_prep_cv2(pil_img):
+    """
+    Preprocess a PIL image for Tesseract and return the result as a PIL image.
+
+    Steps: convert to RGB, convert to grayscale, bilateral filter, Gaussian
+    adaptive threshold (binary), then resize by a factor of 2 on both axes.
+    """
+    logger.info("[UTILS-OCR] 이미지 전처리 시작")
+
+    rgb = np.array(pil_img.convert("RGB"))  # PIL -> array for OpenCV
+    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)  # grayscale
+    denoised = cv2.bilateralFilter(gray, 9, 75, 75)  # noise removal
+    # Block size 31, constant 10 (contrast enhancement).
+    binary = cv2.adaptiveThreshold(
+        denoised,
+        255,
+        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+        cv2.THRESH_BINARY,
+        31,
+        10,
+    )
+    # Upscale the resolution.
+    enlarged = cv2.resize(binary, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
+
+    return Image.fromarray(enlarged)
--- a/utils/redis_utils.py
+++ b/utils/redis_utils.py
@@ -0,0 +1,22 @@
+# utils/redis_utils.py
+
+import redis
+from config.setting import REDIS_DB, REDIS_HOST, REDIS_PORT
+
+
+def get_redis_client():
+    """
+    Return a Redis client built from the configured host, port and DB.
+
+    ``decode_responses=True`` makes the client decode replies to strings.
+    The connection is verified with a ping before the client is returned;
+    a new client is created on every call.
+
+    Raises:
+        RuntimeError: if the connection fails. The original
+            ``redis.ConnectionError`` is chained as the cause.
+    """
+    try:
+        redis_client = redis.Redis(
+            host=REDIS_HOST,
+            port=REDIS_PORT,
+            db=REDIS_DB,
+            decode_responses=True,
+        )
+        # Verify the connection (ping).
+        redis_client.ping()
+        return redis_client
+    except redis.ConnectionError as e:
+        # Chain the cause so the traceback shows the underlying Redis error.
+        raise RuntimeError(f"Redis 연결 실패: {e}") from e
--- a/utils/text_extractor.py
+++ b/utils/text_extractor.py
@@ -0,0 +1,316 @@
+import asyncio
+import logging
+import os
+from pathlib import Path
+
+import cv2
+import httpx
+import numpy as np
+import paddle
+import pytesseract
+from config.setting import UPSTAGE_API_KEY, UPSTAGE_API_URL
+from paddleocr import PaddleOCR, PPStructureV3
+
+from .file_handler import process_file
+from .preprocessor import tess_prep_cv2, to_rgb_uint8
+
+logger = logging.getLogger(__name__)
+
+
+# Module-level caches for the PaddleOCR and PPStructure models.
+# Both start as None and are filled lazily by get_paddle_ocr_model() and
+# get_paddle_structure_model() on their first call, so each model is built
+# only once per process (e.g. once per Celery worker process).
+_paddle_ocr_model = None
+_paddle_structure_model = None
+
+
+def get_paddle_ocr_model():
+    """
+    Return the shared PaddleOCR instance, creating it on first use.
+
+    The device is read from the PADDLE_DEVICE environment variable
+    (default "cpu"); the model is configured for Korean with document
+    orientation classification and unwarping disabled.
+    """
+    global _paddle_ocr_model
+    if _paddle_ocr_model is not None:
+        return _paddle_ocr_model
+
+    target_device = os.getenv("PADDLE_DEVICE", "cpu")
+    logger.info(f"Initializing PaddleOCR model on device: {target_device}")
+    _paddle_ocr_model = PaddleOCR(
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+        device=target_device,
+        lang="korean",
+    )
+    logger.info("PaddleOCR model initialized.")
+    return _paddle_ocr_model
+
+
+def get_paddle_structure_model():
+    """
+    Return the shared PPStructureV3 instance, creating it on first use.
+
+    The device is read from the PADDLE_DEVICE environment variable
+    (default "cpu"); the model is configured for Korean with document
+    orientation classification and unwarping disabled.
+    """
+    global _paddle_structure_model
+    if _paddle_structure_model is not None:
+        return _paddle_structure_model
+
+    target_device = os.getenv("PADDLE_DEVICE", "cpu")
+    logger.info(f"Initializing PPStructure model on device: {target_device}")
+    _paddle_structure_model = PPStructureV3(
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+        device=target_device,
+        lang="korean",
+        # Threshold was changed because layout detection was failing.
+        layout_threshold=0.3,
+    )
+    logger.info("PPStructure model initialized.")
+    return _paddle_structure_model
+
+
+async def extract_text_from_file(file_path, ocr_model):
+    """
+    Prepare the file and run the selected OCR model on it.
+
+    Returns:
+        ``(text, coords, model_name)``. When ``process_file`` reports that
+        OCR is not needed, this is ``(text_only, [], "OCR not used")``;
+        otherwise it is the extractor's text and coordinate list together
+        with ``ocr_model``.
+
+    Raises:
+        ValueError: if OCR is needed and ``ocr_model`` is not one of
+            "tesseract", "pp-ocr", "pp-structure" or "upstage".
+    """
+    images, text_only, needs_ocr = await process_file(file_path, ocr_model)
+    if not needs_ocr:
+        return text_only, [], "OCR not used"
+
+    # Upstage works on the original file and is awaited directly.
+    if ocr_model == "upstage":
+        logger.info(f"[UPSTAGE] {ocr_model}로 이미지에서 텍스트 추출 중...")
+        text, coords = await extract_upstage_ocr(file_path)
+        return text, coords, ocr_model
+
+    # The local engines are blocking, so they run in a worker thread.
+    if ocr_model == "tesseract":
+        banner = f"[TESSERACT] {ocr_model} 로 이미지에서 텍스트 추출 중..."
+        extractor = extract_tesseract_ocr
+    elif ocr_model == "pp-ocr":
+        banner = f"[PP-OCR] {ocr_model}로 이미지에서 텍스트 추출 중..."
+        extractor = extract_paddle_ocr
+    elif ocr_model == "pp-structure":
+        banner = f"[PP-STRUCTURE] {ocr_model}로 이미지에서 텍스트 추출 중..."
+        extractor = extract_paddle_structure
+    else:
+        logger.error(f"[OCR MODEL] 지원하지 않는 모델입니다. ({ocr_model})")
+        raise ValueError(f"지원하지 않는 OCR 모델입니다: {ocr_model}")
+
+    logger.info(banner)
+    text, coords = await asyncio.to_thread(extractor, images)
+    return text, coords, ocr_model
+
+
+# ✅ tesseract
+def extract_tesseract_ocr(images):
+    """
+    Run Tesseract on each image and return the text plus word coordinates.
+
+    Every image is preprocessed with ``tess_prep_cv2`` and then passed to
+    Tesseract twice with the same settings: once for the page text and once
+    for per-word data. Coordinates are therefore relative to the
+    preprocessed image.
+
+    Returns:
+        ``(full_text, coords)``: the page texts joined with newlines, and a
+        list of ``{"text", "coords": [x1, y1, x2, y2], "page"}`` entries
+        (pages are 1-based; empty words are skipped).
+    """
+    languages = "kor+eng"
+    tess_config = "--oem 3 --psm 6"
+    page_texts = []
+    word_boxes = []
+
+    for page_no, page_image in enumerate(images, start=1):
+        logger.info(f"[UTILS-OCR] 페이지 {page_no} OCR로 텍스트 추출 중...")
+        prepared = tess_prep_cv2(page_image)
+
+        page_texts.append(
+            pytesseract.image_to_string(prepared, lang=languages, config=tess_config)
+        )
+
+        data = pytesseract.image_to_data(
+            prepared,
+            output_type=pytesseract.Output.DICT,
+            lang=languages,
+            config=tess_config,
+        )
+        columns = zip(
+            data["text"], data["left"], data["top"], data["width"], data["height"]
+        )
+        for raw_word, left, top, width, height in columns:
+            word = raw_word.strip()
+            if not word:
+                continue
+            word_boxes.append(
+                {
+                    "text": word,
+                    "coords": [left, top, left + width, top + height],
+                    "page": page_no,
+                }
+            )
+
+        logger.info(f"[UTILS-OCR] 페이지 {page_no} 텍스트 및 좌표 추출 완료")
+
+    return "\n".join(page_texts), word_boxes
+
+
+# ✅ PaddleOCR
+def extract_paddle_ocr(images):
+    """
+    Run PaddleOCR on each image and return the text plus box coordinates.
+
+    Each image is normalized to 3-channel uint8 and downscaled so that its
+    longest side is at most 4000 px before prediction; coordinates are
+    therefore relative to the (possibly resized) image. A page whose
+    normalization fails is skipped.
+
+    Progress is reported through the module logger, like the other
+    extractors in this module, instead of ``print``.
+
+    Returns:
+        ``(full_text, coords)``: the recognized texts joined with newlines,
+        and a list of ``{"text", "coords", "page"}`` entries (pages are
+        1-based).
+    """
+    ocr = get_paddle_ocr_model()
+
+    full_response = []
+    coord_response = []
+
+    for page_idx, img in enumerate(images):
+        page_no = page_idx + 1
+        logger.info(f"[PaddleOCR] 페이지 {page_no} OCR로 텍스트 추출 중...")
+        img_np = np.array(img)
+
+        # Normalize channels/dtype (handles grayscale, RGBA, float, ...).
+        try:
+            img_np = to_rgb_uint8(img_np)
+        except Exception as e:
+            logger.warning(f"[PaddleOCR] 페이지 {page_no} 입력 표준화 실패: {e}")
+            continue  # skip the problematic page and go on with the next one
+
+        # Cap excessive resolution (longest side at most 4000 px).
+        h, w = img_np.shape[:2]
+        max_side = max(h, w)
+        max_side_limit = 4000
+        if max_side > max_side_limit:
+            scale = max_side_limit / max_side
+            new_size = (int(w * scale), int(h * scale))
+            img_np = cv2.resize(img_np, new_size, interpolation=cv2.INTER_AREA)
+            logger.info(f"[PaddleOCR] Resized to {img_np.shape[1]}x{img_np.shape[0]}")
+
+        results = ocr.predict(input=img_np)
+
+        # Best-effort GPU memory cleanup; a failure must not abort the OCR run.
+        try:
+            if paddle.is_compiled_with_cuda():
+                paddle.device.cuda.synchronize()
+                paddle.device.cuda.empty_cache()
+        except Exception as e:
+            logger.debug(f"[PaddleOCR] GPU cache cleanup skipped: {e}")
+
+        logger.info(f"[PaddleOCR] 페이지 {page_no} OCR 결과 개수: {len(results)}")
+        for res_idx, res in enumerate(results):
+            logger.info(f"[PaddleOCR] 페이지 {page_no} 결과 {res_idx + 1}개 추출 완료")
+            res_dic = dict(res.items())
+
+            texts = res_dic.get("rec_texts", [])
+            boxes = res_dic.get("rec_boxes", [])
+
+            for text, bbox in zip(texts, boxes):
+                full_response.append(text)
+                coord_response.append(
+                    {"text": text, "coords": bbox.tolist(), "page": page_no}
+                )
+
+    logger.info("[PaddleOCR] 전체 페이지 텍스트 및 좌표 추출 완료")
+    return "\n".join(full_response), coord_response
+
+
+# ✅ PaddleStructure
+def extract_paddle_structure(images):
+    """
+    Run PPStructure on each image and return block texts plus coordinates.
+
+    Each image is normalized to 3-channel uint8 and downscaled so that its
+    longest side is at most 4000 px before prediction; coordinates are
+    therefore relative to the (possibly resized) image. A page whose
+    normalization fails is skipped.
+
+    Block content that is missing or empty becomes "" and non-string content
+    is converted with ``str`` so the final newline join cannot fail.
+    Progress is reported through the module logger, like the other
+    extractors in this module, instead of ``print``.
+
+    Returns:
+        ``(full_text, coords)``: the block contents joined with newlines,
+        and a list of ``{"text", "coords", "page"}`` entries (pages are
+        1-based).
+    """
+    structure = get_paddle_structure_model()
+
+    full_response = []
+    coord_response = []
+
+    for page_idx, img in enumerate(images):
+        page_no = page_idx + 1
+        logger.info(f"[PaddleSTRUCTURE] 페이지 {page_no} OCR로 텍스트 추출 중...")
+        img_np = np.array(img)
+        logger.debug(f"[Paddle-IMG]{img}")
+
+        # Normalize channels/dtype (handles grayscale, RGBA, float, ...).
+        try:
+            img_np = to_rgb_uint8(img_np)
+        except Exception as e:
+            logger.warning(f"[PaddleSTRUCTURE] 페이지 {page_no} 입력 표준화 실패: {e}")
+            continue  # skip the problematic page and go on with the next one
+
+        # Cap excessive resolution (longest side at most 4000 px).
+        h, w = img_np.shape[:2]
+        max_side = max(h, w)
+        max_side_limit = 4000
+        if max_side > max_side_limit:
+            scale = max_side_limit / max_side
+            new_size = (int(w * scale), int(h * scale))
+            img_np = cv2.resize(img_np, new_size, interpolation=cv2.INTER_AREA)
+            logger.info(
+                f"[PaddleSTRUCTURE] Resized to {img_np.shape[1]}x{img_np.shape[0]}"
+            )
+
+        results = structure.predict(input=img_np)
+
+        # Best-effort GPU memory cleanup; a failure must not abort the OCR run.
+        try:
+            if paddle.is_compiled_with_cuda():
+                paddle.device.cuda.empty_cache()
+        except Exception as e:
+            logger.debug(f"[PaddleSTRUCTURE] GPU cache cleanup skipped: {e}")
+
+        logger.info(f"[PaddleSTRUCTURE] 페이지 {page_no} OCR 결과 개수: {len(results)}")
+        for res_idx, res in enumerate(results):
+            logger.info(
+                f"[PaddleSTRUCTURE] 페이지 {page_no} 결과 {res_idx + 1}개 추출 완료"
+            )
+            res_dic = dict(res.items())
+            blocks = res_dic.get("parsing_res_list", []) or []
+
+            for block in blocks:
+                bd = block.to_dict()
+
+                # The result is joined with "\n" below, so every entry must be
+                # a string: a list default or a None value would raise TypeError.
+                content = bd.get("content") or ""
+                if not isinstance(content, str):
+                    content = str(content)
+                bbox = bd.get("bbox", [])
+
+                full_response.append(content)
+                coord_response.append(
+                    {"text": content, "coords": bbox, "page": page_no}
+                )
+
+    logger.info("[PaddleSTRUCTURE] 전체 페이지 텍스트 및 좌표 추출 완료")
+    return "\n".join(full_response), coord_response
+
+
+# ✅ Upstage OCR API
+async def extract_upstage_ocr(file_path: str):
+    """
+    Send the original file to the Upstage OCR API and return text and coordinates.
+
+    The file is uploaded as the multipart field "document" with
+    ``model="ocr"``. For each page of the response the page text is
+    collected, and every word with a four-vertex bounding box is reduced to
+    an axis-aligned box ``[min_x, min_y, max_x, max_y]``.
+
+    Args:
+        file_path: Path of the file to upload.
+
+    Returns:
+        ``(full_text, coords)``: the page texts joined with newlines, and a
+        list of ``{"text", "coords", "page"}`` entries (pages are 1-based).
+        If walking the response fails, the error is logged and ``("", [])``
+        is returned.
+
+    Raises:
+        ValueError: if the Upstage API key is not configured.
+        FileNotFoundError: if ``file_path`` is empty or does not exist.
+        RuntimeError: if the API answers with an HTTP error status; the
+            ``httpx.HTTPStatusError`` is chained as the cause.
+    """
+    if not UPSTAGE_API_KEY:
+        raise ValueError("Upstage API 키가 설정되지 않았습니다.")
+    if not file_path or not os.path.exists(file_path):
+        raise FileNotFoundError(f"파일이 존재하지 않습니다: {file_path}")
+
+    url = UPSTAGE_API_URL
+    if not url:
+        # Fall back to the default endpoint when none is configured.
+        url = "https://api.upstage.ai/v1/document-ai/ocr"
+        logger.warning(f"UPSTAGE_API_URL not set in config, using default: {url}")
+
+    headers = {"Authorization": f"Bearer {UPSTAGE_API_KEY}"}
+    data = {"model": "ocr"}
+    filename = Path(file_path).name
+    full_text_parts = []
+    coord_response = []
+
+    # The file must stay open until the upload has finished.
+    with open(file_path, "rb") as f:
+        files = {"document": (filename, f, "application/octet-stream")}
+        try:
+            async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client:
+                response = await client.post(
+                    url, headers=headers, files=files, data=data
+                )
+                response.raise_for_status()
+                result = response.json()
+        except httpx.HTTPStatusError as e:
+            logger.error(f"Upstage API 오류: {e.response.text}")
+            # Chain the cause so the original HTTP error stays in the traceback.
+            raise RuntimeError(f"Upstage API 오류: {e.response.status_code}") from e
+
+    try:
+        pages = result.get("pages", [])
+        for page_idx, p in enumerate(pages, start=1):
+            txt = p.get("text")
+            if txt:
+                full_text_parts.append(txt)
+
+            for w in p.get("words", []):
+                verts = (w.get("boundingBox", {}) or {}).get("vertices")
+                if not verts or len(verts) != 4:
+                    continue  # skip words without a four-vertex box
+                xs = [v.get("x", 0) for v in verts]
+                ys = [v.get("y", 0) for v in verts]
+                coord_response.append(
+                    {
+                        "text": w.get("text"),
+                        "coords": [min(xs), min(ys), max(xs), max(ys)],
+                        "page": page_idx,
+                    }
+                )
+    except Exception as e:
+        # Unexpected response shape: log it and return an empty result.
+        logger.error(f"[UPSTAGE] JSON 파싱 실패: {e} / 원본 result: {result}")
+        return "", []
+
+    full_response = "\n".join(full_text_parts)
+    return full_response, coord_response