From 87d0200a3430ef8e4926d0daa4a7d24d4b8a8ce9 Mon Sep 17 00:00:00 2001 From: kyy Date: Tue, 9 Sep 2025 16:23:09 +0900 Subject: [PATCH] =?UTF-8?q?/tesseract=20&=20/tesstrain=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 ++- requirements.txt | 1 + router/ocr_router.py | 16 +++++++-- tasks.py | 86 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 103 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 28f5466..18021c6 100644 --- a/.gitignore +++ b/.gitignore @@ -170,4 +170,6 @@ workspace/data venv2 /workspace/audio /workspace/results -.venv_stt \ No newline at end of file +.venv_stt + +config/model \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9cfe91a..c12542a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,4 @@ flower minio opencv-python-headless python-dotenv +pytesseract \ No newline at end of file diff --git a/router/ocr_router.py b/router/ocr_router.py index 867637b..6e6a13c 100644 --- a/router/ocr_router.py +++ b/router/ocr_router.py @@ -8,9 +8,11 @@ from config.setting import MINIO_BUCKET_NAME from fastapi import APIRouter, File, HTTPException, UploadFile from fastapi.responses import JSONResponse from tasks import ( + call_paddle_ocr, + call_tesseract_ocr, + call_tesstrain_ocr, call_upstage_ocr_api, celery_app, - parse_ocr_text, store_ocr_result, ) from utils.checking_keys import create_key @@ -73,7 +75,7 @@ async def _process_ocr_request(file: UploadFile, ocr_task): @router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR") async def ocr_paddle_endpoint(file: UploadFile = File(...)): - return await _process_ocr_request(file, parse_ocr_text) + return await _process_ocr_request(file, call_paddle_ocr) @router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR") @@ -81,6 +83,16 @@ async def ocr_upstage_endpoint(file: UploadFile = File(...)): return await _process_ocr_request(file, call_upstage_ocr_api) +@router.post("/tesseract", summary="[Tesseract] 기본 모델 비동기 OCR") +async def ocr_tesseract_endpoint(file: UploadFile = File(...)): + return await _process_ocr_request(file, call_tesseract_ocr) + + +@router.post("/tesstrain", summary="[Tesseract] 훈련된 모델 비동기 OCR") +async def ocr_tesstrain_endpoint(file: UploadFile = File(...)): + return await _process_ocr_request(file, call_tesstrain_ocr) + + @router.get("/progress/{request_id}", summary="OCR 진행 상태 및 결과 조회") async def check_progress(request_id: str): task_id = redis_client.hget("ocr_task_mapping", request_id) diff --git a/tasks.py b/tasks.py index f35df0f..372e3ca 100644 --- a/tasks.py +++ b/tasks.py @@ -5,8 +5,10 @@ import os import tempfile import time from datetime import datetime, timezone +from io import BytesIO import httpx +import pytesseract import redis from celery import Task from config.setting import ( @@ -15,6 +17,8 @@ from config.setting import ( REDIS_PORT, UPSTAGE_API_KEY, ) +from PIL import Image +from pdf2image import convert_from_path from utils.celery_utils import celery_app from utils.ocr_processor import ocr_process from utils.text_extractor import extract_text_from_file @@ -70,7 +74,7 @@ async def download_file_from_presigned_url(file_url: str, save_path: str): # (Paddle) OCR + 후처리 @celery_app.task(bind=True, base=BaseTaskWithProgress) -def parse_ocr_text(self, presigned_url: str, request_id: str, file_name: str): +def call_paddle_ocr(self, presigned_url: str, request_id: str, file_name: str): self.update_progress(request_id, "Paddle OCR 작업 시작") suffix = os.path.splitext(file_name)[-1] @@ -220,6 +224,86 @@ def call_upstage_ocr_api(self, presigned_url: str, request_id: str, file_name: s os.remove(tmp_path) +# (Tesseract) 기본 모델 OCR +@celery_app.task(bind=True, base=BaseTaskWithProgress) +def call_tesseract_ocr(self, presigned_url: str, request_id: str, file_name: str): + self.update_progress(request_id, "Tesseract (기본) OCR 작업 시작") + + suffix = os.path.splitext(file_name)[-1] + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: + tmp_path = tmp_file.name + + try: + self.update_progress(request_id, "파일 다운로드 중") + asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path)) + self.update_progress(request_id, "파일 다운로드 완료") + + start_time = time.time() + if file_name.lower().endswith(".pdf"): + images = convert_from_path(tmp_path) + text = "" + for image in images: + text += pytesseract.image_to_string(image, lang="kor") + else: + with open(tmp_path, "rb") as f: + image_bytes = f.read() + image = Image.open(BytesIO(image_bytes)) + text = pytesseract.image_to_string(image, lang="kor") + end_time = time.time() + self.update_progress(request_id, "Tesseract OCR 완료") + + # 좌표(coord) 정보는 pytesseract 기본 출력에서 얻기 어려우므로 빈 리스트로 처리 + result_json = ocr_process( + file_name, "tesseract_default", [], text, start_time, end_time + ) + return result_json + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) + + +# (Tesseract) 훈련된 모델 OCR +@celery_app.task(bind=True, base=BaseTaskWithProgress) +def call_tesstrain_ocr(self, presigned_url: str, request_id: str, file_name: str): + self.update_progress(request_id, "Tesseract (훈련 모델) OCR 작업 시작") + + TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/" + MODEL_NAME = "kor_fonts" + + suffix = os.path.splitext(file_name)[-1] + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: + tmp_path = tmp_file.name + + try: + self.update_progress(request_id, "파일 다운로드 중") + asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path)) + self.update_progress(request_id, "파일 다운로드 완료") + + start_time = time.time() + if file_name.lower().endswith(".pdf"): + images = convert_from_path(tmp_path) + text = "" + custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}" + for image in images: + text += pytesseract.image_to_string(image, config=custom_config) + else: + with open(tmp_path, "rb") as f: + image_bytes = f.read() + image = Image.open(BytesIO(image_bytes)) + custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}" + text = pytesseract.image_to_string(image, config=custom_config) + end_time = time.time() + self.update_progress(request_id, "Tesseract (훈련 모델) OCR 완료") + + result_json = ocr_process( + file_name, f"tesstrain_{MODEL_NAME}", [], text, start_time, end_time + ) + return result_json + finally: + if os.path.exists(tmp_path): + os.remove(tmp_path) + + # 결과 Redis 저장 (체인의 두 번째 스텝) # router 체인: store_ocr_result.s(request_id=request_id, task_id=task_id) @celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True)