/tesseract & /tesstrain 추가
This commit is contained in:
86
tasks.py
86
tasks.py
@@ -5,8 +5,10 @@ import os
|
||||
import tempfile
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from io import BytesIO
|
||||
|
||||
import httpx
|
||||
import pytesseract
|
||||
import redis
|
||||
from celery import Task
|
||||
from config.setting import (
|
||||
@@ -15,6 +17,8 @@ from config.setting import (
|
||||
REDIS_PORT,
|
||||
UPSTAGE_API_KEY,
|
||||
)
|
||||
from PIL import Image
|
||||
from pdf2image import convert_from_path
|
||||
from utils.celery_utils import celery_app
|
||||
from utils.ocr_processor import ocr_process
|
||||
from utils.text_extractor import extract_text_from_file
|
||||
@@ -70,7 +74,7 @@ async def download_file_from_presigned_url(file_url: str, save_path: str):
|
||||
|
||||
# (Paddle) OCR + 후처리
|
||||
@celery_app.task(bind=True, base=BaseTaskWithProgress)
|
||||
def parse_ocr_text(self, presigned_url: str, request_id: str, file_name: str):
|
||||
def call_paddle_ocr(self, presigned_url: str, request_id: str, file_name: str):
|
||||
self.update_progress(request_id, "Paddle OCR 작업 시작")
|
||||
|
||||
suffix = os.path.splitext(file_name)[-1]
|
||||
@@ -220,6 +224,86 @@ def call_upstage_ocr_api(self, presigned_url: str, request_id: str, file_name: s
|
||||
os.remove(tmp_path)
|
||||
|
||||
|
||||
# (Tesseract) OCR with the stock model
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesseract_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """Download a file and OCR it with Tesseract using ``lang="kor"``.

    Args:
        presigned_url: URL handed to ``download_file_from_presigned_url``.
        request_id: Identifier passed to every ``update_progress`` call.
        file_name: Original file name. Its extension becomes the temp-file
            suffix, and a ``.pdf`` ending (case-insensitive) selects
            page-by-page OCR; anything else is opened as a single image.

    Returns:
        The value returned by ``ocr_process`` for the recognized text.
    """
    self.update_progress(request_id, "Tesseract (기본) OCR 작업 시작")

    suffix = os.path.splitext(file_name)[-1]
    # delete=False: this only reserves a path; the download helper receives
    # it as its save path and the ``finally`` block below removes the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_path = tmp_file.name

    try:
        self.update_progress(request_id, "파일 다운로드 중")
        asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
        self.update_progress(request_id, "파일 다운로드 완료")

        start_time = time.time()
        if file_name.lower().endswith(".pdf"):
            pages = convert_from_path(tmp_path)
            # Join once instead of growing the string with += per page.
            text = "".join(
                pytesseract.image_to_string(page, lang="kor") for page in pages
            )
        else:
            # Open straight from disk and close the image when done, rather
            # than copying the whole file into memory first.
            with Image.open(tmp_path) as image:
                text = pytesseract.image_to_string(image, lang="kor")
        end_time = time.time()
        self.update_progress(request_id, "Tesseract OCR 완료")

        # Coordinate data is hard to get from pytesseract's default output,
        # so an empty list is passed in its place.
        result_json = ocr_process(
            file_name, "tesseract_default", [], text, start_time, end_time
        )
        return result_json
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
||||
|
||||
|
||||
# (Tesseract) OCR with the custom-trained model
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesstrain_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """Download a file and OCR it with the ``kor_fonts`` Tesseract model.

    The model is selected by passing ``--tessdata-dir`` and ``-l`` to
    Tesseract through pytesseract's ``config`` string.

    Args:
        presigned_url: URL handed to ``download_file_from_presigned_url``.
        request_id: Identifier passed to every ``update_progress`` call.
        file_name: Original file name. Its extension becomes the temp-file
            suffix, and a ``.pdf`` ending (case-insensitive) selects
            page-by-page OCR; anything else is opened as a single image.

    Returns:
        The value returned by ``ocr_process`` for the recognized text.
    """
    self.update_progress(request_id, "Tesseract (훈련 모델) OCR 작업 시작")

    TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/"
    MODEL_NAME = "kor_fonts"
    # Built once; both the PDF and the single-image path use the same flags.
    custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}"

    suffix = os.path.splitext(file_name)[-1]
    # delete=False: this only reserves a path; the download helper receives
    # it as its save path and the ``finally`` block below removes the file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_path = tmp_file.name

    try:
        self.update_progress(request_id, "파일 다운로드 중")
        asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
        self.update_progress(request_id, "파일 다운로드 완료")

        start_time = time.time()
        if file_name.lower().endswith(".pdf"):
            pages = convert_from_path(tmp_path)
            # Join once instead of growing the string with += per page.
            text = "".join(
                pytesseract.image_to_string(page, config=custom_config)
                for page in pages
            )
        else:
            # Open straight from disk and close the image when done, rather
            # than copying the whole file into memory first.
            with Image.open(tmp_path) as image:
                text = pytesseract.image_to_string(image, config=custom_config)
        end_time = time.time()
        self.update_progress(request_id, "Tesseract (훈련 모델) OCR 완료")

        result_json = ocr_process(
            file_name, f"tesstrain_{MODEL_NAME}", [], text, start_time, end_time
        )
        return result_json
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
|
||||
|
||||
|
||||
# 결과 Redis 저장 (체인의 두 번째 스텝)
|
||||
# router 체인: store_ocr_result.s(request_id=request_id, task_id=task_id)
|
||||
@celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True)
|
||||
|
||||
Reference in New Issue
Block a user