/tesseract & /tesstrain 추가

This commit is contained in:
kyy
2025-09-09 16:23:09 +09:00
parent 7ebd979521
commit 87d0200a34
4 changed files with 103 additions and 4 deletions

4
.gitignore vendored
View File

@@ -170,4 +170,6 @@ workspace/data
venv2 venv2
/workspace/audio /workspace/audio
/workspace/results /workspace/results
.venv_stt .venv_stt
config/model

View File

@@ -19,3 +19,4 @@ flower
minio minio
opencv-python-headless opencv-python-headless
python-dotenv python-dotenv
pytesseract

View File

@@ -8,9 +8,11 @@ from config.setting import MINIO_BUCKET_NAME
from fastapi import APIRouter, File, HTTPException, UploadFile from fastapi import APIRouter, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from tasks import ( from tasks import (
call_paddle_ocr,
call_tesseract_ocr,
call_tesstrain_ocr,
call_upstage_ocr_api, call_upstage_ocr_api,
celery_app, celery_app,
parse_ocr_text,
store_ocr_result, store_ocr_result,
) )
from utils.checking_keys import create_key from utils.checking_keys import create_key
@@ -73,7 +75,7 @@ async def _process_ocr_request(file: UploadFile, ocr_task):
@router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR") @router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR")
async def ocr_paddle_endpoint(file: UploadFile = File(...)): async def ocr_paddle_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, parse_ocr_text) return await _process_ocr_request(file, call_paddle_ocr)
@router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR") @router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR")
@@ -81,6 +83,16 @@ async def ocr_upstage_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, call_upstage_ocr_api) return await _process_ocr_request(file, call_upstage_ocr_api)
@router.post("/tesseract", summary="[Tesseract] 기본 모델 비동기 OCR")
async def ocr_tesseract_endpoint(file: UploadFile = File(...)):
    # Hand the uploaded file to the shared request handler together with the
    # Celery task that runs the stock Tesseract model, and return whatever
    # the handler produces.
    ocr_task = call_tesseract_ocr
    response = await _process_ocr_request(file, ocr_task)
    return response
@router.post("/tesstrain", summary="[Tesseract] 훈련된 모델 비동기 OCR")
async def ocr_tesstrain_endpoint(file: UploadFile = File(...)):
    # Hand the uploaded file to the shared request handler together with the
    # Celery task that runs the custom-trained Tesseract model, and return
    # whatever the handler produces.
    ocr_task = call_tesstrain_ocr
    response = await _process_ocr_request(file, ocr_task)
    return response
@router.get("/progress/{request_id}", summary="OCR 진행 상태 및 결과 조회") @router.get("/progress/{request_id}", summary="OCR 진행 상태 및 결과 조회")
async def check_progress(request_id: str): async def check_progress(request_id: str):
task_id = redis_client.hget("ocr_task_mapping", request_id) task_id = redis_client.hget("ocr_task_mapping", request_id)

View File

@@ -5,8 +5,10 @@ import os
import tempfile import tempfile
import time import time
from datetime import datetime, timezone from datetime import datetime, timezone
from io import BytesIO
import httpx import httpx
import pytesseract
import redis import redis
from celery import Task from celery import Task
from config.setting import ( from config.setting import (
@@ -15,6 +17,8 @@ from config.setting import (
REDIS_PORT, REDIS_PORT,
UPSTAGE_API_KEY, UPSTAGE_API_KEY,
) )
from PIL import Image
from pdf2image import convert_from_path
from utils.celery_utils import celery_app from utils.celery_utils import celery_app
from utils.ocr_processor import ocr_process from utils.ocr_processor import ocr_process
from utils.text_extractor import extract_text_from_file from utils.text_extractor import extract_text_from_file
@@ -70,7 +74,7 @@ async def download_file_from_presigned_url(file_url: str, save_path: str):
# (Paddle) OCR + 후처리 # (Paddle) OCR + 후처리
@celery_app.task(bind=True, base=BaseTaskWithProgress) @celery_app.task(bind=True, base=BaseTaskWithProgress)
def parse_ocr_text(self, presigned_url: str, request_id: str, file_name: str): def call_paddle_ocr(self, presigned_url: str, request_id: str, file_name: str):
self.update_progress(request_id, "Paddle OCR 작업 시작") self.update_progress(request_id, "Paddle OCR 작업 시작")
suffix = os.path.splitext(file_name)[-1] suffix = os.path.splitext(file_name)[-1]
@@ -220,6 +224,86 @@ def call_upstage_ocr_api(self, presigned_url: str, request_id: str, file_name: s
os.remove(tmp_path) os.remove(tmp_path)
# Location and name of the custom model used by the "tesstrain" task.
_TESSTRAIN_TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/"
_TESSTRAIN_MODEL_NAME = "kor_fonts"


def _tesseract_extract_text(path: str, file_name: str, **tesseract_kwargs) -> str:
    """Run Tesseract on the file at *path* and return the recognized text.

    Args:
        path: Local path of the downloaded file.
        file_name: Original file name; a ``.pdf`` suffix (case-insensitive)
            selects the PDF branch, anything else is opened as an image.
        **tesseract_kwargs: Forwarded unchanged to
            ``pytesseract.image_to_string`` (e.g. ``lang=`` or ``config=``).

    Returns:
        The text of every page concatenated in page order, with no extra
        separator inserted between pages.
    """
    if file_name.lower().endswith(".pdf"):
        pages = convert_from_path(path)
        # join() instead of repeated "+=" so the result is built in one pass.
        return "".join(
            pytesseract.image_to_string(page, **tesseract_kwargs) for page in pages
        )

    # Open straight from disk and close the image when done, instead of
    # copying the whole file into memory and leaving the image open.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image, **tesseract_kwargs)


def _run_tesseract_task(
    task,
    presigned_url: str,
    request_id: str,
    file_name: str,
    *,
    model_label: str,
    done_message: str,
    **tesseract_kwargs,
):
    """Download the file, OCR it with Tesseract and build the result payload.

    Shared by the default-model and trained-model tasks so that both follow
    exactly the same download / OCR / cleanup sequence.

    Args:
        task: The bound Celery task; used only for ``update_progress``.
        presigned_url: URL the input file is downloaded from.
        request_id: Identifier passed to every progress update.
        file_name: Original file name (drives the temp-file suffix and the
            PDF-versus-image decision).
        model_label: Label passed to ``ocr_process`` to identify the engine.
        done_message: Progress message emitted once OCR has finished.
        **tesseract_kwargs: Forwarded to ``pytesseract.image_to_string``.

    Returns:
        Whatever ``ocr_process`` returns for the recognized text.
    """
    suffix = os.path.splitext(file_name)[-1]
    # delete=False: only the path is reserved here; the download writes to
    # it afterwards and the ``finally`` below removes it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_path = tmp_file.name

    try:
        task.update_progress(request_id, "파일 다운로드 중")
        asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
        task.update_progress(request_id, "파일 다운로드 완료")

        # Timing covers the OCR step only, not the download.
        start_time = time.time()
        text = _tesseract_extract_text(tmp_path, file_name, **tesseract_kwargs)
        end_time = time.time()
        task.update_progress(request_id, done_message)

        # Coordinate (coord) information is hard to obtain from pytesseract's
        # default output, so an empty list is passed instead.
        return ocr_process(file_name, model_label, [], text, start_time, end_time)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# (Tesseract) default-model OCR
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesseract_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """OCR a file with Tesseract's ``kor`` language model.

    Args:
        presigned_url: URL the input file is downloaded from.
        request_id: Identifier used for progress updates.
        file_name: Original file name; ``.pdf`` files are rasterized page by
            page, everything else is opened as a single image.

    Returns:
        The result of ``ocr_process`` labelled ``"tesseract_default"``.
    """
    self.update_progress(request_id, "Tesseract (기본) OCR 작업 시작")
    return _run_tesseract_task(
        self,
        presigned_url,
        request_id,
        file_name,
        model_label="tesseract_default",
        done_message="Tesseract OCR 완료",
        lang="kor",
    )


# (Tesseract) trained-model OCR
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesstrain_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """OCR a file with the custom-trained Tesseract model.

    The model name and its tessdata directory are passed to Tesseract through
    the ``config`` string (``--tessdata-dir ... -l ...``).

    Args:
        presigned_url: URL the input file is downloaded from.
        request_id: Identifier used for progress updates.
        file_name: Original file name; ``.pdf`` files are rasterized page by
            page, everything else is opened as a single image.

    Returns:
        The result of ``ocr_process`` labelled ``"tesstrain_<model name>"``.
    """
    self.update_progress(request_id, "Tesseract (훈련 모델) OCR 작업 시작")
    # Built once; the same configuration applies to PDF pages and images.
    custom_config = (
        f"--tessdata-dir {_TESSTRAIN_TESSDATA_DIR} -l {_TESSTRAIN_MODEL_NAME}"
    )
    return _run_tesseract_task(
        self,
        presigned_url,
        request_id,
        file_name,
        model_label=f"tesstrain_{_TESSTRAIN_MODEL_NAME}",
        done_message="Tesseract (훈련 모델) OCR 완료",
        config=custom_config,
    )
# 결과 Redis 저장 (체인의 두 번째 스텝) # 결과 Redis 저장 (체인의 두 번째 스텝)
# router 체인: store_ocr_result.s(request_id=request_id, task_id=task_id) # router 체인: store_ocr_result.s(request_id=request_id, task_id=task_id)
@celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True) @celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True)