/tesseract & /tesstrain 추가

This commit is contained in:
kyy
2025-09-09 16:23:09 +09:00
parent 7ebd979521
commit 87d0200a34
4 changed files with 103 additions and 4 deletions

4
.gitignore vendored
View File

@@ -170,4 +170,6 @@ workspace/data
venv2
/workspace/audio
/workspace/results
.venv_stt
.venv_stt
config/model

View File

@@ -19,3 +19,4 @@ flower
minio
opencv-python-headless
python-dotenv
pytesseract

View File

@@ -8,9 +8,11 @@ from config.setting import MINIO_BUCKET_NAME
from fastapi import APIRouter, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from tasks import (
call_paddle_ocr,
call_tesseract_ocr,
call_tesstrain_ocr,
call_upstage_ocr_api,
celery_app,
parse_ocr_text,
store_ocr_result,
)
from utils.checking_keys import create_key
@@ -73,7 +75,7 @@ async def _process_ocr_request(file: UploadFile, ocr_task):
@router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR")
async def ocr_paddle_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, parse_ocr_text)
return await _process_ocr_request(file, call_paddle_ocr)
@router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR")
@@ -81,6 +83,16 @@ async def ocr_upstage_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, call_upstage_ocr_api)
# Upload endpoint for OCR with the default Tesseract model.
# It only selects the Celery task; the upload handling itself lives in
# _process_ocr_request, which receives the file and the task to run.
@router.post("/tesseract", summary="[Tesseract] 기본 모델 비동기 OCR")
async def ocr_tesseract_endpoint(file: UploadFile = File(...)):
    # Hand the uploaded file over together with the default-model task.
    response = await _process_ocr_request(file, call_tesseract_ocr)
    return response
# Upload endpoint for OCR with the custom-trained Tesseract model.
# It only selects the Celery task; the upload handling itself lives in
# _process_ocr_request, which receives the file and the task to run.
@router.post("/tesstrain", summary="[Tesseract] 훈련된 모델 비동기 OCR")
async def ocr_tesstrain_endpoint(file: UploadFile = File(...)):
    # Hand the uploaded file over together with the trained-model task.
    response = await _process_ocr_request(file, call_tesstrain_ocr)
    return response
@router.get("/progress/{request_id}", summary="OCR 진행 상태 및 결과 조회")
async def check_progress(request_id: str):
task_id = redis_client.hget("ocr_task_mapping", request_id)

View File

@@ -5,8 +5,10 @@ import os
import tempfile
import time
from datetime import datetime, timezone
from io import BytesIO
import httpx
import pytesseract
import redis
from celery import Task
from config.setting import (
@@ -15,6 +17,8 @@ from config.setting import (
REDIS_PORT,
UPSTAGE_API_KEY,
)
from PIL import Image
from pdf2image import convert_from_path
from utils.celery_utils import celery_app
from utils.ocr_processor import ocr_process
from utils.text_extractor import extract_text_from_file
@@ -70,7 +74,7 @@ async def download_file_from_presigned_url(file_url: str, save_path: str):
# (Paddle) OCR + 후처리
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def parse_ocr_text(self, presigned_url: str, request_id: str, file_name: str):
def call_paddle_ocr(self, presigned_url: str, request_id: str, file_name: str):
self.update_progress(request_id, "Paddle OCR 작업 시작")
suffix = os.path.splitext(file_name)[-1]
@@ -220,6 +224,86 @@ def call_upstage_ocr_api(self, presigned_url: str, request_id: str, file_name: s
os.remove(tmp_path)
# (Tesseract) OCR with the default model
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesseract_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """Run Tesseract OCR with the stock Korean model on a downloaded file.

    The file behind ``presigned_url`` is downloaded to a temporary path. A
    ``.pdf`` input is converted to images with ``convert_from_path`` and each
    image is recognised separately; any other input is opened as a single
    image. The recognised text is then passed to ``ocr_process``.

    Args:
        presigned_url: URL the input file is downloaded from.
        request_id: Identifier forwarded to ``update_progress`` at each step.
        file_name: Original file name. Its extension becomes the temp-file
            suffix and decides whether the input is handled as a PDF.

    Returns:
        The value returned by ``ocr_process`` for the recognised text.
    """
    self.update_progress(request_id, "Tesseract (기본) OCR 작업 시작")

    suffix = os.path.splitext(file_name)[-1]
    # Only the path is needed here: the handle is closed right away and the
    # file is removed explicitly in the ``finally`` clause below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_path = tmp_file.name

    try:
        self.update_progress(request_id, "파일 다운로드 중")
        asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
        self.update_progress(request_id, "파일 다운로드 완료")

        start_time = time.time()
        if file_name.lower().endswith(".pdf"):
            pages = convert_from_path(tmp_path)
            # Join the per-page results in one pass instead of growing a
            # string with ``+=`` inside the loop.
            text = "".join(
                pytesseract.image_to_string(page, lang="kor") for page in pages
            )
        else:
            # Open straight from the temp path and close the image
            # deterministically, rather than buffering the whole file into
            # memory first.
            with Image.open(tmp_path) as image:
                text = pytesseract.image_to_string(image, lang="kor")
        end_time = time.time()
        self.update_progress(request_id, "Tesseract OCR 완료")

        # Coordinate (coord) information is hard to obtain from pytesseract's
        # default output, so an empty list is passed instead.
        result_json = ocr_process(
            file_name, "tesseract_default", [], text, start_time, end_time
        )
        return result_json
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
# (Tesseract) OCR with the custom-trained model
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesstrain_ocr(self, presigned_url: str, request_id: str, file_name: str):
    """Run Tesseract OCR with the custom-trained model on a downloaded file.

    Works like the default-model task, except that Tesseract is pointed at a
    custom tessdata directory and model name through its ``config`` string.
    A ``.pdf`` input is converted to images with ``convert_from_path`` and
    each image is recognised separately; any other input is opened as a
    single image. The recognised text is then passed to ``ocr_process``.

    Args:
        presigned_url: URL the input file is downloaded from.
        request_id: Identifier forwarded to ``update_progress`` at each step.
        file_name: Original file name. Its extension becomes the temp-file
            suffix and decides whether the input is handled as a PDF.

    Returns:
        The value returned by ``ocr_process`` for the recognised text.
    """
    self.update_progress(request_id, "Tesseract (훈련 모델) OCR 작업 시작")

    TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/"
    MODEL_NAME = "kor_fonts"
    # The Tesseract options are the same for every page and for both input
    # kinds, so the config string is built once.
    custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}"

    suffix = os.path.splitext(file_name)[-1]
    # Only the path is needed here: the handle is closed right away and the
    # file is removed explicitly in the ``finally`` clause below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_path = tmp_file.name

    try:
        self.update_progress(request_id, "파일 다운로드 중")
        asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
        self.update_progress(request_id, "파일 다운로드 완료")

        start_time = time.time()
        if file_name.lower().endswith(".pdf"):
            pages = convert_from_path(tmp_path)
            # Join the per-page results in one pass instead of growing a
            # string with ``+=`` inside the loop.
            text = "".join(
                pytesseract.image_to_string(page, config=custom_config)
                for page in pages
            )
        else:
            # Open straight from the temp path and close the image
            # deterministically, rather than buffering the whole file into
            # memory first.
            with Image.open(tmp_path) as image:
                text = pytesseract.image_to_string(image, config=custom_config)
        end_time = time.time()
        self.update_progress(request_id, "Tesseract (훈련 모델) OCR 완료")

        # No coordinate data is collected, so an empty list is passed.
        result_json = ocr_process(
            file_name, f"tesstrain_{MODEL_NAME}", [], text, start_time, end_time
        )
        return result_json
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
# 결과 Redis 저장 (체인의 두 번째 스텝)
# router 체인: store_ocr_result.s(request_id=request_id, task_id=task_id)
@celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True)