Compare commits

...

2 Commits

Author  SHA1        Message                                  Date
kyy     87d0200a34  Add /tesseract & /tesstrain              2025-09-09 16:23:09 +09:00
kyy     7ebd979521  paddle ocr position-based text sorting   2025-09-09 16:21:59 +09:00
5 changed files with 148 additions and 12 deletions

.gitignore vendored
View File

@@ -170,4 +170,6 @@ workspace/data
venv2
/workspace/audio
/workspace/results
.venv_stt
.venv_stt
config/model

View File

@@ -19,3 +19,4 @@ flower
minio
opencv-python-headless
python-dotenv
pytesseract

View File

@@ -8,9 +8,11 @@ from config.setting import MINIO_BUCKET_NAME
from fastapi import APIRouter, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from tasks import (
call_paddle_ocr,
call_tesseract_ocr,
call_tesstrain_ocr,
call_upstage_ocr_api,
celery_app,
parse_ocr_text,
store_ocr_result,
)
from utils.checking_keys import create_key
@@ -73,7 +75,7 @@ async def _process_ocr_request(file: UploadFile, ocr_task):
@router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR")
async def ocr_paddle_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, parse_ocr_text)
return await _process_ocr_request(file, call_paddle_ocr)
@router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR")
@@ -81,6 +83,16 @@ async def ocr_upstage_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, call_upstage_ocr_api)
@router.post("/tesseract", summary="[Tesseract] 기본 모델 비동기 OCR")
async def ocr_tesseract_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, call_tesseract_ocr)
@router.post("/tesstrain", summary="[Tesseract] 훈련된 모델 비동기 OCR")
async def ocr_tesstrain_endpoint(file: UploadFile = File(...)):
return await _process_ocr_request(file, call_tesstrain_ocr)
@router.get("/progress/{request_id}", summary="OCR 진행 상태 및 결과 조회")
async def check_progress(request_id: str):
task_id = redis_client.hget("ocr_task_mapping", request_id)
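
The new endpoints differ only in the Celery task object they hand to the shared _process_ocr_request helper. A minimal sketch of that dispatch pattern follows; only create_key and store_ocr_result come from the imports in this diff, while the upload/presign helper and the exact chain arguments are assumptions, not the repository code.

from celery import chain
from fastapi import UploadFile
from tasks import store_ocr_result          # imported by the router in this diff
from utils.checking_keys import create_key  # imported by the router in this diff

async def process_ocr_request_sketch(file: UploadFile, ocr_task):
    # Assumed flow: persist the upload, then chain the chosen OCR task with the
    # result-storing task so the output lands in Redis once OCR finishes.
    request_id = create_key()
    presigned_url = await upload_and_presign(file)  # hypothetical storage/presign helper
    job = chain(
        ocr_task.s(presigned_url, request_id, file.filename),
        store_ocr_result.s(request_id=request_id),
    ).apply_async()
    return {"request_id": request_id, "task_id": job.id}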

View File

@@ -5,8 +5,10 @@ import os
import tempfile
import time
from datetime import datetime, timezone
from io import BytesIO
import httpx
import pytesseract
import redis
from celery import Task
from config.setting import (
@@ -15,6 +17,8 @@ from config.setting import (
REDIS_PORT,
UPSTAGE_API_KEY,
)
from PIL import Image
from pdf2image import convert_from_path
from utils.celery_utils import celery_app
from utils.ocr_processor import ocr_process
from utils.text_extractor import extract_text_from_file
@@ -70,7 +74,7 @@ async def download_file_from_presigned_url(file_url: str, save_path: str):
# (Paddle) OCR + post-processing
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def parse_ocr_text(self, presigned_url: str, request_id: str, file_name: str):
def call_paddle_ocr(self, presigned_url: str, request_id: str, file_name: str):
self.update_progress(request_id, "Paddle OCR 작업 시작")
suffix = os.path.splitext(file_name)[-1]
@@ -220,6 +224,86 @@ def call_upstage_ocr_api(self, presigned_url: str, request_id: str, file_name: s
os.remove(tmp_path)
# (Tesseract) default-model OCR
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesseract_ocr(self, presigned_url: str, request_id: str, file_name: str):
self.update_progress(request_id, "Tesseract (기본) OCR 작업 시작")
suffix = os.path.splitext(file_name)[-1]
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
tmp_path = tmp_file.name
try:
self.update_progress(request_id, "파일 다운로드 중")
asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
self.update_progress(request_id, "파일 다운로드 완료")
start_time = time.time()
if file_name.lower().endswith(".pdf"):
images = convert_from_path(tmp_path)
text = ""
for image in images:
text += pytesseract.image_to_string(image, lang="kor")
else:
with open(tmp_path, "rb") as f:
image_bytes = f.read()
image = Image.open(BytesIO(image_bytes))
text = pytesseract.image_to_string(image, lang="kor")
end_time = time.time()
self.update_progress(request_id, "Tesseract OCR 완료")
# Coordinate (coord) info is hard to obtain from pytesseract's default output, so pass an empty list
result_json = ocr_process(
file_name, "tesseract_default", [], text, start_time, end_time
)
return result_json
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
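
As the comment above notes, image_to_string returns plain text only, so the coord list is left empty. If word boxes become useful later, pytesseract can supply them through image_to_data; a minimal sketch, not part of this commit:

import pytesseract
from PIL import Image

def tesseract_words_with_boxes(image_path: str, lang: str = "kor"):
    # image_to_data returns per-word text plus left/top/width/height and confidence.
    data = pytesseract.image_to_data(
        Image.open(image_path), lang=lang, output_type=pytesseract.Output.DICT
    )
    words = []
    for i, word in enumerate(data["text"]):
        if word.strip():  # skip empty detections
            words.append(
                {
                    "text": word,
                    "box": [data["left"][i], data["top"][i], data["width"][i], data["height"][i]],
                    "conf": data["conf"][i],
                }
            )
    return words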
# (Tesseract) trained-model OCR
@celery_app.task(bind=True, base=BaseTaskWithProgress)
def call_tesstrain_ocr(self, presigned_url: str, request_id: str, file_name: str):
self.update_progress(request_id, "Tesseract (훈련 모델) OCR 작업 시작")
TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/"
MODEL_NAME = "kor_fonts"
suffix = os.path.splitext(file_name)[-1]
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
tmp_path = tmp_file.name
try:
self.update_progress(request_id, "파일 다운로드 중")
asyncio.run(download_file_from_presigned_url(presigned_url, tmp_path))
self.update_progress(request_id, "파일 다운로드 완료")
start_time = time.time()
if file_name.lower().endswith(".pdf"):
images = convert_from_path(tmp_path)
text = ""
custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}"
for image in images:
text += pytesseract.image_to_string(image, config=custom_config)
else:
with open(tmp_path, "rb") as f:
image_bytes = f.read()
image = Image.open(BytesIO(image_bytes))
custom_config = f"--tessdata-dir {TESSDATA_DIR} -l {MODEL_NAME}"
text = pytesseract.image_to_string(image, config=custom_config)
end_time = time.time()
self.update_progress(request_id, "Tesseract (훈련 모델) OCR 완료")
result_json = ocr_process(
file_name, f"tesstrain_{MODEL_NAME}", [], text, start_time, end_time
)
return result_json
finally:
if os.path.exists(tmp_path):
os.remove(tmp_path)
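
The trained model is selected only via --tessdata-dir and -l kor_fonts, so a missing or misplaced kor_fonts.traineddata surfaces as an error only when OCR runs. A small sanity check, assuming the same paths as above and a recent pytesseract (which provides get_languages):

import os
import pytesseract

TESSDATA_DIR = "/tesseract_trainer/tesstrain/workspace/"
MODEL_NAME = "kor_fonts"

def trained_model_available() -> bool:
    # The traineddata file must sit directly inside the tessdata directory.
    if not os.path.isfile(os.path.join(TESSDATA_DIR, f"{MODEL_NAME}.traineddata")):
        return False
    # get_languages lists every model Tesseract can load from the given tessdata dir.
    langs = pytesseract.get_languages(config=f"--tessdata-dir {TESSDATA_DIR}")
    return MODEL_NAME in langs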
# Store the result in Redis (second step of the chain)
# Router chain: store_ocr_result.s(request_id=request_id, task_id=task_id)
@celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True)

View File

@@ -149,8 +149,8 @@ def extract_text_paddle_ocr(images):
use_doc_orientation_classify=False, use_doc_unwarping=False, lang="korean"
)
full_response = []
coord_response = []
all_text_boxes = []  # holds (y_center, x_center, text) for position-based sorting
for page_idx, img in enumerate(images):
print(f"[PaddleOCR] 페이지 {page_idx + 1} OCR로 텍스트 추출 중...")
@@ -183,13 +183,50 @@ def extract_text_paddle_ocr(images):
texts = res_dic.get("rec_texts", [])
boxes = res_dic.get("rec_boxes", [])
full_response.extend(texts)
for text, box in zip(texts, boxes):
if isinstance(box, np.ndarray):
box = box.tolist()
# Normalize the box format
if all(isinstance(p, (int, float)) for p in box):
if len(box) % 2 == 0:
box = [[box[i], box[i + 1]] for i in range(0, len(box), 2)]
else:
print(f"[PaddleOCR] 잘못된 box 형식: {box}")
continue
# Convert ndarray → list
clean_boxes = [
box.tolist() if isinstance(box, np.ndarray) else box for box in boxes
]
coord_response.extend(clean_boxes)
coord_response.append(box)
# Compute center coordinates (y → line order, x → word order)
x_coords = [p[0] for p in box]
y_coords = [p[1] for p in box]
x_center = sum(x_coords) / len(x_coords)
y_center = sum(y_coords) / len(y_coords)
all_text_boxes.append((y_center, x_center, text))
# Position-based sorting
all_text_boxes.sort(key=lambda x: (x[0], x[1]))  # sort by y first, then x
# Group into lines
lines = []
current_line = []
prev_y = None
line_threshold = 15  # allowed y deviation when grouping words into one line
for y, x, text in all_text_boxes:
if prev_y is None or abs(y - prev_y) < line_threshold:
current_line.append((x, text))
else:
current_line.sort(key=lambda xx: xx[0])
lines.append(" ".join(t for _, t in current_line))
current_line = [(x, text)]
prev_y = y
if current_line:
current_line.sort(key=lambda xx: xx[0])
lines.append(" ".join(t for _, t in current_line))
parsed_text = "\n".join(lines)
print("[PaddleOCR] 전체 페이지 텍스트 및 좌표 추출 완료")
return " ".join(full_response), coord_response
return parsed_text, coord_response
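
The returned text is now built by sorting every recognized fragment by the center of its box (top to bottom, then left to right) and merging fragments whose y-centers differ by less than line_threshold into one line. A standalone sketch of the same grouping on plain (y_center, x_center, text) tuples, handy for testing the threshold in isolation; the function name is illustrative:

def group_into_lines(text_boxes, line_threshold: float = 15.0) -> str:
    """text_boxes: iterable of (y_center, x_center, text) tuples."""
    boxes = sorted(text_boxes, key=lambda b: (b[0], b[1]))  # top to bottom, then left to right
    lines, current, prev_y = [], [], None
    for y, x, text in boxes:
        if prev_y is None or abs(y - prev_y) < line_threshold:
            current.append((x, text))            # same visual line
        else:
            current.sort(key=lambda item: item[0])
            lines.append(" ".join(t for _, t in current))
            current = [(x, text)]                # start a new line
        prev_y = y
    if current:
        current.sort(key=lambda item: item[0])
        lines.append(" ".join(t for _, t in current))
    return "\n".join(lines)

# Example: two words on one line, one word below.
print(group_into_lines([(10, 5, "Hello"), (12, 80, "world"), (40, 5, "Bye")]))
# -> "Hello world\nBye"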