commit 82d40f625aa92087ec5d52d09e79799936946af4 Author: kyy Date: Tue Aug 12 10:10:59 2025 +0900 Initial commit diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b60060f --- /dev/null +++ b/.gitattributes @@ -0,0 +1,32 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..28f5466 --- /dev/null +++ b/.gitignore @@ -0,0 +1,173 @@ +#output, model, etc +output +cache + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +.vscode/ +flagged/ + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +workspace/.ruff_cache +workspace/__pycache__ +workspace/data +venv2 +/workspace/audio +/workspace/results +.venv_stt \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5ce49fb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM paddlepaddle/paddle:3.1.0-gpu-cuda11.8-cudnn8.9 + +# 1. 필수 apt 패키지 설치 +RUN apt-get update && \ + apt-get install -y \ + poppler-utils \ + tesseract-ocr \ + tesseract-ocr-kor \ + libgl1 \ + curl \ + tree \ + git \ + build-essential && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +# 3. Python 패키지 설치 (이미 설치된 것과 중복 주의) +COPY requirements.txt . +RUN pip install --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt && \ + pip install --no-cache-dir paddleocr + +# 4. 작업 디렉토리 및 소스 복사 +WORKDIR /workspace +COPY . . + +# 5. 실행 명령 +CMD ["sh", "-c", "uvicorn api:app --workers 2 --host 0.0.0.0 --port ${PORT:-8892}"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4af313f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Lectom + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..c76f586 --- /dev/null +++ b/README.md @@ -0,0 +1,100 @@ +# OCR Performance Lab + +## 개요 + +이 프로젝트는 다양한 OCR(광학 문자 인식) 엔진의 성능을 비교하고 평가하기 위한 비동기 처리 API 서버입니다. FastAPI를 기반으로 구축되었으며, Celery를 사용하여 무거운 OCR 작업을 백그라운드에서 효율적으로 처리합니다. 현재 Paddle OCR과 Upstage OCR을 지원하며, 파일 업로드를 통해 OCR 작업을 요청하고 진행 상태를 추적할 수 있습니다. + +## 주요 기능 + +- **비동기 OCR 처리**: Celery를 사용하여 OCR 작업을 비동기적으로 처리하므로, 대용량 파일이나 다수의 요청에도 빠른 응답 시간을 보장합니다. +- **다중 OCR 엔진 지원**: Paddle OCR, Upstage OCR 등 여러 OCR 엔진을 지원하며, 새로운 엔진을 쉽게 추가할 수 있는 구조로 설계되었습니다. +- **실시간 진행 상태 조회**: 각 OCR 작업에 대한 고유 ID를 발급하고, 이를 통해 작업의 현재 상태와 최종 결과를 언제든지 조회할 수 있습니다. +- **Docker 기반 환경**: Docker Compose를 통해 API 서버, Celery 워커, Redis 등 프로젝트 실행에 필요한 모든 환경을 한 번에 구성할 수 있습니다. + +## 기술 스택 + +- **API 서버**: FastAPI +- **비동기 작업 큐**: Celery +- **메시지 브로커 및 결과 백엔드**: Redis +- **OCR 라이브러리**: PaddleOCR, Upstage API +- **배포**: Docker, Docker Compose + +## 프로젝트 설정 + +### 요구사항 + +- Docker +- Docker Compose + +### 설치 및 실행 + +1. **프로젝트 클론** + + ```bash + git clone https://github.com/your-repository/ocr_performance_lab.git + cd ocr_performance_lab + ``` + +2. **환경 변수 설정** + + 프로젝트 루트 디렉터리에 `.env` 파일을 생성하고, Upstage API 키를 추가합니다. + + ``` + UPSTAGE_API_KEY=YOUR_UPSTAGE_API_KEY + ``` + +3. **Docker 컨테이너 실행** + + Docker Compose를 사용하여 프로젝트의 모든 서비스를 실행합니다. + + ```bash + docker-compose up -d --build + ``` + + - API 서버는 `http://localhost:8892`에서 실행됩니다. + - Celery 모니터링 도구인 Flower는 `http://localhost:5557`에서 확인할 수 있습니다. + +## API 엔드포인트 + +API 문서는 서버 실행 후 `http://localhost:8892/docs`에서 확인할 수 있습니다. + +### OCR 작업 요청 + +- **POST /ocr/paddle** + - **설명**: Paddle OCR을 사용하여 OCR 작업을 요청합니다. + - **요청 본문**: `multipart/form-data` 형식의 파일 목록 + - **응답**: 접수된 각 파일에 대한 작업 정보 (request_id, task_id 등) + +- **POST /ocr/upstage** + - **설명**: Upstage OCR API를 사용하여 OCR 작업을 요청합니다. + - **요청 본문**: `multipart/form-data` 형식의 파일 목록 + - **응답**: 접수된 각 파일에 대한 작업 정보 (request_id, task_id 등) + +### OCR 진행 상태 및 결과 조회 + +- **GET /ocr/progress/{request_id}** + - **설명**: `request_id`를 사용하여 OCR 작업의 진행 상태와 최종 결과를 조회합니다. + - **경로 매개변수**: + - `request_id` (string): 작업 요청 시 발급된 고유 ID + - **응답**: 작업 상태, 진행 로그, 최종 OCR 결과 + +## 사용 예시 + +```python +import requests + +# Paddle OCR 요청 +files = [('files', open('document.pdf', 'rb'))] +response = requests.post("http://localhost:8892/ocr/paddle", files=files) +print(response.json()) + +# Upstage OCR 요청 +files = [('files', open('image.png', 'rb'))] +response = requests.post("http://localhost:8892/ocr/upstage", files=files) +print(response.json()) + +# 결과 조회 (위 응답에서 받은 request_id 사용) +request_id = "your_request_id" +response = requests.get(f"http://localhost:8892/ocr/progress/{request_id}") +print(response.json()) +``` diff --git a/api.py b/api.py new file mode 100644 index 0000000..7a81d79 --- /dev/null +++ b/api.py @@ -0,0 +1,126 @@ +# ocr/api.py +import logging + +import httpx +from config.setting import CELERY_FLOWER +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from prometheus_fastapi_instrumentator import Instrumentator +from router import ocr_router +from utils.celery_utils import celery_app +from utils.celery_utils import health_check as celery_health_check_task +from utils.redis_utils import get_redis_client + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s - %(message)s" +) + + +app = FastAPI(title="OCR GATEWAY", description="OCR API 서비스", docs_url="/docs") + +app.add_middleware( + CORSMiddleware, + allow_origins=[ + "http://172.16.42.101", + "http://gsim.hanmaceng.co.kr", + "http://gsim.hanmaceng.co.kr:6464", + ], + allow_origin_regex=r"http://(172\.16\.\d{1,3}\.\d{1,3}|gsim\.hanmaceng\.co\.kr)(:\d+)?", + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Prometheus Metrics Exporter 활성화 +Instrumentator().instrument(app).expose(app) + +app.include_router(ocr_router) + + +@app.get("/health/API") +async def health_check(): + """애플리케이션 상태 확인""" + return {"status": "API ok"} + + +@app.get("/health/Redis") +def redis_health_check(): + client = get_redis_client() + if client is None: + raise HTTPException(status_code=500, detail="Redis connection failed") + try: + client.ping() + return {"status": "Redis ok"} + except Exception: + raise HTTPException(status_code=500, detail="Redis ping failed") + + +@app.get("/health/Celery") +async def celery_health_check(): + """Celery 워커 상태 확인""" + # celery_app = get_celery_app() # 이제 celery_utils에서 직접 임포트합니다. + + try: + # 1. 워커들에게 ping 보내기 + active_workers = celery_app.control.ping(timeout=1.0) + if not active_workers: + raise HTTPException( + status_code=503, detail="No active Celery workers found." + ) + + # 2. 간단한 작업 실행하여 E2E 확인 + task = celery_health_check_task.delay() + result = task.get(timeout=10) # 10초 타임아웃 + + if task.state == "SUCCESS" and result.get("status") == "ok": + return { + "status": "Celery is healthy", + "active_workers": active_workers, + "task_status": "SUCCESS", + } + else: + raise HTTPException( + status_code=500, + detail=f"Celery health check task failed with state: {task.state}", + ) + + except HTTPException as e: + # 이미 HTTPException인 경우 그대로 전달 + raise e + except Exception as e: + logging.error(f"Celery health check failed: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"An error occurred during Celery health check: {str(e)}", + ) + + +@app.get("/health/Flower") +async def flower_health_check(): + """Flower 모니터링 대시보드 상태 확인""" + try: + flower_api_url = CELERY_FLOWER # Use the full URL from settings + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(flower_api_url) + response.raise_for_status() + + # Just check if the API is reachable + if response.status_code == 200: + return {"status": "Flower is running"} + else: + raise HTTPException( + status_code=response.status_code, + detail=f"Flower API returned status {response.status_code}", + ) + + except httpx.RequestError as e: + logging.error(f"Could not connect to Flower: {e}", exc_info=True) + raise HTTPException( + status_code=503, detail=f"Could not connect to Flower: {str(e)}" + ) + except Exception as e: + logging.error(f"Flower health check failed: {e}", exc_info=True) + raise HTTPException( + status_code=500, + detail=f"An error occurred during Flower health check: {str(e)}", + ) diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/setting.py b/config/setting.py new file mode 100644 index 0000000..ed262d9 --- /dev/null +++ b/config/setting.py @@ -0,0 +1,21 @@ +import os + +from dotenv import load_dotenv + +load_dotenv() + +# Redis 기본 설정 +REDIS_HOST = "ocr_redis" +REDIS_PORT = 6379 +REDIS_DB = 0 + +# Celery 설정 +CELERY_BROKER_URL = f"redis://{REDIS_HOST}:{REDIS_PORT}/0" +CELERY_RESULT_BACKEND = f"redis://{REDIS_HOST}:{REDIS_PORT}/1" + +# Celery Flower 설정 +CELERY_FLOWER = "http://ocr_celery_flower:5557/api/workers" + +# Upstage API Key +UPSTAGE_API_KEY = os.getenv("UPSTAGE_API_KEY") + diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7b2fbf8 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,113 @@ +services: + ocr_perf_api: + build: + context: . + image: ocr_perf_api + container_name: ocr_perf_lab_api + ports: + - "8892:8892" + volumes: + - ./:/workspace + env_file: + - .env + environment: + - TZ=Asia/Seoul + - CELERY_BROKER_URL=redis://ocr_perf_lab_redis:6379/0 + - CELERY_RESULT_BACKEND=redis://ocr_perf_lab_redis:6379/1 + - TESSDATA_PREFIX=/usr/share/tessdata + restart: always + networks: + - ocr_perf_net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + depends_on: + ocr_perf_lab_redis: + condition: service_healthy + healthcheck: + test: + [ + "CMD-SHELL", + "curl -f http://localhost:8892/health/API && curl -f http://localhost:8892/health/Redis && curl -f http://localhost:8892/health/Celery && curl -f http://localhost:8892/health/Flower", + ] + interval: 60s + timeout: 5s + retries: 3 + start_period: 10s + + ocr_perf_lab_worker: + image: ocr_perf_api + container_name: ocr_perf_lab_worker + volumes: + - ./:/workspace + environment: + - TZ=Asia/Seoul + - CELERY_BROKER_URL=redis://ocr_perf_lab_redis:6379/0 + - CELERY_RESULT_BACKEND=redis://ocr_perf_lab_redis:6379/1 + - TESSDATA_PREFIX=/usr/share/tessdata + command: celery -A tasks worker --loglevel=info --concurrency=4 + networks: + - ocr_perf_net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + depends_on: + ocr_perf_lab_redis: + condition: service_healthy + + ocr_perf_celery_flower: + image: ocr_perf_api + container_name: ocr_perf_celery_flower + environment: + - TZ=Asia/Seoul + - FLOWER_UNAUTHENTICATED_API=true + - TESSDATA_PREFIX=/usr/share/tessdata + entrypoint: celery --broker=redis://ocr_perf_lab_redis:6379/0 flower --port=5557 + ports: + - "5557:5557" + networks: + - ocr_perf_net + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + depends_on: + ocr_perf_lab_redis: + condition: service_healthy + + ocr_perf_lab_redis: + image: redis:7-alpine + container_name: ocr_perf_lab_redis + command: + [ + "redis-server", + "--maxmemory", + "256mb", + "--maxmemory-policy", + "allkeys-lru", + ] + ports: + - "6383:6379" + restart: always + networks: + - ocr_perf_net + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + +networks: + ocr_perf_net: + driver: bridge diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..6cc45da --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,46 @@ +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".mypy_cache", + ".nox", + ".pants.d", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "venv", +] + +line-length = 120 +indent-width = 4 + +[lint] +# 기본적으로 Pyflakes('F')와 pycodestyle('E') 코드의 하위 집합을 활성화합니다. +select = ["E4", "E7", "E9", "F"] +ignore = [] + +# 활성화된 모든 규칙에 대한 수정 허용. +fixable = ["ALL"] +unfixable = [] + +# 밑줄 접두사가 붙은 경우 사용하지 않는 변수를 허용합니다. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" +docstring-code-format = false +docstring-code-line-length = "dynamic" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4550144 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,20 @@ +fastapi +uvicorn[standard] +asyncio +pdf2image +PyMuPDF +python-docx +Pillow +aiofiles +httpx +aiofiles +snowflake-id + +prometheus-fastapi-instrumentator +python-multipart +redis +celery +flower + +minio +opencv-python-headlesspython-dotenv diff --git a/router/__init__.py b/router/__init__.py new file mode 100644 index 0000000..8a6ec31 --- /dev/null +++ b/router/__init__.py @@ -0,0 +1,3 @@ +from .ocr_router import router as ocr_router + +__all__ = ["ocr_router"] diff --git a/router/ocr_router.py b/router/ocr_router.py new file mode 100644 index 0000000..ff3bdf6 --- /dev/null +++ b/router/ocr_router.py @@ -0,0 +1,127 @@ +import json +import os +import tempfile +from datetime import datetime +from typing import List + +from celery import chain +from celery.result import AsyncResult +from fastapi import APIRouter, File, HTTPException, UploadFile +from fastapi.responses import JSONResponse +from tasks import ( + call_upstage_ocr_api, + celery_app, + parse_ocr_text, + store_ocr_result, +) +from utils.checking_keys import create_key +from utils.redis_utils import get_redis_client + +router = APIRouter(prefix="/ocr", tags=["OCR"]) +redis_client = get_redis_client() + + +async def _process_ocr_request(files: List[UploadFile], ocr_task): + results = [] + for file in files: + if not file.filename: + raise HTTPException(status_code=400, detail="파일 이름이 없습니다.") + + tmp_path = "" + try: + suffix = os.path.splitext(file.filename)[-1] + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file: + content = await file.read() + tmp_file.write(content) + tmp_path = tmp_file.name + except Exception as e: + raise HTTPException(status_code=500, detail=f"파일 저장 실패: {str(e)}") + finally: + await file.close() + + request_id = create_key() + task_id = create_key() + + task_chain = chain( + ocr_task.s( + tmp_path=tmp_path, request_id=request_id, file_name=file.filename + ), + store_ocr_result.s(request_id=request_id, task_id=task_id), + ) + task_chain.apply_async(task_id=task_id) + + try: + redis_client.hset("ocr_task_mapping", request_id, task_id) + except Exception as e: + if tmp_path and os.path.exists(tmp_path): + os.remove(tmp_path) + raise HTTPException( + status_code=500, detail=f"작업 정보 저장 오류: {str(e)}" + ) + + try: + log_entry = { + "status": "작업 접수", + "timestamp": datetime.now().isoformat(), + "task_id": task_id, + "initial_file": file.filename, + } + redis_client.rpush(f"ocr_status:{request_id}", json.dumps(log_entry)) + except Exception: + pass + + results.append( + { + "message": "OCR 작업이 접수되었습니다.", + "request_id": request_id, + "task_id": task_id, + "status_check_url": f"/ocr/progress/{request_id}", + "filename": file.filename, + } + ) + return JSONResponse(content={"results": results}) + + +@router.post("/paddle", summary="[Paddle] 파일 업로드 기반 비동기 OCR") +async def ocr_paddle_endpoint(files: List[UploadFile] = File(...)): + return await _process_ocr_request(files, parse_ocr_text) + + +@router.post("/upstage", summary="[Upstage] 파일 업로드 기반 비동기 OCR") +async def ocr_upstage_endpoint(files: List[UploadFile] = File(...)): + return await _process_ocr_request(files, call_upstage_ocr_api) + + +@router.get("/progress/{request_id}", summary="📊 OCR 진행 상태 및 결과 조회") +async def check_progress(request_id: str): + task_id = redis_client.hget("ocr_task_mapping", request_id) + if not task_id: + raise HTTPException(status_code=404, detail=f"ID {request_id} 작업을 찾을 수 없습니다.") + + result = AsyncResult(task_id, app=celery_app) + status = result.status + + try: + logs = redis_client.lrange(f"ocr_status:{request_id}", 0, -1) + parsed_logs = [json.loads(log) for log in logs] + except Exception as e: + parsed_logs = [{"status": "로그 조회 실패", "error": str(e)}] + + final_result = None + if status == "SUCCESS": + try: + result_str = redis_client.get(f"ocr_result:{task_id}") + if result_str: + final_result = json.loads(result_str) + except Exception as e: + final_result = {"error": f"결과 조회 실패: {str(e)}"} + + return JSONResponse( + content={ + "request_id": request_id, + "task_id": task_id, + "celery_status": status, + "progress_logs": parsed_logs, + "final_result": final_result, + } + ) diff --git a/tasks.py b/tasks.py new file mode 100644 index 0000000..c743f1f --- /dev/null +++ b/tasks.py @@ -0,0 +1,125 @@ +import asyncio +import json +import logging +import os +import time +from datetime import datetime + +import httpx +import redis +from celery import Task +from config.setting import ( + REDIS_DB, + REDIS_HOST, + REDIS_PORT, + UPSTAGE_API_KEY, +) +from utils.celery_utils import celery_app +from utils.ocr_processor import ocr_process +from utils.text_extractor import extract_text_from_file + +# ✅ Redis 클라이언트 생성 +redis_client = redis.Redis( + host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=True +) + +# ✅ 로깅 설정 +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +# ✅ 공통 Task 베이스 클래스 - 상태 로그 기록 및 예외 후킹 제공 +class BaseTaskWithProgress(Task): + abstract = True + + def update_progress(self, request_id, status_message, step_info=None): + log_entry = { + "status": status_message, + "timestamp": datetime.now().isoformat(), + "step_info": step_info, + } + redis_client.rpush(f"ocr_status:{request_id}", json.dumps(log_entry)) + logger.info(f"[{request_id}] Task Progress: {status_message}") + + def on_failure(self, exc, task_id, args, kwargs, einfo): + request_id = kwargs.get("request_id", "unknown") + self.update_progress( + request_id, + "작업 오류 발생", + {"error": str(exc), "traceback": str(einfo)}, + ) + logger.error(f"[{request_id}] Task Failed: {exc}") + # 실패 시 임시 파일 삭제 + tmp_path = kwargs.get("tmp_path") + if tmp_path and os.path.exists(tmp_path): + try: + os.remove(tmp_path) + self.update_progress(request_id, "임시 파일 삭제 완료") + except Exception as e: + logger.error(f"[{request_id}] 임시 파일 삭제 실패: {e}") + + super().on_failure(exc, task_id, args, kwargs, einfo) + + def on_success(self, retval, task_id, args, kwargs): + request_id = kwargs.get("request_id", "unknown") + self.update_progress(request_id, "작업 완료") + logger.info(f"[{request_id}] Task Succeeded") + super().on_success(retval, task_id, args, kwargs) + + +# ✅ (Paddle) Step 2: OCR 및 후처리 수행 +@celery_app.task(bind=True, base=BaseTaskWithProgress) +def parse_ocr_text(self, tmp_path: str, request_id: str, file_name: str): + self.update_progress(request_id, "Paddle OCR 작업 시작") + start_time = time.time() + text, coord, ocr_model = asyncio.run(extract_text_from_file(tmp_path)) + end_time = time.time() + self.update_progress(request_id, "텍스트 추출 및 후처리 완료") + result_json = ocr_process(file_name, ocr_model, coord, text, start_time, end_time) + return {"result": result_json, "tmp_path": tmp_path} + + +# ✅ (Upstage) Step 2: Upstage OCR API 호출 +@celery_app.task(bind=True, base=BaseTaskWithProgress) +def call_upstage_ocr_api(self, tmp_path: str, request_id: str, file_name: str): + self.update_progress(request_id, "Upstage OCR 작업 시작") + + if not UPSTAGE_API_KEY: + raise ValueError("Upstage API 키가 설정되지 않았습니다.") + + url = "https://api.upstage.ai/v1/document-digitization" + headers = {"Authorization": f"Bearer {UPSTAGE_API_KEY}"} + + try: + with open(tmp_path, "rb") as f: + files = {"document": (file_name, f, "application/octet-stream")} + data = {"model": "ocr"} + with httpx.Client() as client: + response = client.post(url, headers=headers, files=files, data=data) + response.raise_for_status() + self.update_progress(request_id, "Upstage API 호출 성공") + return {"result": response.json(), "tmp_path": tmp_path} + except httpx.HTTPStatusError as e: + logger.error(f"Upstage API 오류: {e.response.text}") + raise RuntimeError(f"Upstage API 오류: {e.response.status_code}") + except Exception as e: + logger.error(f"Upstage API 호출 중 예외 발생: {e}") + raise RuntimeError("Upstage API 호출 실패") + + +# ✅ Step 3: 결과 Redis 저장 및 임시 파일 삭제 +@celery_app.task(bind=True, base=BaseTaskWithProgress, ignore_result=True) +def store_ocr_result(self, data: dict, request_id: str, task_id: str): + self.update_progress(request_id, "결과 저장 중") + redis_key = f"ocr_result:{task_id}" + redis_client.set(redis_key, json.dumps(data.get("result", {}))) + + tmp_path = data.get("tmp_path") + if tmp_path and os.path.exists(tmp_path): + try: + os.remove(tmp_path) + self.update_progress(request_id, "임시 파일 삭제 완료") + except Exception as e: + logger.warning(f"[{request_id}] 임시 파일 삭제 실패: {e}") + + self.update_progress(request_id, "모든 작업 완료") diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/celery_utils.py b/utils/celery_utils.py new file mode 100644 index 0000000..0b7958c --- /dev/null +++ b/utils/celery_utils.py @@ -0,0 +1,13 @@ +# utils/celery_utils.py +from celery import Celery +from config.setting import CELERY_BROKER_URL, CELERY_RESULT_BACKEND + +# Define and export the single Celery app instance +celery_app = Celery( + "ocr_tasks", broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND +) + + +@celery_app.task(name="health_check") +def health_check(): + return {"status": "ok"} diff --git a/utils/checking_keys.py b/utils/checking_keys.py new file mode 100644 index 0000000..e2f5ca3 --- /dev/null +++ b/utils/checking_keys.py @@ -0,0 +1,15 @@ +import logging +import os + +from dotenv import load_dotenv +from snowflake import SnowflakeGenerator + +logger = logging.getLogger(__name__) +load_dotenv() + +def create_key(node: int = 1) -> str: + """ + Snowflake 알고리즘 기반 고유 키 생성기 (request_id용) + """ + generator = SnowflakeGenerator(node) + return str(next(generator)) diff --git a/utils/ocr_processor.py b/utils/ocr_processor.py new file mode 100644 index 0000000..4a621bd --- /dev/null +++ b/utils/ocr_processor.py @@ -0,0 +1,14 @@ +def ocr_process(filename, ocr_model, coord, text, start_time, end_time): + json_data = { + "filename": filename, + "model": {"ocr_model": ocr_model}, + "time": { + "duration_sec": f"{end_time - start_time:.2f}", + "started_at": start_time, + "ended_at": end_time, + }, + "fields": coord, + "parsed": text, + } + + return json_data diff --git a/utils/redis_utils.py b/utils/redis_utils.py new file mode 100644 index 0000000..1eda51f --- /dev/null +++ b/utils/redis_utils.py @@ -0,0 +1,22 @@ +# utils/redis_utils.py + +import redis +from config.setting import REDIS_DB, REDIS_HOST, REDIS_PORT + + +def get_redis_client(): + """ + Redis 클라이언트를 반환합니다. decode_responses=True 설정으로 문자열을 자동 디코딩합니다. + """ + try: + redis_client = redis.Redis( + host=REDIS_HOST, + port=REDIS_PORT, + db=REDIS_DB, + decode_responses=True, + ) + # 연결 확인 (ping) + redis_client.ping() + return redis_client + except redis.ConnectionError as e: + raise RuntimeError(f"Redis 연결 실패: {e}") diff --git a/utils/text_extractor.py b/utils/text_extractor.py new file mode 100644 index 0000000..f8ad74b --- /dev/null +++ b/utils/text_extractor.py @@ -0,0 +1,198 @@ +import asyncio +import logging +import os +import re + +import cv2 +import docx # PyMuPDF, python-docx +import fitz +import numpy as np +import pytesseract +from paddleocr import PaddleOCR +from pdf2image import convert_from_path +from PIL import Image + +logger = logging.getLogger(__name__) + + +async def extract_text_from_file(file_path): + ext = os.path.splitext(file_path)[-1].lower() + images = [] + + if ext == ".pdf": + # ① 먼저 PDF에서 텍스트 추출 시도 + text_only = await asyncio.to_thread(extract_text_from_pdf_direct, file_path) + if text_only.strip(): + logger.info( + "[UTILS-TEXT] PDF는 텍스트 기반입니다. (OCR 없이 텍스트 추출 완료)" + ) + return text_only, [], "OCR not used" + + # ② 텍스트가 없으면 이미지 변환 → OCR 수행 + images = await asyncio.to_thread(convert_from_path, file_path, dpi=400) + page_count = len(images) + logger.info(f"[UTILS-CONVERT] PDF에서 이미지로 변환 완료 ({page_count} 페이지)") + + elif ext in [".jpg", ".jpeg", ".png"]: + img = await asyncio.to_thread(Image.open, file_path) + images = [img] + logger.info("[UTILS-IMAGE] 이미지 파일 로딩 완료") + + elif ext == ".docx": + text_only = await asyncio.to_thread(extract_text_from_docx, file_path) + logger.info("[UTILS-DOCX] Word 문서 텍스트 추출 완료") + return text_only, [], "OCR not used" + + else: + logger.error( + "[ERROR] 지원하지 않는 파일 형식입니다. (PDF, JPG, JPEG, PNG, DOCX만 허용)" + ) + raise ValueError("지원하지 않는 파일 형식입니다.") + + # full_response, coord_response = await asyncio.to_thread( + # extract_text_ocr, images + # ) + # return full_response, coord_response, "pytesseract" + full_response, coord_response = await asyncio.to_thread( + extract_text_paddle_ocr, images + ) + return full_response, coord_response, "paddle_ocr" + + +# ✅ PDF 텍스트 기반 여부 확인 및 텍스트 추출 +def extract_text_from_pdf_direct(pdf_path): + text = "" + try: + with fitz.open(pdf_path) as doc: + for page in doc: + text += page.get_text() + valid_chars = re.findall(r"[가-힣a-zA-Z]", text) + logger.info(f"len(valid_chars): {len(valid_chars)}") + if len(valid_chars) < 10: + return text # 텍스트가 충분하지 않으면 바로 반환 + else: + text += page.get_text() + except Exception as e: + logger.info("[ERROR] PDF 텍스트 추출 실패:", e) + return text + + +# ✅ DOCX 텍스트 추출 +def extract_text_from_docx(docx_path): + text = "" + try: + doc = docx.Document(docx_path) + for para in doc.paragraphs: + text += para.text + "\n" + except Exception as e: + logger.info("[ERROR] DOCX 텍스트 추출 실패:", e) + return text + + +# ✅ OCR 전 이미지 전처리 함수 +def preprocess_image_for_ocr(pil_img, page_idx=None): + logger.info("[UTILS-OCR] 이미지 전처리 시작") + img = np.array(pil_img.convert("RGB")) # PIL → OpenCV 변환 + img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) # 그레이스케일 변환 + img = cv2.bilateralFilter(img, 9, 75, 75) # 노이즈 제거 + img = cv2.adaptiveThreshold( + img, + 255, + cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY, + 31, + 10, # 대비 향상 + ) + img = cv2.resize( + img, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR + ) # 해상도 확대 + + # # ✅ 전처리 이미지 저장 + # save_path = os.path.join("preprocess_image.png") + # logger.info(f"[UTILS-OCR] 전처리 이미지 저장: {save_path}") + # cv2.imwrite(save_path, img) + + return Image.fromarray(img) + + +# ✅ OCR 수행 (좌표 포함) +def extract_text_ocr(images): + """ + tesseract를 사용하여 이미지에서 텍스트 추출 및 좌표 정보 반환 + """ + all_texts = [] + coord_response = [] + + for page_idx, img in enumerate(images): + logger.info(f"[UTILS-OCR] 페이지 {page_idx + 1} OCR로 텍스트 추출 중...") + pre_img = preprocess_image_for_ocr(img) + text = pytesseract.image_to_string( + pre_img, lang="kor+eng", config="--oem 3 --psm 6" + ) + all_texts.append(text) + + ocr_data = pytesseract.image_to_data( + pre_img, + output_type=pytesseract.Output.DICT, + lang="kor+eng", + config="--oem 3 --psm 6", + ) + for i in range(len(ocr_data["text"])): + word = ocr_data["text"][i].strip() + if word == "": + continue + x, y, w, h = ( + ocr_data["left"][i], + ocr_data["top"][i], + ocr_data["width"][i], + ocr_data["height"][i], + ) + coord_response.append( + {"text": word, "coords": [x, y, x + w, y + h], "page": page_idx + 1} + ) + + logger.info(f"[UTILS-OCR] 페이지 {page_idx + 1} 텍스트 및 좌표 추출 완료") + + full_response = "\n".join(all_texts) + return full_response, coord_response + + +def extract_text_paddle_ocr(images): + """ + PaddleOCR를 사용하여 이미지에서 텍스트 추출 및 좌표 정보 반환 + """ + # os.environ["CUDA_VISIBLE_DEVICES"] = "" # GPU 사용 안 함 + ocr = PaddleOCR( + use_doc_orientation_classify=False, use_doc_unwarping=False, lang="korean" + ) + + full_response = [] + coord_response = [] + + for page_idx, img in enumerate(images): + print(f"[PaddleOCR] 페이지 {page_idx + 1} OCR로 텍스트 추출 중...") + img_np = np.array(img) + + if len(img_np.shape) == 2: # grayscale → RGB 변환 + img_np = cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB) + + results = ocr.predict(input=img_np) + + print(f"[PaddleOCR] 페이지 {page_idx + 1} OCR 결과 개수: {len(results)}") + for res_idx, res in enumerate(results): + print(f"[PaddleOCR] 페이지 {page_idx + 1} 결과 {res_idx + 1}개 추출 완료") + + res_dic = dict(res.items()) + texts = res_dic.get("rec_texts", []) + boxes = res_dic.get("rec_boxes", []) + + full_response.extend(texts) + + # ndarray → list 변환 + clean_boxes = [ + box.tolist() if isinstance(box, np.ndarray) else box for box in boxes + ] + coord_response.extend(clean_boxes) + + print("[PaddleOCR] 전체 페이지 텍스트 및 좌표 추출 완료") + return " ".join(full_response), coord_response