From fc3ead893a4b59d88b9737f364526505973805fd Mon Sep 17 00:00:00 2001
From: "b24503@hanmaceng.co.kr" <devcontainers@DESKTOP-8K6RJRQ>
Date: Tue, 7 Jan 2025 09:11:27 +0900
Subject: [PATCH] Initial commit

---
 .gitattributes             |  32 +++++++
 .gitignore                 | 166 +++++++++++++++++++++++++++++++++
 Dockerfile                 |  32 +++++++
 LICENSE                    |  21 +++++
 README.md                  |  60 ++++++++++++
 docker-compose.yml         |  71 ++++++++++++++
 pyproject.toml             |  46 ++++++++++
 requirements.txt           |   7 ++
 workspace/ADR-0.md         |  77 ++++++++++++++++
 workspace/main.py          | 183 +++++++++++++++++++++++++++++++++++++
 workspace/setting.json     |  15 +++
 workspace/template.py      |  18 ++++
 workspace/tests/example.py |  12 +++
 workspace/worker.py        |  12 +++
 14 files changed, 752 insertions(+)
 create mode 100755 .gitattributes
 create mode 100755 .gitignore
 create mode 100755 Dockerfile
 create mode 100755 LICENSE
 create mode 100755 README.md
 create mode 100755 docker-compose.yml
 create mode 100755 pyproject.toml
 create mode 100755 requirements.txt
 create mode 100755 workspace/ADR-0.md
 create mode 100755 workspace/main.py
 create mode 100755 workspace/setting.json
 create mode 100755 workspace/template.py
 create mode 100755 workspace/tests/example.py
 create mode 100755 workspace/worker.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100755
index 0000000..d9a972e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,32 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pdf filter=lfs diff=lfs merge=lfs -text
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100755
index 0000000..094aab8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,166 @@
+#output, model, etc
+output
+cache
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+.vscode/
+flagged/
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/Dockerfile b/Dockerfile
new file mode 100755
index 0000000..98cd4f7
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,32 @@
+# Base image
+FROM pytorch/pytorch:2.1.2-cuda11.8-cudnn8-devel
+
+# Set timezone and working directory
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Asia/Seoul
+WORKDIR /opt/workspace
+
+# Use a faster mirror for apt-get
+RUN sed -i 's/archive.ubuntu.com/mirror.kakao.com/g' /etc/apt/sources.list
+
+# Install system dependencies (including dependencies for Hugging Face login)
+RUN apt-get update && apt-get install -y \
+    git \
+    vim \
+    curl \
+    && apt-get clean
+
+# Upgrade pip and install Python dependencies, including Hugging Face CLI
+COPY requirements.txt .
+RUN pip install --upgrade pip && \
+    pip install huggingface_hub && \
+    pip install -r requirements.txt
+
+# Copy project files into the container
+COPY . .
+
+# Set Python to output logs in real time
+ENV PYTHONUNBUFFERED=1
+
+# Default command to run both uvicorn server and worker
+# CMD ["/bin/bash", "-c", "uvicorn main:app --host 0.0.0.0 --port 8000 & python /opt/workspace/worker.py"]
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 0000000..2c00395
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Lectom
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100755
index 0000000..32f54b5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+# FastAPI LLM Inference Framework
+
+## 개요
+
+### 목적
+* FastAPI를 이용한 대규모 언어 모델(LLM) 서빙 및 추론 환경 제공
+
+### 특징
+* **LLM**을 활용하여 입력 텍스트 기반의 추론 수행
+* **다중 모델 지원:** 여러 모델에 대해 순차적으로 추론 가능
+* **유연한 배치 처리:** 대용량 CSV 데이터를 분할하여 효율적으로 처리
+* **Redis + RQ 큐**를 활용한 비동기 작업 관리 및 확장성 제공
+
+---
+
+## 사용하기
+
+### 빌드
+```
+docker compose build
+```
+
+Dockerfile 상 변경은 없으나 디펜던시 변경 또는 환경 설정 업데이트가 있는 경우, `--no-cache` 옵션을 사용하여 캐시 없이 재빌드가 가능합니다.
+
+### 실행
+```
+docker compose up
+```
+
+`docker-compose.yml`의 실행 진입점은 다음과 같습니다:
+```
+entrypoint: uvicorn main:app --reload --host 0.0.0.0 --port 8000
+```
+
+---
+
+## API
+
+### API 문서 확인
+FastAPI의 자동 생성 API 문서를 확인하려면:
+```
+http://localhost:8000/docs
+```
+
+### 주요 API 엔드포인트
+1. **추론 시작** (`POST /start-inference/`):
+   - CSV 데이터와 모델 리스트를 업로드하여 추론 작업 시작
+   - 결과 파일은 자동으로 저장됨
+
+2. **결과 병합** (`GET /merge-results/`):
+   - `processed` 디렉토리에 저장된 모든 `_result.csv` 파일을 병합하여 하나의 CSV 파일로 통합합니다.
+   - 병합된 최종 결과는 `processed/final_result.csv`로 저장되며, 이 파일의 경로가 응답으로 반환됩니다.
+   - **사용 사례**:
+     - 다중 모델 추론 결과를 한곳에 통합하여 분석하려는 경우.
+     - 배치별로 나뉜 결과를 단일 파일로 결합하려는 경우.
+
+3. **최신 결과 다운로드** (`GET /download-latest`):
+   - 가장 최근에 완료된 추론 결과를 다운로드
+
+---
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100755
index 0000000..10804f0
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,71 @@
+version: "3.8"
+
+services:
+  llm-asyncio:
+    build:
+      context: .                
+      dockerfile: Dockerfile      
+    shm_size: "1000gb"
+    volumes:
+      - ./workspace:/opt/workspace/
+      - ./cache:/root/.cache/
+    environment:
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+      PYTHONPATH: /opt/workspace/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: "all"
+              capabilities: [gpu]
+    container_name: llm-asyncio
+    ports:
+      - "8000:8000"
+    networks:
+      - llm-network
+    entrypoint: >
+      /bin/bash -c "
+      uvicorn main:app --reload --host 0.0.0.0 --port 8000
+      "
+    tty: true
+
+  redis:
+    image: redis:latest
+    container_name: redis-server
+    ports:
+      - "6379:6379"
+    restart: always
+    networks:
+      - llm-network
+
+  worker:
+    build:
+      context: .                  
+      dockerfile: Dockerfile      
+    shm_size: "1000gb"
+    volumes:
+      - ./workspace:/opt/workspace/        
+      - ./cache:/root/.cache/
+    environment:
+      PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+      PYTHONPATH: /opt/workspace/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: "all"
+              capabilities: [gpu]
+    networks:
+      - llm-network
+    entrypoint: >
+      /bin/bash -c "
+      python /opt/workspace/worker.py
+      "
+    tty: true
+    scale: 2
+      
+networks:
+  llm-network:
+    driver: bridge
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100755
index 0000000..6cc45da
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,46 @@
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
+
+line-length = 120
+indent-width = 4
+
+[lint]
+# 기본적으로 Pyflakes('F')와 pycodestyle('E') 코드의 하위 집합을 활성화합니다.
+select = ["E4", "E7", "E9", "F"]
+ignore = []
+
+# 활성화된 모든 규칙에 대한 수정 허용.
+fixable = ["ALL"]
+unfixable = []
+
+# 밑줄 접두사가 붙은 경우 사용하지 않는 변수를 허용합니다.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+[format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+docstring-code-format = false
+docstring-code-line-length = "dynamic"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 0000000..c0b6423
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+fastapi
+uvicorn
+pandas
+redis
+rq
+python-multipart
+vllm==0.6.2
\ No newline at end of file
diff --git a/workspace/ADR-0.md b/workspace/ADR-0.md
new file mode 100755
index 0000000..dcc2de3
--- /dev/null
+++ b/workspace/ADR-0.md
@@ -0,0 +1,77 @@
+# RESTful API 기반의 LLM 추론 아키텍처  
+FastAPI를 이용해 클라이언트가 RESTful API를 통해 서비스와 상호작용하도록 설계되었습니다.
+
+- **독립성**: 주요 기능(파일 업로드, 모델 추론, 작업 상태 확인)이 각기 독립된 API 엔드포인트로 구현되었습니다.
+
+---
+
+## 아키텍처 구성 요소  
+
+### (1) API 계층  
+- **FastAPI**:  
+  - 클라이언트와 직접 상호작용하는 API 레이어.  
+  - 비동기 요청을 처리하고 데이터 유효성 검사를 수행.  
+  - **주요 엔드포인트**:  
+    - `/start-inference/`: CSV 및 모델 리스트 파일 업로드 후 추론 작업 시작.
+    - `/merge-results/`: 배치별로 나뉜 결과를 단일 파일로 결합
+    - `/download-latest/`: 가장 최근에 완료된 결과 파일 다운로드.  
+
+### (2) 작업 처리 계층  
+- **RQ (Redis Queue)**:  
+  - 긴 작업(예: 대규모 LLM 추론)을 처리하기 위한 비동기 태스크 큐.  
+  - 작업 상태를 추적 가능하며, 확장성을 제공.  
+  - **주요 태스크**:  
+    - `run_inference`: 모델 로드 및 배치 기반 추론 처리.  
+
+### (3) 모델 및 비즈니스 로직 계층  
+- **LLM 모듈 (`vllm`)**:  
+  - LLM 모델(예: Llama, EXAONE)을 로드하고 추론을 수행.  
+  - GPU 메모리 최적화 및 모델 동적 로딩 지원.  
+- **템플릿 포맷팅 모듈 (`template.py`)**:  
+  - 각 모델에 특화된 입력 텍스트 포맷팅.  
+  - 사용자가 정의한 규칙 기반의 텍스트 전처리 지원.  
+
+### (4) 데이터 계층  
+- **파일 저장소**:  
+  - 업로드된 파일은 `/LLM_asyncio/uploaded` 디렉토리에 저장.  
+  - 추론 결과는 `/LLM_asyncio/processed` 디렉토리에 저장.  
+- **에러 처리 데이터**:  
+  - 추론 실패 또는 에러 발생 행은 `/LLM_asyncio/errors` 디렉토리에 저장하여 추적 가능.  
+
+---
+
+## 상태  
+**제안됨**  
+
+---
+
+## 콘텍스트  
+대규모 텍스트 데이터 추론은 높은 처리 비용과 시간이 소요됩니다.  
+이를 효율적으로 처리하기 위해 비동기 작업 관리, GPU 자원 활용 최적화, 배치 기반 설계를 도입하였습니다.  
+
+---
+
+## 결정  
+- **성능**: GPU 메모리 효율을 극대화하고 배치 기반 추론으로 처리 시간을 단축.  
+- **확장성**: Redis Queue와 다중 워커를 활용하여 대규모 요청 처리에 유연하게 대응.  
+- **안정성**: 오류 행 데이터를 별도로 저장하고 추적 가능하여 처리 안정성을 보장.  
+- **유지보수성**: 템플릿 기반 설계를 통해 다양한 모델에 대한 호환성을 확보.  
+
+---
+
+## 결과  
+- **사용자 경험 향상**:  
+  - 작업 상태를 실시간으로 확인하고 결과를 쉽게 다운로드할 수 있어 효율적인 워크플로우 제공.  
+- **확장 가능성**:  
+  - Redis와 RQ 워커를 추가하여 높은 트래픽 상황에도 대응 가능.  
+
+---
+
+## 컴플라이언스  
+- **모델 및 데이터**: 각 모델 사용 규약 및 데이터 처리 가이드라인 준수.  
+
+---
+
+## 노트  
+- **결정자**: 그래픽스개발팀 / 김용연  
+- **결정 날짜**: [2025.01.06]  
\ No newline at end of file
diff --git a/workspace/main.py b/workspace/main.py
new file mode 100755
index 0000000..1e9cf99
--- /dev/null
+++ b/workspace/main.py
@@ -0,0 +1,183 @@
+import os
+import pandas as pd
+from fastapi import FastAPI, UploadFile, BackgroundTasks
+from fastapi.responses import JSONResponse, FileResponse
+from redis import Redis
+from rq import Queue
+from vllm import LLM, SamplingParams
+import logging
+import gc
+import torch
+from tqdm import tqdm
+
+import sys
+sys.path.append("/opt/workspace/")
+from template import LLMInference
+
+app = FastAPI()
+
+# Redis 설정
+redis_conn = Redis(host="redis-server", port=6379, decode_responses=True)
+queue = Queue("model_tasks", connection=redis_conn)
+
+# 로깅 설정
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# FastAPI 엔드포인트: CSV 파일 및 모델 리스트 업로드 처리
+@app.post("/start-inference/")
+async def process_csv(input_csv: UploadFile, model_list_txt: UploadFile, background_tasks: BackgroundTasks):
+    logger.info(f"file_name: {input_csv},model_list_file: {model_list_txt}")
+    # 파일 형식 확인 및 저장
+    if not input_csv.filename.endswith(".csv") or not model_list_txt.filename.endswith(".txt"):
+        return JSONResponse(content={"error": "Invalid file format."}, status_code=400)
+
+    file_path = f"uploaded/{input_csv.filename}"
+    model_list_path = f"uploaded/{model_list_txt.filename}"
+    os.makedirs("uploaded", exist_ok=True)
+
+    with open(file_path, "wb") as f:
+        f.write(await input_csv.read())
+
+    with open(model_list_path, "wb") as f:
+        f.write(await model_list_txt.read())
+
+    df = pd.read_csv(file_path, encoding="euc-kr")
+    batch_size = 10
+    job_ids = []
+
+    # 데이터를 batch_size로 나누어 작업 큐에 추가
+    for i in range(0, len(df), batch_size):
+        batch_file_path = file_path.replace(".csv", f"_batch_{i}_{i+batch_size}.csv")
+        df.iloc[i:i+batch_size].to_csv(batch_file_path, index=False, encoding="utf-8")
+        job = queue.enqueue(run_inference, batch_file_path, model_list_path, job_timeout=1800)
+        job_ids.append(job.id)
+
+    logger.info(f"Jobs enqueued: {job_ids}")
+    return {"job_ids": job_ids, "status": "queued"}
+
+def chat_formating(input_sentence: str, model_name: str):
+
+    if "llama" in model_name:
+        hidden_prompt = LLMInference.llama_template()
+    elif "gemma" in model_name:
+        hidden_prompt = LLMInference.gemma_template()
+    elif "exaone" in model_name:
+        hidden_prompt = LLMInference.exaone_template()
+    else:
+        raise ValueError("Unknown model name: " + model_name)
+
+    formated_sentence = hidden_prompt.format(input_sent=input_sentence)
+    logger.info(f"Sentence: {formated_sentence}")
+    return formated_sentence
+
+# 모델 추론 함수
+def run_inference(batch_file_path: str, model_list_path: str):
+    try:
+        # 워커 ID 확인
+        worker_id = os.environ.get("HOSTNAME", "Unknown Worker")
+        logger.info(f"Worker {worker_id} started inference for batch file: {batch_file_path}")
+
+        # 모델 리스트 읽기
+        with open(model_list_path, "r") as f:
+            model_list = [line.strip() for line in f.readlines()]
+
+        if not model_list:
+            raise ValueError("The model list file is empty.")
+
+        # 배치 데이터 읽기
+        df = pd.read_csv(batch_file_path, encoding="utf-8")
+        if "input" not in df.columns:
+            raise ValueError("The input CSV must contain a column named 'input'.")
+
+        # 추론 수행
+        for model in model_list:
+            logger.info(f"Worker {worker_id} loading model: {model}")
+            try:
+                llm = LLM(model)
+                torch.cuda.empty_cache()
+                logger.info(f"Worker {worker_id} loaded model {model} successfully.")
+            except Exception as e:
+                logger.error(f"Worker {worker_id} error loading model {model}: {e}")
+                continue
+
+            sampling_params = SamplingParams(max_tokens=50, temperature=0.7, top_p=0.9, top_k=50)
+            responses = []
+
+            # tqdm 추가: 워커별 모델 진행 상태 표시
+            with tqdm(total=len(df), desc=f"[{worker_id}] Model: {model}") as pbar:
+                model_name = model.split("/")[-1]
+                for _, row in df.iterrows():
+                    try:
+                        input_text = chat_formating(input_sentence=row["input"], model_name=model_name)
+                        response = llm.generate(input_text, sampling_params)[0].outputs[0].text.strip()
+                        logger.info(f"Model: {model}, Input: {input_text}, Output: {response}")
+                        responses.append(response)
+                    except Exception as e:
+                        logger.error(f"Worker {worker_id} error during inference for model {model}, row {row.name}: {e}")
+                        error_rows = pd.concat([error_rows, pd.DataFrame([row])], ignore_index=True)
+                        responses.append(None)
+                    finally:
+                        pbar.update(1)
+
+            # 결과 추가
+            df[model_name] = responses
+            del llm
+            torch.cuda.empty_cache()
+            gc.collect()
+
+        # 배치 결과 저장
+        output_path = batch_file_path.replace("uploaded", "processed").replace(".csv", "_result.csv")
+        os.makedirs("processed", exist_ok=True)
+        df.to_csv(output_path, index=False, encoding="utf-8")
+        logger.info(f"Worker {worker_id} inference completed for batch. Result saved to: {output_path}")
+
+        # 에러 행 저장
+        if not error_rows.empty:
+            error_path = batch_file_path.replace("uploaded", "errors").replace(".csv", "_errors.csv")
+            os.makedirs("errors", exist_ok=True)
+            error_rows.to_csv(error_path, index=False, encoding="utf-8")
+            logger.info(f"Error rows saved to: {error_path}")
+
+        return output_path
+
+    except Exception as e:
+        logger.error(f"Worker {worker_id} error during inference: {e}")
+        raise
+
+@app.get("/merge-results/")
+def merge_results():
+    try:
+        processed_dir = "processed"
+        all_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir) if f.endswith("_result.csv")]
+        combined_df = pd.concat([pd.read_csv(f, encoding="utf-8") for f in all_files], ignore_index=True)
+        
+        final_output_path = os.path.join(processed_dir, "final_result.csv")
+        combined_df.to_csv(final_output_path, index=False, encoding="utf-8")
+        
+        logger.info(f"Final merged result saved to: {final_output_path}")
+        return {"final_result_path": final_output_path}
+    except Exception as e:
+        logger.error(f"Error during merging results: {e}")
+        return JSONResponse(content={"error": "Failed to merge results."}, status_code=500)
+    
+# 결과 파일 다운로드
+@app.get("/download-latest", response_class=FileResponse)
+def download_latest_file():
+    try:
+        # processed 디렉토리 경로
+        directory = "LLM_asyncio/processed"
+
+        csv_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".csv")]
+
+        if not csv_files:
+            return JSONResponse(content={"error": "No CSV files found in the processed directory."}, status_code=404)
+
+        latest_file = max(csv_files, key=os.path.getctime)
+
+        logger.info(f"Downloading latest file: {latest_file}")
+
+        return FileResponse(latest_file, media_type="application/csv", filename=os.path.basename(latest_file))
+    except Exception as e:
+        logger.error(f"Error during file download: {e}")
+        return JSONResponse(content={"error": "Failed to download the latest file."}, status_code=500)
\ No newline at end of file
diff --git a/workspace/setting.json b/workspace/setting.json
new file mode 100755
index 0000000..01d2ea3
--- /dev/null
+++ b/workspace/setting.json
@@ -0,0 +1,15 @@
+{
+    "terminal.integrated.fontFamily": "OxygenMono, D2CodingLigatureMono",
+    "terminal.integrated.fontWeightBold": "bold",
+    "[python]": {
+        "editor.defaultFormatter": "charliermarsh.ruff",
+        "editor.insertSpaces": true,
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+          "source.fixAll": "explicit",
+          "source.organizeImports": "explicit",
+          "editor.insertSpaces": "explicit"
+        }
+    }
+    
+}
\ No newline at end of file
diff --git a/workspace/template.py b/workspace/template.py
new file mode 100755
index 0000000..c8d573f
--- /dev/null
+++ b/workspace/template.py
@@ -0,0 +1,18 @@
+class LLMInference:
+    def __init__(self):
+        pass
+
+    @staticmethod    
+    def llama_template():
+        hidden_prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n친절한 건설안전전문가로서 반드시 [규칙]을 바탕으로 상대방의 요청에 답변을 생성해주세요.\n[규칙]\n1. 한 문장으로 핵심만 요약해서 답변을 생성합니다.\n2. 모든 답변은 반드시 한 문장의 한국어(Korean)으로 작성합니다.\n3. 생성된 답변의 신뢰성은 1(낮음)~5(높음)으로 평가합니다.4. 답변 형식은 다음과 같습니다. 답변.\n(신뢰성:3)<|eot_id|>\n<|start_header_id|>user<|end_header_id|>\n\n{input_sent}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
+        return hidden_prompt
+    
+    @staticmethod    
+    def gemma_template():
+        hidden_prompt = "<bos><start_of_turn>user\n친절한 건설안전전문가로서 반드시 [규칙]을 바탕으로 상대방의 요청에 답변을 생성해주세요.\n[규칙]\n1. 한 문장으로 핵심만 요약해서 답변을 생성합니다.\n2. 모든 답변은 반드시 한 문장의 한국어(Korean)으로 작성합니다.\n3. 생성된 답변의 신뢰성은 1(낮음)~5(높음)으로 평가합니다.4. 답변 형식은 다음과 같습니다. 답변.\n(신뢰성:3)\n\n{input_sent}<end_of_turn>\n<start_of_turn>model"
+        return hidden_prompt
+    
+    @staticmethod
+    def exaone_template():
+        hidden_prompt = "[|system|]친절한 건설안전전문가로서 반드시 [규칙]을 바탕으로 상대방의 요청에 답변을 생성해주세요.\n[규칙]\n1. 한 문장으로 핵심만 요약해서 답변을 생성합니다.\n2. 모든 답변은 반드시 한 문장의 한국어(Korean)으로 작성합니다.\n3. 생성된 답변의 신뢰성은 1(낮음)~5(높음)으로 평가합니다.4. 답변 형식은 다음과 같습니다. 답변.\n(신뢰성:3)[|endofturn|]\n\n[|user|]{input_sent}[|endofturn|]\n[|assistant|]"
+        return hidden_prompt
\ No newline at end of file
diff --git a/workspace/tests/example.py b/workspace/tests/example.py
new file mode 100755
index 0000000..6c4a131
--- /dev/null
+++ b/workspace/tests/example.py
@@ -0,0 +1,12 @@
+from vllm import LLM
+
+"""
+- max_model_len : 모델이 지원해주는 최대 시퀀스 길이입니다. 더 짧게도 가능하며, 모델이 지원해주는 가장 큰 값으로도 가능합니다. 저는 2048로 임의로 셋팅했습니다.
+- tensor_parallel_size : 앞서 vLLM을 소개할 때 vLLM은 분산 추론(distrubuted inference)를 지원합니다. 더 자세히 말하면 분산 텐서 병렬(distributed tenwor parallel)기반 inference 및 serving을 지원하는 것입니다. 이때 vLLM은 Ray를 활용해 분산 런타임을 지원합니다. 따라서 Python Ray가 설치되어 있어야하며, 이를 활용하면 쉽고 간단하게 gpu 등을 병렬로 처리할 수 있습니다. 저는 1이라고 셋팅해서 1개의 gpu를 사용하도록 설정했습니다.
+이렇게 올라온 모델을 사용해 이제 텍스트를 생성하는 text generate를 실행해보겠습니다. 다음과 같이 실행하면 됩니다.
+"""
+
+llm = LLM(model="yanolja/EEVE-Korean-Instruct-2.8B-v1.0", max_model_len=2048, tensor_parallel_size=1) # 모델로드
+
+requestoutput = llm.generate("안녕하십니까. 기상 캐스터 어시스턴트입니다. 오늘의 날씨는") # 입력문장
+print(requestoutput)
diff --git a/workspace/worker.py b/workspace/worker.py
new file mode 100755
index 0000000..4ed9b5d
--- /dev/null
+++ b/workspace/worker.py
@@ -0,0 +1,12 @@
+from redis import Redis
+from rq import Worker, Queue
+
+# Redis 설정
+redis_conn = Redis(host="redis-server", port=6379, decode_responses=False)
+
+# 작업 큐
+queue = Queue("model_tasks", connection=redis_conn)
+
+if __name__ == "__main__":
+    worker = Worker([queue], connection=redis_conn)
+    worker.work()
\ No newline at end of file