Add PKL storage and CSV conversion

This commit is contained in:
kyy
2025-01-09 16:25:32 +09:00
parent 4ced8db541
commit dcdc3157c3
2 changed files with 60 additions and 93 deletions

View File

@@ -2,6 +2,7 @@ import os
import pandas as pd
from fastapi import FastAPI, UploadFile, BackgroundTasks
from fastapi.responses import JSONResponse, FileResponse
import shutil
from redis import Redis
from rq import Queue
from vllm import LLM, SamplingParams
@@ -9,10 +10,8 @@ import logging
import gc
import torch
from tqdm import tqdm
import sys
sys.path.append("/workspace/LLM_asyncio")
from template import LLMInference
import pickle
app = FastAPI()
@@ -38,7 +37,7 @@ async def process_csv(input_csv: UploadFile, model_list_txt: UploadFile, backgro
model_list_path = f"uploaded/{model_list_txt.filename}"
os.makedirs("uploaded", exist_ok=True)
with open(file_path, "wb") as f:
with open(file_path, "wb") as f:
f.write(await input_csv.read())
with open(model_list_path, "wb") as f:
@@ -51,6 +50,30 @@ async def process_csv(input_csv: UploadFile, model_list_txt: UploadFile, backgro
logger.info(f"Job enqueued: {job.id}")
return {"job_id": job.id, "status": "queued"}
# CSV를 PKL로 변환
def save_to_pkl(dataframe: pd.DataFrame, output_path: str) -> str:
    """Serialize *dataframe* to a pickle file derived from *output_path*.

    The final ``.csv`` extension of *output_path* is swapped for ``.pkl``.
    Returns the path of the pickle file that was written.
    """
    # os.path.splitext only touches the trailing extension;
    # str.replace(".csv", ".pkl") would also corrupt a ".csv"
    # occurring earlier in the path (e.g. "uploads.csv_dir/a.csv").
    pkl_path = os.path.splitext(output_path)[0] + ".pkl"
    with open(pkl_path, "wb") as pkl_file:
        pickle.dump(dataframe, pkl_file)
    logger.info(f"Data saved as PKL: {pkl_path}")
    return pkl_path
# PKL을 CSV로 변환
def convert_pkl_to_csv(pkl_path: str, csv_path: str):
    """Materialize the DataFrame pickled at *pkl_path* as a UTF-8 CSV.

    Returns *csv_path* so callers can chain on the written location.
    """
    # NOTE(review): unpickling is only safe for files this service wrote
    # itself — never point this at user-supplied pickles.
    dataframe = pd.read_pickle(pkl_path)
    dataframe.to_csv(csv_path, index=False, encoding="utf-8")
    logger.info(f"PKL converted to CSV: {csv_path}")
    return csv_path
# CSV 파일 삭제 작업
def delete_csv_file(file_path: str) -> None:
    """Best-effort removal of the temporary CSV created for a download.

    Failures are logged, never raised, so a failed cleanup cannot break
    the response whose background task scheduled this call.
    """
    try:
        os.remove(file_path)
        logger.info(f"CSV file deleted: {file_path}")
    except OSError as e:
        # os.remove only raises OSError subclasses (FileNotFoundError,
        # PermissionError, ...); keep the handler narrow so genuine
        # programming errors are not silently swallowed.
        logger.error(f"Error deleting CSV file: {e}")
def chat_formating(input_sentence: str, model_name: str):
try:
if "llama" in model_name:
@@ -79,13 +102,18 @@ def run_inference(file_path: str, model_list_path: str, batch_size: int = 32):
raise ValueError("The model list file is empty.")
# CSV 읽기
df = pd.read_csv(file_path, encoding="euc-kr")
try:
df = pd.read_csv(file_path, encoding="euc-kr")
except Exception as e:
df = pd.read_csv(file_path, encoding="utf-8")
logger.info(f"Failed to read {file_path} as {e}")
if "input" not in df.columns:
raise ValueError("The input CSV must contain a column named 'input'.")
# 에러 발생한 행 저장용 DataFrame 초기화
error_rows = pd.DataFrame(columns=df.columns)
# 각 모델로 추론
for model in model_list:
model_name = model.split("/")[-1]
@@ -128,7 +156,8 @@ def run_inference(file_path: str, model_list_path: str, batch_size: int = 32):
# 결과 저장
output_path = file_path.replace("uploaded", "processed").replace(".csv", "_result.csv")
os.makedirs("processed", exist_ok=True)
df.to_csv(output_path, index=False, encoding="utf-8")
# df.to_csv(output_path, index=False, encoding="utf-8")
save_to_pkl(df, output_path)
logger.info(f"Inference completed. Result saved to: {output_path}")
# 에러 행 저장
@@ -143,22 +172,33 @@ def run_inference(file_path: str, model_list_path: str, batch_size: int = 32):
except Exception as e:
logger.error(f"Error during inference: {e}")
raise
# Convert the newest PKL result to CSV, serve it, then delete the CSV
@app.get("/download-latest-result", response_class=FileResponse)
def download_latest_result(background_tasks: BackgroundTasks):
    """Return the most recently produced inference result as a CSV download.

    Results are persisted on disk as pickles; the newest one is converted
    to a temporary CSV, which a background task removes after the response
    has been sent. Returns 404 JSON when no result exists, 500 on failure.
    """
    try:
        # Directory where run_inference stores its PKL results
        processed_dir = "processed"
        if not os.path.exists(processed_dir):
            return JSONResponse(content={"error": "Processed directory not found."}, status_code=404)

        pkl_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir) if f.endswith(".pkl")]
        if not pkl_files:
            return JSONResponse(content={"error": "No PKL files found in the processed directory."}, status_code=404)

        # Most recently created result wins
        latest_pkl = max(pkl_files, key=os.path.getctime)

        # splitext is safer than str.replace(".pkl", ".csv"), which would
        # also rewrite a ".pkl" occurring mid-path.
        csv_path = os.path.splitext(latest_pkl)[0] + ".csv"
        convert_pkl_to_csv(latest_pkl, csv_path)

        # Delete the temporary CSV only after the response is delivered.
        background_tasks.add_task(delete_csv_file, csv_path)

        # "text/csv" is the registered MIME type (RFC 4180);
        # "application/csv" is not a standard media type.
        return FileResponse(csv_path, media_type="text/csv", filename=os.path.basename(csv_path))
    except Exception as e:
        logger.error(f"Error during file download: {e}")
        return JSONResponse(content={"error": "Failed to download the result file."}, status_code=500)