도커라이징

This commit is contained in:
2025-02-13 16:37:45 +09:00
parent ad26488b1b
commit 0d34c20882
8 changed files with 267 additions and 95 deletions

View File

@@ -0,0 +1,19 @@
# Image for both the FastAPI service and the RQ worker (docker-compose
# overrides the command for the worker).
FROM python:3.11
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
WORKDIR /opt/workspace

# Install uv (fast pip replacement). ADD is appropriate here because the
# source is a remote URL.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN sh /uv-installer.sh && rm /uv-installer.sh
ENV PATH="/root/.local/bin/:$PATH"
RUN uv self update

# Install dependencies BEFORE copying the application source so this layer
# stays cached until requirements.txt itself changes.
COPY requirements.txt .
RUN uv pip install --no-cache-dir -r requirements.txt --system

# Application source. COPY (not ADD) is the recommended instruction for plain
# local files/directories; ADD's extra behaviours (URL fetch, tar extraction)
# are not wanted here.
COPY workspace /opt/workspace

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -2,17 +2,26 @@ version: "3.9"
services:
  # FastAPI service: accepts uploads and enqueues conversion jobs.
  api:
    build: .
    command: uvicorn main:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"
    depends_on:
      - redis
    volumes:
      # Shared with the worker: files are uploaded here by the API and read
      # by the worker, which runs in a separate container filesystem.
      - uploads:/opt/workspace/data
      - converted:/opt/workspace/converted
    networks:
      - app_network
  # RQ worker: consumes jobs from Redis and writes the converted Markdown.
  worker:
    build: .
    command: python3 worker.py
    depends_on:
      - redis
    volumes:
      - uploads:/opt/workspace/data
      - converted:/opt/workspace/converted
    networks:
      - app_network
  redis:
    image: redis:6
    ports:
      - "6379:6379"
    networks:
      - app_network
networks:
  app_network:
    driver: bridge
volumes:
  uploads:
  converted:

View File

@@ -0,0 +1,145 @@
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
annotated-types==0.7.0
anthropic==0.45.2
anyio==4.8.0
arrow==1.3.0
async-timeout==4.0.3
attrs==25.1.0
azure-ai-documentintelligence==1.0.0
azure-core==1.32.0
azure-identity==1.20.0
beautifulsoup4==4.13.3
blinker==1.9.0
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
cobble==0.1.4
cryptography==44.0.1
deepl==1.21.0
defusedxml==0.7.1
distro==1.9.0
dnspython==2.7.0
email_validator==2.2.0
et_xmlfile==2.0.0
exceptiongroup==1.2.2
fastapi==0.115.8
fastapi-cli==0.0.7
feedparser==6.0.11
Flask==3.1.0
frozenlist==1.5.0
googleapis-common-protos==1.66.0
greenlet==3.1.1
grpcio==1.70.0
h11==0.14.0
httpcore==1.0.7
httptools==0.6.4
httpx==0.28.1
idna==3.10
isodate==0.7.2
itsdangerous==2.2.0
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
kiwipiepy==0.20.3
kiwipiepy-model==0.20.0
langchain==0.3.18
langchain-core==0.3.34
langchain-teddynote==0.3.42
langchain-text-splitters==0.3.6
langgraph==0.2.71
langgraph-checkpoint==2.0.12
langgraph-sdk==0.1.51
langsmith==0.3.8
lxml==5.3.1
lz4==4.4.3
mammoth==1.9.0
markdown-it-py==3.0.0
markdownify==0.14.1
markitdown==0.0.1a4
MarkupSafe==3.0.2
mdurl==0.1.2
mmh3==4.1.0
msal==1.31.1
msal-extensions==1.2.0
msgpack==1.1.0
multidict==6.1.0
nltk==3.9.1
numpy==1.26.4
olefile==0.47
openai==1.61.1
openpyxl==3.1.5
orjson==3.10.15
packaging==24.2
pandas==2.2.3
pathvalidate==3.2.3
pdf2image==1.17.0
pdfminer.six==20240706
pillow==11.1.0
pinecone-client==5.0.1
pinecone-plugin-inference==1.1.0
pinecone-plugin-interface==0.0.7
pinecone-text==0.9.0
portalocker==2.10.1
propcache==0.2.1
protobuf==4.25.6
protoc-gen-openapiv2==0.0.1
puremagic==1.28
pycparser==2.22
pydantic==2.10.6
pydantic-extra-types==2.10.2
pydantic-settings==2.7.1
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.1
PyJWT==2.10.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.20
python-pptx==1.0.2
pytz==2025.1
PyYAML==6.0.2
rank-bm25==0.2.2
redis==5.2.1
Redis-Sentinel-Url==1.0.1
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rich==13.9.4
rich-toolkit==0.13.2
rq==2.1.0
rq-dashboard==0.8.2.2
sgmllib3k==1.0.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SpeechRecognition==3.14.1
SQLAlchemy==2.0.38
starlette==0.45.3
tavily-python==0.5.1
tenacity==9.0.0
tiktoken==0.8.0
tqdm==4.67.1
typer==0.15.1
types-python-dateutil==2.9.0.20241206
types-requests==2.32.0.20241016
typing_extensions==4.12.2
tzdata==2025.1
ujson==5.10.0
urllib3==2.3.0
uvicorn==0.34.0
uvloop==0.21.0
watchfiles==1.0.4
websockets==14.2
Werkzeug==3.1.3
wget==3.2
xlrd==2.0.1
XlsxWriter==3.2.2
yarl==1.18.3
youtube-transcript-api==0.6.3
zstandard==0.23.0

9
workspace/config.py Normal file
View File

@@ -0,0 +1,9 @@
import os
# Directory settings (overridable through environment variables).
UPLOAD_DIR = os.environ.get("UPLOAD_DIR", "data")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "converted")

# Redis connection settings; the port is parsed to an int at import time.
REDIS_HOST = os.environ.get("REDIS_HOST", "redis")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))

View File

@@ -5,48 +5,30 @@ from markitdown import MarkItDown
def convert_hwp_to_md(input_path: str, output_path: str):
    """Load an HWP document and write its text content to ``output_path``.

    Each item returned by ``HWPLoader.load()`` contributes its
    ``page_content`` attribute when present, otherwise its ``str()`` form.
    The pieces are joined with newlines and written as UTF-8.

    Args:
        input_path: Path of the HWP file to read.
        output_path: Path of the text/Markdown file to create or overwrite.

    Returns:
        None.
    """
    loader = HWPLoader(input_path)
    docs = loader.load()

    docs_as_text = []
    for doc in docs:
        try:
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
            # Round-trip through UTF-8 with errors ignored so characters that
            # cannot be encoded are dropped instead of failing the write.
            text = text.encode("utf-8", "ignore").decode("utf-8")
            docs_as_text.append(text)
        except Exception as e:
            # Best effort: report the item that failed and keep the rest.
            print(f"인코딩 변환 중 오류 발생: {e}")

    # Write the joined strings; passing the raw document list to write()
    # would raise TypeError.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(docs_as_text))
    return None
# Placeholder for plain-text conversion: reads and writes nothing, always
# returns None.
def convert_txt_to_md(input_path: str, output_path: str):
return None
# Placeholder for HTML conversion: reads and writes nothing, always returns
# None.
def convert_html_to_md(input_path: str, output_path: str):
return None
# Placeholder for DOCX conversion: reads and writes nothing, always returns
# None.
def convert_docx_to_md(input_path: str, output_path: str):
return None
def convert_to_md(input_path: str, output_path: str):
    """Convert a document to Markdown with MarkItDown and save the result.

    Args:
        input_path: Path of the source document handed to
            ``MarkItDown.convert``.
        output_path: Path of the Markdown file to create or overwrite
            (written as UTF-8).

    Returns:
        None.
    """
    import os  # local import: only needed for the endpoint override below

    # NOTE(review): the default is still the template placeholder from the
    # original code; set DOCINTEL_ENDPOINT to a real Azure Document
    # Intelligence endpoint for it to be usable.
    endpoint = os.getenv("DOCINTEL_ENDPOINT", "<document_intelligence_endpoint>")
    md = MarkItDown(docintel_endpoint=endpoint)
    result = md.convert(input_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)
    return None


# Backward-compatible name: this implementation was previously exposed as
# convert_pdf_to_md.
convert_pdf_to_md = convert_to_md
# Placeholder for PowerPoint conversion: reads and writes nothing, always
# returns None.
def convert_ppt_to_md(input_path: str, output_path: str):
return None
# Placeholder for Excel conversion: reads and writes nothing, always returns
# None.
def convert_excel_to_md(input_path: str, output_path: str):
return None
# Placeholder for CSV conversion: reads and writes nothing, always returns
# None.
def convert_csv_to_md(input_path: str, output_path: str):
return None
# Placeholder for JSON conversion: reads and writes nothing, always returns
# None.
def convert_json_to_md(input_path: str, output_path: str):
return None
# Placeholder for image conversion: reads and writes nothing, always returns
# None.
def convert_img_to_md(input_path: str, output_path: str):
return None

View File

@@ -1,66 +1,86 @@
import json
import os
import shutil
from pathlib import Path
from typing import List
import redis
from config import OUTPUT_DIR, UPLOAD_DIR
from fastapi import FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from redis_client import redis_client
from rq import Queue
from worker import process_task
# RQ job queue backed by the shared Redis connection from redis_client.py.
task_queue = Queue("task_queue1", connection=redis_client)

app = FastAPI()

# UPLOAD_DIR / OUTPUT_DIR come from config.py and redis_client from
# redis_client.py; they are intentionally NOT redefined here so the
# environment-driven settings are the single source of truth.

# CORS configuration.
# NOTE(review): a wildcard origin together with allow_credentials=True is
# very permissive; consider listing the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow every origin
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Make sure the upload and output directories exist before serving requests.
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
@app.post("/upload/")
async def upload_directory(files: List[UploadFile]):
    """Save the uploaded files into UPLOAD_DIR.

    Args:
        files: Files received in the multipart request.

    Returns:
        A dict with a status message and the list of stored file names.
    """
    uploaded_files = []
    for file in files:
        # Keep only the final path component so a crafted name such as
        # "../../etc/passwd" cannot write outside UPLOAD_DIR.
        safe_name = Path(file.filename or "").name
        if safe_name in ("", ".", ".."):
            continue  # nothing usable to store under
        file_path = Path(UPLOAD_DIR) / safe_name
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        uploaded_files.append(safe_name)
    return {"message": "파일 업로드 성공", "files": uploaded_files}
@app.get("/convert/")
async def convert_files():
    """Enqueue every supported file in UPLOAD_DIR on the RQ conversion queue.

    Files whose extension is one of txt, html, docx, pdf or hwp are turned
    into a task dict and enqueued exactly once via ``task_queue``; other
    files are ignored.

    Returns:
        A dict with a status message and, per enqueued file, the task
        description together with its RQ job id.
    """
    supported = {"txt", "html", "docx", "pdf", "hwp"}
    enqueued_tasks = []
    for file in os.listdir(UPLOAD_DIR):
        # Use the real suffix so an extension-less file that happens to be
        # named e.g. "pdf" is not mistaken for a PDF.
        file_ext = Path(file).suffix.lower().lstrip(".")
        if file_ext not in supported:
            continue
        task = {
            "filename": file,
            "extension": file_ext,
            "input_path": os.path.join(UPLOAD_DIR, file),
            "output_path": os.path.join(OUTPUT_DIR, f"{Path(file).stem}.md"),
        }
        # Register the job with RQ only; pushing the same task onto a raw
        # Redis list as well would queue it a second time where no RQ worker
        # reads it.
        job = task_queue.enqueue(process_task, task)
        enqueued_tasks.append({"task": task, "job_id": job.id})
    return {"message": "작업이 큐에 추가되었습니다.", "tasks": enqueued_tasks}
@app.get("/task/{job_id}")
async def get_task_status(job_id: str):
    """Report the state of the RQ job identified by ``job_id``.

    Returns an error dict when the queue has no such job; otherwise a dict
    with the job id, status, result and the enqueue/end timestamps rendered
    as strings (``ended_at`` is None while unset).
    """
    job = task_queue.fetch_job(job_id)
    if job:
        return {
            "job_id": job.id,
            "status": job.get_status(),
            "result": job.result,
            "enqueued_at": str(job.enqueued_at),
            "ended_at": None if not job.ended_at else str(job.ended_at),
        }
    return {"error": "존재하지 않는 작업입니다."}
@app.get("/download/{filename}")
async def download_file(filename: str):
"""변환된 Markdown 파일을 다운로드할 수 있도록 제공"""
"""변환된 Markdown 파일을 다운로드합니다."""
file_path = Path(OUTPUT_DIR) / filename
if not file_path.exists():
return {"error": "파일이 존재하지 않습니다."}

View File

@@ -0,0 +1,6 @@
import redis
from config import REDIS_HOST, REDIS_PORT
# Shared Redis connection built from the host/port in config.py.
# decode_responses=False keeps replies as raw bytes.
redis_client = redis.StrictRedis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    decode_responses=False,
)

View File

@@ -1,56 +1,38 @@
import json
import time
import logging
import convert_obj_to_md
import redis
from redis_client import redis_client
from rq import Worker
# The Redis connection is the shared one imported from redis_client.py; it is
# intentionally not re-created here with a hard-coded localhost address.

# Configure logging once at import time and expose a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def process_task(task):
    """Run the converter that matches the task's file extension.

    Args:
        task: Dict with the keys ``extension``, ``input_path`` and
            ``output_path`` (read with ``dict.get``).

    Returns:
        ``{"status": "success", "output_path": ...}`` when the conversion
        finishes, or ``{"status": "error", "message": ...}`` when any
        exception is raised. Exceptions are logged and not propagated.
    """
    file_ext = task.get("extension")
    input_path = task.get("input_path")
    output_path = task.get("output_path")

    logger.info("작업 처리 중: %s", task)
    try:
        # HWP needs its dedicated loader; every other format goes through
        # the generic MarkItDown-based converter.
        if file_ext == "hwp":
            convert_obj_to_md.convert_hwp_to_md(input_path, output_path)
        else:
            convert_obj_to_md.convert_to_md(input_path, output_path)
        logger.info("변환 완료: %s", task)
        return {"status": "success", "output_path": output_path}
    except Exception as e:
        # NOTE(review): the error is returned rather than raised, so the
        # caller only sees it in the returned dict.
        logger.exception("작업 처리 중 에러: %s", e)
        return {"status": "error", "message": str(e)}


if __name__ == "__main__":
    # Start an RQ worker on the queue the API enqueues to; RQ performs the
    # polling, so no hand-written pop/sleep loop is needed.
    listen = ["task_queue1"]
    worker = Worker(listen, connection=redis_client)
    worker.work()