도커라이징

This commit is contained in:
2025-02-13 16:37:45 +09:00
parent ad26488b1b
commit 0d34c20882
8 changed files with 267 additions and 95 deletions

View File

@@ -0,0 +1,19 @@
# Image used for both the API and the worker service; docker-compose
# overrides the command per service, CMD below is the default (API).
FROM python:3.11

ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul

WORKDIR /opt/workspace

# Install uv with the standalone installer and put it on PATH.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN sh /uv-installer.sh && rm /uv-installer.sh
ENV PATH="/root/.local/bin/:$PATH"
RUN uv self update

# Install dependencies BEFORE copying the application source, so that a
# source-only change reuses the cached dependency layer instead of
# reinstalling every package.
COPY requirements.txt .
RUN uv pip install --no-cache-dir -r requirements.txt --system

# Application source goes last. COPY (not ADD) is the right instruction for
# plain local paths; ADD is only needed for URLs / archive extraction.
# NOTE(review): assumes workspace/ does not ship its own requirements.txt.
COPY workspace/ /opt/workspace/

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -2,17 +2,26 @@ version: "3.9"
services:
  api:
    build: .
    command: uvicorn main:app --host 0.0.0.0 --port 8000
    ports:
      - "8000:8000"
    depends_on:
      - redis
    networks:
      - app_network
    # The API writes uploads to data/ and serves results from converted/.
    # Both directories must be shared with the worker, which runs in a
    # separate container with its own filesystem.
    volumes:
      - uploads:/opt/workspace/data
      - converted:/opt/workspace/converted

  worker:
    build: .
    command: python3 worker.py
    depends_on:
      - redis
    networks:
      - app_network
    # Same mounts as the API: the worker reads the uploaded inputs and
    # writes the converted Markdown files.
    volumes:
      - uploads:/opt/workspace/data
      - converted:/opt/workspace/converted

  redis:
    image: redis:6
    ports:
      - "6379:6379"
    networks:
      - app_network

networks:
  app_network:
    driver: bridge

# Named volumes backing the default UPLOAD_DIR ("data") and OUTPUT_DIR
# ("converted") under the image's WORKDIR (/opt/workspace). If those
# directories are overridden via environment variables, adjust the mount
# targets above accordingly.
volumes:
  uploads:
  converted:

View File

@@ -0,0 +1,145 @@
aiohappyeyeballs==2.4.6
aiohttp==3.11.12
aiosignal==1.3.2
annotated-types==0.7.0
anthropic==0.45.2
anyio==4.8.0
arrow==1.3.0
async-timeout==4.0.3
attrs==25.1.0
azure-ai-documentintelligence==1.0.0
azure-core==1.32.0
azure-identity==1.20.0
beautifulsoup4==4.13.3
blinker==1.9.0
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
click==8.1.8
cobble==0.1.4
cryptography==44.0.1
deepl==1.21.0
defusedxml==0.7.1
distro==1.9.0
dnspython==2.7.0
email_validator==2.2.0
et_xmlfile==2.0.0
exceptiongroup==1.2.2
fastapi==0.115.8
fastapi-cli==0.0.7
feedparser==6.0.11
Flask==3.1.0
frozenlist==1.5.0
googleapis-common-protos==1.66.0
greenlet==3.1.1
grpcio==1.70.0
h11==0.14.0
httpcore==1.0.7
httptools==0.6.4
httpx==0.28.1
idna==3.10
isodate==0.7.2
itsdangerous==2.2.0
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
kiwipiepy==0.20.3
kiwipiepy-model==0.20.0
langchain==0.3.18
langchain-core==0.3.34
langchain-teddynote==0.3.42
langchain-text-splitters==0.3.6
langgraph==0.2.71
langgraph-checkpoint==2.0.12
langgraph-sdk==0.1.51
langsmith==0.3.8
lxml==5.3.1
lz4==4.4.3
mammoth==1.9.0
markdown-it-py==3.0.0
markdownify==0.14.1
markitdown==0.0.1a4
MarkupSafe==3.0.2
mdurl==0.1.2
mmh3==4.1.0
msal==1.31.1
msal-extensions==1.2.0
msgpack==1.1.0
multidict==6.1.0
nltk==3.9.1
numpy==1.26.4
olefile==0.47
openai==1.61.1
openpyxl==3.1.5
orjson==3.10.15
packaging==24.2
pandas==2.2.3
pathvalidate==3.2.3
pdf2image==1.17.0
pdfminer.six==20240706
pillow==11.1.0
pinecone-client==5.0.1
pinecone-plugin-inference==1.1.0
pinecone-plugin-interface==0.0.7
pinecone-text==0.9.0
portalocker==2.10.1
propcache==0.2.1
protobuf==4.25.6
protoc-gen-openapiv2==0.0.1
puremagic==1.28
pycparser==2.22
pydantic==2.10.6
pydantic-extra-types==2.10.2
pydantic-settings==2.7.1
pydantic_core==2.27.2
pydub==0.25.1
Pygments==2.19.1
PyJWT==2.10.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.20
python-pptx==1.0.2
pytz==2025.1
PyYAML==6.0.2
rank-bm25==0.2.2
redis==5.2.1
Redis-Sentinel-Url==1.0.1
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rich==13.9.4
rich-toolkit==0.13.2
rq==2.1.0
rq-dashboard==0.8.2.2
sgmllib3k==1.0.0
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SpeechRecognition==3.14.1
SQLAlchemy==2.0.38
starlette==0.45.3
tavily-python==0.5.1
tenacity==9.0.0
tiktoken==0.8.0
tqdm==4.67.1
typer==0.15.1
types-python-dateutil==2.9.0.20241206
types-requests==2.32.0.20241016
typing_extensions==4.12.2
tzdata==2025.1
ujson==5.10.0
urllib3==2.3.0
uvicorn==0.34.0
uvloop==0.21.0
watchfiles==1.0.4
websockets==14.2
Werkzeug==3.1.3
wget==3.2
xlrd==2.0.1
XlsxWriter==3.2.2
yarl==1.18.3
youtube-transcript-api==0.6.3
zstandard==0.23.0

9
workspace/config.py Normal file
View File

@@ -0,0 +1,9 @@
import os

# Directories for uploaded inputs and converted outputs. Each value can be
# overridden through the environment variable of the same name.
UPLOAD_DIR = os.environ.get("UPLOAD_DIR", "data")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "converted")

# Redis connection settings. The port arrives as text from the environment
# and is converted to an int here.
REDIS_HOST = os.environ.get("REDIS_HOST", "redis")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))

View File

@@ -5,48 +5,30 @@ from markitdown import MarkItDown
def convert_hwp_to_md(input_path: str, output_path: str):
    """Extract the text of an HWP file and write it to ``output_path``.

    The file is loaded with ``HWPLoader``; the text of every loaded document
    is joined with newlines and written as UTF-8.

    Args:
        input_path: Path of the HWP file to read.
        output_path: Path of the file to create or overwrite.

    Returns:
        None.
    """
    docs = HWPLoader(input_path).load()

    docs_as_text = []
    for doc in docs:
        text = doc.page_content if hasattr(doc, "page_content") else str(doc)
        if not isinstance(text, str):
            # The only way the UTF-8 round-trip below can fail is a
            # non-string payload, so check for that explicitly instead of
            # hiding it behind a blanket ``except Exception``. The document
            # is still skipped (best effort), as before.
            print(
                "인코딩 변환 중 오류 발생: "
                f"page_content is {type(text).__name__}, expected str"
            )
            continue
        # Round-trip through UTF-8 and drop anything that cannot be encoded
        # (e.g. lone surrogates) so the final write cannot fail on it.
        docs_as_text.append(text.encode("utf-8", "ignore").decode("utf-8"))

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(docs_as_text))
    return None
def convert_to_md(input_path: str, output_path: str):
    """Convert a document to Markdown with MarkItDown and save the result.

    Args:
        input_path: Path of the source document.
        output_path: Path of the Markdown file to create or overwrite (UTF-8).

    Returns:
        None.
    """
    import os  # local import: this module's import header is not visible here

    # The endpoint used to be the literal placeholder
    # "<document_intelligence_endpoint>", which is not a usable URL. Read the
    # real endpoint from the environment and only enable Document
    # Intelligence when one is actually configured.
    endpoint = os.getenv("DOCINTEL_ENDPOINT")
    md = MarkItDown(docintel_endpoint=endpoint) if endpoint else MarkItDown()

    result = md.convert(input_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)
    return None

View File

@@ -1,66 +1,86 @@
import json
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
from typing import List from typing import List
import redis from config import OUTPUT_DIR, UPLOAD_DIR
from fastapi import FastAPI, UploadFile from fastapi import FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from redis_client import redis_client
from rq import Queue
from worker import process_task
# RQ queue that conversion jobs are pushed onto; the worker listens on the
# same queue name.
task_queue = Queue("task_queue1", connection=redis_client)

app = FastAPI()

# CORS: every origin, method and header is allowed. Credentials are NOT
# allowed together with the wildcard origin: the combination would let any
# site issue credentialed requests, and the CORS rules require explicit
# origins when credentials are enabled. To support cookies/auth later,
# replace "*" with an explicit origin list and re-enable credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Make sure the upload and output directories exist before serving requests.
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
@app.post("/upload/")
async def upload_directory(files: List[UploadFile]):
    """Save the uploaded files into ``UPLOAD_DIR``.

    Args:
        files: Files sent by the client in the multipart request.

    Returns:
        A dict with a status message and the list of stored file names.
    """
    uploaded_files = []
    for file in files:
        # ``filename`` is client-controlled. Keep only its final path
        # component so values such as "../../main.py" or "/etc/passwd"
        # cannot escape UPLOAD_DIR.
        safe_name = Path(file.filename or "").name
        if safe_name in ("", ".", ".."):
            # Nothing usable as a file name; skip instead of failing on open().
            continue
        file_path = Path(UPLOAD_DIR) / safe_name
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        uploaded_files.append(safe_name)
    return {"message": "파일 업로드 성공", "files": uploaded_files}
@app.get("/convert/")
async def convert_files():
    """Enqueue an RQ conversion job for every supported file in ``UPLOAD_DIR``.

    Returns:
        A dict with a status message and, for each enqueued file, the task
        description together with the id of the RQ job created for it.
    """
    supported = {"txt", "html", "docx", "pdf", "hwp"}
    enqueued_tasks = []
    # Sorted so the response order does not depend on directory listing order.
    for file in sorted(os.listdir(UPLOAD_DIR)):
        # Use the real suffix: ``split(".")[-1]`` would treat an
        # extensionless file literally named "pdf" as a PDF.
        file_ext = Path(file).suffix.lstrip(".").lower()
        if file_ext not in supported:
            continue
        task = {
            "filename": file,
            "extension": file_ext,
            "input_path": os.path.join(UPLOAD_DIR, file),
            "output_path": os.path.join(OUTPUT_DIR, f"{Path(file).stem}.md"),
        }
        job = task_queue.enqueue(process_task, task)
        enqueued_tasks.append({"task": task, "job_id": job.id})
    return {"message": "작업이 큐에 추가되었습니다.", "tasks": enqueued_tasks}
@app.get("/task/{job_id}")
async def get_task_status(job_id: str):
    """Look up an RQ job by id and report its current state.

    Args:
        job_id: Id of the job, as returned when it was enqueued.

    Returns:
        A dict with ``job_id``, ``status``, ``result``, ``enqueued_at`` and
        ``ended_at`` (``None`` while the job has no end time), or a dict
        with a single ``error`` key when no such job is found.
    """
    job = task_queue.fetch_job(job_id)
    if job:
        payload = {
            "job_id": job.id,
            "status": job.get_status(),
            "result": job.result,
        }
        payload["enqueued_at"] = str(job.enqueued_at)
        ended = job.ended_at
        payload["ended_at"] = str(ended) if ended else None
        return payload
    return {"error": "존재하지 않는 작업입니다."}
@app.get("/download/{filename}") @app.get("/download/{filename}")
async def download_file(filename: str): async def download_file(filename: str):
"""변환된 Markdown 파일을 다운로드할 수 있도록 제공""" """변환된 Markdown 파일을 다운로드합니다."""
file_path = Path(OUTPUT_DIR) / filename file_path = Path(OUTPUT_DIR) / filename
if not file_path.exists(): if not file_path.exists():
return {"error": "파일이 존재하지 않습니다."} return {"error": "파일이 존재하지 않습니다."}

View File

@@ -0,0 +1,6 @@
import redis
from config import REDIS_HOST, REDIS_PORT
# Redis connection built from the host/port defined in config.
# Responses are left as raw bytes (decode_responses=False).
# NOTE(review): presumably required by RQ, which receives this client as its
# connection elsewhere in the project - confirm before changing it.
redis_client = redis.StrictRedis(
    host=REDIS_HOST,
    port=REDIS_PORT,
    decode_responses=False,
)

View File

@@ -1,56 +1,38 @@
import json import logging
import time
import convert_obj_to_md import convert_obj_to_md
import redis from redis_client import redis_client
from rq import Worker
UPLOAD_DIR = "data" logging.basicConfig(level=logging.INFO)
OUTPUT_DIR = "converted" logger = logging.getLogger(__name__)
redis_client = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
def process_task(task):
    """Convert one file with the converter that matches its extension.

    Args:
        task: Mapping with the keys ``extension``, ``input_path`` and
            ``output_path``.

    Returns:
        ``{"status": "success", "output_path": ...}`` when the conversion
        completed, or ``{"status": "error", "message": ...}`` when it raised.
        Exceptions raised during conversion are logged and reported through
        the return value instead of propagating.
    """
    extension = task.get("extension")
    source = task.get("input_path")
    destination = task.get("output_path")
    logger.info(f"작업 처리 중: {task}")

    # NOTE(review): errors are returned rather than raised, so the caller
    # only sees a failure by inspecting the returned dict - confirm intended.
    try:
        # HWP files have a dedicated converter; every other extension goes
        # through the generic one.
        convert = (
            convert_obj_to_md.convert_hwp_to_md
            if extension == "hwp"
            else convert_obj_to_md.convert_to_md
        )
        convert(source, destination)
        logger.info(f"변환 완료: {task}")
        return {"status": "success", "output_path": destination}
    except Exception as e:
        logger.exception(f"작업 처리 중 에러: {e}")
        return {"status": "error", "message": str(e)}
if __name__ == "__main__":
    # Run an RQ worker on the queue the API enqueues jobs to; work() keeps
    # processing jobs until the worker is stopped.
    queue_names = ["task_queue1"]
    Worker(queue_names, connection=redis_client).work()