diff --git a/Dockerfile b/Dockerfile index e69de29..91ce622 100644 --- a/Dockerfile +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.11 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Seoul + +ADD /workspace /opt/workspace +WORKDIR /opt/workspace + +ADD https://astral.sh/uv/install.sh /uv-installer.sh +RUN sh /uv-installer.sh && rm /uv-installer.sh +ENV PATH="/root/.local/bin/:$PATH" +RUN uv self update + +COPY requirements.txt . +RUN uv pip install --no-cache-dir -r requirements.txt --system + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] + + diff --git a/docker-compose.yml b/docker-compose.yml index ec0df31..a214911 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,17 +2,26 @@ version: "3.9" services: api: build: . - command: uvicorn app:app --host 0.0.0.0 --port 8000 + command: uvicorn main:app --host 0.0.0.0 --port 8000 ports: - "8000:8000" depends_on: - redis + networks: + - app_network worker: build: . - command: python worker.py + command: python3 worker.py depends_on: - redis + networks: + - app_network redis: image: redis:6 ports: - "6379:6379" + networks: + - app_network +networks: + app_network: + driver: bridge \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29..c694ea0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,145 @@ +aiohappyeyeballs==2.4.6 +aiohttp==3.11.12 +aiosignal==1.3.2 +annotated-types==0.7.0 +anthropic==0.45.2 +anyio==4.8.0 +arrow==1.3.0 +async-timeout==4.0.3 +attrs==25.1.0 +azure-ai-documentintelligence==1.0.0 +azure-core==1.32.0 +azure-identity==1.20.0 +beautifulsoup4==4.13.3 +blinker==1.9.0 +certifi==2025.1.31 +cffi==1.17.1 +charset-normalizer==3.4.1 +click==8.1.8 +cobble==0.1.4 +cryptography==44.0.1 +deepl==1.21.0 +defusedxml==0.7.1 +distro==1.9.0 +dnspython==2.7.0 +email_validator==2.2.0 +et_xmlfile==2.0.0 +exceptiongroup==1.2.2 +fastapi==0.115.8 +fastapi-cli==0.0.7 +feedparser==6.0.11 +Flask==3.1.0 +frozenlist==1.5.0 
+googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.70.0 +h11==0.14.0 +httpcore==1.0.7 +httptools==0.6.4 +httpx==0.28.1 +idna==3.10 +isodate==0.7.2 +itsdangerous==2.2.0 +Jinja2==3.1.5 +jiter==0.8.2 +joblib==1.4.2 +jsonpatch==1.33 +jsonpointer==3.0.0 +kiwipiepy==0.20.3 +kiwipiepy-model==0.20.0 +langchain==0.3.18 +langchain-core==0.3.34 +langchain-teddynote==0.3.42 +langchain-text-splitters==0.3.6 +langgraph==0.2.71 +langgraph-checkpoint==2.0.12 +langgraph-sdk==0.1.51 +langsmith==0.3.8 +lxml==5.3.1 +lz4==4.4.3 +mammoth==1.9.0 +markdown-it-py==3.0.0 +markdownify==0.14.1 +markitdown==0.0.1a4 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mmh3==4.1.0 +msal==1.31.1 +msal-extensions==1.2.0 +msgpack==1.1.0 +multidict==6.1.0 +nltk==3.9.1 +numpy==1.26.4 +olefile==0.47 +openai==1.61.1 +openpyxl==3.1.5 +orjson==3.10.15 +packaging==24.2 +pandas==2.2.3 +pathvalidate==3.2.3 +pdf2image==1.17.0 +pdfminer.six==20240706 +pillow==11.1.0 +pinecone-client==5.0.1 +pinecone-plugin-inference==1.1.0 +pinecone-plugin-interface==0.0.7 +pinecone-text==0.9.0 +portalocker==2.10.1 +propcache==0.2.1 +protobuf==4.25.6 +protoc-gen-openapiv2==0.0.1 +puremagic==1.28 +pycparser==2.22 +pydantic==2.10.6 +pydantic-extra-types==2.10.2 +pydantic-settings==2.7.1 +pydantic_core==2.27.2 +pydub==0.25.1 +Pygments==2.19.1 +PyJWT==2.10.1 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-multipart==0.0.20 +python-pptx==1.0.2 +pytz==2025.1 +PyYAML==6.0.2 +rank-bm25==0.2.2 +redis==5.2.1 +Redis-Sentinel-Url==1.0.1 +regex==2024.11.6 +requests==2.32.3 +requests-toolbelt==1.0.0 +rich==13.9.4 +rich-toolkit==0.13.2 +rq==2.1.0 +rq-dashboard==0.8.2.2 +sgmllib3k==1.0.0 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.6 +SpeechRecognition==3.14.1 +SQLAlchemy==2.0.38 +starlette==0.45.3 +tavily-python==0.5.1 +tenacity==9.0.0 +tiktoken==0.8.0 +tqdm==4.67.1 +typer==0.15.1 +types-python-dateutil==2.9.0.20241206 +types-requests==2.32.0.20241016 +typing_extensions==4.12.2 +tzdata==2025.1 +ujson==5.10.0 
+urllib3==2.3.0 +uvicorn==0.34.0 +uvloop==0.21.0 +watchfiles==1.0.4 +websockets==14.2 +Werkzeug==3.1.3 +wget==3.2 +xlrd==2.0.1 +XlsxWriter==3.2.2 +yarl==1.18.3 +youtube-transcript-api==0.6.3 +zstandard==0.23.0 diff --git a/workspace/config.py b/workspace/config.py new file mode 100644 index 0000000..1843f0d --- /dev/null +++ b/workspace/config.py @@ -0,0 +1,9 @@ +import os + +# 디렉토리 설정 +UPLOAD_DIR = os.getenv("UPLOAD_DIR", "data") +OUTPUT_DIR = os.getenv("OUTPUT_DIR", "converted") + +# Redis 연결 정보 +REDIS_HOST = os.getenv("REDIS_HOST", "redis") +REDIS_PORT = int(os.getenv("REDIS_PORT", "6379")) diff --git a/workspace/convert_obj_to_md.py b/workspace/convert_obj_to_md.py index 41db311..910cb98 100644 --- a/workspace/convert_obj_to_md.py +++ b/workspace/convert_obj_to_md.py @@ -5,48 +5,30 @@ from markitdown import MarkItDown def convert_hwp_to_md(input_path: str, output_path: str): loader = HWPLoader(input_path) docs = loader.load() + # Document 객체 리스트를 문자열 리스트로 변환 + docs_as_text = [ + doc.page_content if hasattr(doc, "page_content") else str(doc) for doc in docs + ] + docs_as_text = [] + for doc in docs: + try: + text = doc.page_content if hasattr(doc, "page_content") else str(doc) + text = text.encode("utf-8", "ignore").decode( + "utf-8" + ) # UTF-8로 변환하면서 깨진 문자 제거 + docs_as_text.append(text) + except Exception as e: + print(f"인코딩 변환 중 오류 발생: {e}") - with open(output_path, "w", encoding="UTF-8") as f: - f.write(docs) + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(docs_as_text)) # ✅ 변환된 리스트를 파일에 저장 return None -def convert_txt_to_md(input_path: str, output_path: str): - return None - - -def convert_html_to_md(input_path: str, output_path: str): - return None - - -def convert_docx_to_md(input_path: str, output_path: str): - return None - - -def convert_pdf_to_md(input_path: str, output_path: str): +def convert_to_md(input_path: str, output_path: str): md = MarkItDown(docintel_endpoint="") result = md.convert(input_path) with 
open(output_path, "w", encoding="utf-8") as f: f.write(result.text_content) return None - - -def convert_ppt_to_md(input_path: str, output_path: str): - return None - - -def convert_excel_to_md(input_path: str, output_path: str): - return None - - -def convert_csv_to_md(input_path: str, output_path: str): - return None - - -def convert_json_to_md(input_path: str, output_path: str): - return None - - -def convert_img_to_md(input_path: str, output_path: str): - return None diff --git a/workspace/main.py b/workspace/main.py index ffedb7d..aedcce8 100644 --- a/workspace/main.py +++ b/workspace/main.py @@ -1,66 +1,86 @@ -import json import os import shutil from pathlib import Path from typing import List -import redis +from config import OUTPUT_DIR, UPLOAD_DIR from fastapi import FastAPI, UploadFile +from fastapi.middleware.cors import CORSMiddleware +from redis_client import redis_client +from rq import Queue +from worker import process_task + +# RQ 작업 큐 생성 +task_queue = Queue("task_queue1", connection=redis_client) app = FastAPI() -UPLOAD_DIR = "data" -OUTPUT_DIR = "converted" +# CORS 설정 +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # 모든 오리진 허용 + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +# 업로드 및 출력 디렉토리 생성 os.makedirs(UPLOAD_DIR, exist_ok=True) os.makedirs(OUTPUT_DIR, exist_ok=True) -redis_client = redis.StrictRedis(host="localhost", port=6379, decode_responses=True) - @app.post("/upload/") async def upload_directory(files: List[UploadFile]): - """사용자가 여러 개의 파일을 업로드하면 UPLOAD_DIR에 저장""" + """사용자가 업로드한 파일들을 UPLOAD_DIR에 저장""" uploaded_files = [] - for file in files: file_path = Path(UPLOAD_DIR) / file.filename with open(file_path, "wb") as buffer: shutil.copyfileobj(file.file, buffer) uploaded_files.append(file.filename) - return {"message": "파일 업로드 성공", "files": uploaded_files} @app.get("/convert/") async def convert_files(): - """ - 업로드된 파일들을 지원하는 확장자(txt, html, docx, pdf)만 Redis 작업 큐에 태스크로 등록. - 워커에서 배치 잡으로 처리합니다. 
- """ + """업로드된 파일을 변환 큐(RQ)에 등록하고 Job ID 반환""" files = os.listdir(UPLOAD_DIR) enqueued_tasks = [] for file in files: file_ext = file.split(".")[-1].lower() - # 지원하는 파일 형식만 큐에 넣기 - if file_ext in ["txt", "html", "docx", "pdf"]: + if file_ext in ["txt", "html", "docx", "pdf", "hwp"]: task = { "filename": file, "extension": file_ext, "input_path": os.path.join(UPLOAD_DIR, file), "output_path": os.path.join(OUTPUT_DIR, f"{Path(file).stem}.md"), } - # 태스크를 JSON 문자열로 변환하여 큐에 등록 - redis_client.lpush("task_queue", json.dumps(task)) - enqueued_tasks.append(task) + job = task_queue.enqueue(process_task, task) # RQ에 작업 등록 + enqueued_tasks.append({"task": task, "job_id": job.id}) return {"message": "작업이 큐에 추가되었습니다.", "tasks": enqueued_tasks} +@app.get("/task/{job_id}") +async def get_task_status(job_id: str): + """RQ에서 특정 Job ID의 상태 확인""" + job = task_queue.fetch_job(job_id) + if not job: + return {"error": "존재하지 않는 작업입니다."} + + return { + "job_id": job.id, + "status": job.get_status(), + "result": job.result, + "enqueued_at": str(job.enqueued_at), + "ended_at": str(job.ended_at) if job.ended_at else None, + } + + @app.get("/download/{filename}") async def download_file(filename: str): - """변환된 Markdown 파일을 다운로드할 수 있도록 제공""" + """변환된 Markdown 파일을 다운로드합니다.""" file_path = Path(OUTPUT_DIR) / filename if not file_path.exists(): return {"error": "파일이 존재하지 않습니다."} diff --git a/workspace/redis_client.py b/workspace/redis_client.py new file mode 100644 index 0000000..5e47a5e --- /dev/null +++ b/workspace/redis_client.py @@ -0,0 +1,6 @@ +import redis +from config import REDIS_HOST, REDIS_PORT + +redis_client = redis.StrictRedis( + host=REDIS_HOST, port=REDIS_PORT, decode_responses=False +) diff --git a/workspace/worker.py b/workspace/worker.py index 402d2da..abdf3d0 100644 --- a/workspace/worker.py +++ b/workspace/worker.py @@ -1,56 +1,38 @@ -import json -import time +import logging import convert_obj_to_md -import redis +from redis_client import redis_client +from rq import Worker 
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def process_task(task):
    """Convert one file to Markdown, choosing the converter by extension.

    ``hwp`` files go through ``convert_obj_to_md.convert_hwp_to_md``; every
    other extension goes through ``convert_obj_to_md.convert_to_md``.

    Args:
        task: Mapping with the keys ``extension``, ``input_path`` and
            ``output_path``.

    Returns:
        ``{"status": "success", "output_path": ...}`` when the conversion
        completes, otherwise ``{"status": "error", "message": ...}``.
        Conversion errors are logged and reported through the returned
        dict instead of being raised.
    """
    # Local import so this block does not depend on a change to the
    # module's import header.
    from pathlib import Path

    file_ext = task.get("extension")
    input_path = task.get("input_path")
    output_path = task.get("output_path")

    logger.info("작업 처리 중: %s", task)

    # Reject incomplete tasks up front with a clear message rather than
    # letting a None path surface as an opaque TypeError further down.
    if not input_path or not output_path:
        message = "input_path 또는 output_path가 없습니다."
        logger.error("작업 처리 중 에러: %s", message)
        return {"status": "error", "message": message}

    try:
        # The API process creates the output directory at import time, but
        # the worker runs as its own process and must not rely on that.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        if file_ext == "hwp":
            convert_obj_to_md.convert_hwp_to_md(input_path, output_path)
        else:
            convert_obj_to_md.convert_to_md(input_path, output_path)
    except Exception as e:
        # NOTE(review): the exception is swallowed on purpose so the caller
        # gets an error payload; RQ will presumably record such a job as
        # finished rather than failed - confirm that is what clients expect.
        logger.exception("작업 처리 중 에러: %s", e)
        return {"status": "error", "message": str(e)}

    logger.info("변환 완료: %s", task)
    return {"status": "success", "output_path": output_path}


if __name__ == "__main__":
    listen = ["task_queue1"]

    # Newer RQ usage: hand the connection to the Worker explicitly.
    worker = Worker(listen, connection=redis_client)
    worker.work()