Skeleton Code commit

This commit is contained in:
2025-02-12 17:46:01 +09:00
parent 63bd561f6d
commit ad26488b1b
7 changed files with 246 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
from langchain_teddynote.document_loaders import HWPLoader
from markitdown import MarkItDown
def convert_hwp_to_md(input_path: str, output_path: str):
    """Extract the text of an HWP document and write it to a Markdown file.

    Args:
        input_path: Path of the HWP file to read.
        output_path: Path of the Markdown file to create or overwrite.

    Returns:
        None.
    """
    loader = HWPLoader(input_path)
    docs = loader.load()
    # ``load()`` returns a list of document objects, not a string, so the
    # text of each document is joined before writing. Passing the list
    # itself to ``f.write`` raises ``TypeError``.
    text = "\n\n".join(doc.page_content for doc in docs)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)
    return None
def convert_txt_to_md(input_path: str, output_path: str):
    """Placeholder for plain-text to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_html_to_md(input_path: str, output_path: str):
    """Placeholder for HTML to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_docx_to_md(input_path: str, output_path: str):
    """Placeholder for DOCX to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_pdf_to_md(input_path: str, output_path: str):
    """Convert a PDF to Markdown with MarkItDown and write the result.

    The Azure Document Intelligence endpoint is read from the
    ``DOCINTEL_ENDPOINT`` environment variable. When the variable is unset
    or empty, MarkItDown is constructed without an endpoint.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path of the Markdown file to create or overwrite.

    Returns:
        None.
    """
    import os

    # The previous hard-coded "<document_intelligence_endpoint>" value was a
    # documentation placeholder, not a usable URL.
    endpoint = os.environ.get("DOCINTEL_ENDPOINT")
    md = MarkItDown(docintel_endpoint=endpoint) if endpoint else MarkItDown()
    result = md.convert(input_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)
    return None
def convert_ppt_to_md(input_path: str, output_path: str):
    """Placeholder for presentation to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_excel_to_md(input_path: str, output_path: str):
    """Placeholder for spreadsheet to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_csv_to_md(input_path: str, output_path: str):
    """Placeholder for CSV to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_json_to_md(input_path: str, output_path: str):
    """Placeholder for JSON to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """
def convert_img_to_md(input_path: str, output_path: str):
    """Placeholder for image to Markdown conversion (not implemented).

    Accepts the source and destination paths, performs no work, and
    returns ``None``.
    """

67
workspace/main.py Normal file
View File

@@ -0,0 +1,67 @@
import json
import os
import shutil
from pathlib import Path
from typing import List
import redis
from fastapi import FastAPI, UploadFile
# FastAPI application that the route decorators in this module attach to.
app = FastAPI()
# Directory where uploaded source files are stored.
UPLOAD_DIR = "data"
# Directory that converted Markdown files are read from and targeted at.
OUTPUT_DIR = "converted"
# Create both directories at import time; exist_ok=True makes this a no-op
# when they already exist.
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Redis client used for the task queue; decode_responses=True makes replies
# come back as str rather than bytes.
redis_client = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
@app.post("/upload/")
async def upload_directory(files: List[UploadFile]):
    """Save each uploaded file into UPLOAD_DIR.

    Only the final path component of each client-supplied file name is
    used, so every file is stored directly inside UPLOAD_DIR. Uploads with
    no usable file name are skipped.

    Args:
        files: The uploaded files from the multipart request.

    Returns:
        A dict with a success message and the list of stored file names.
    """
    uploaded_files = []
    for file in files:
        # ``filename`` is controlled by the client. Keeping only its last
        # component stops values such as "../../x" or "/etc/x" from
        # escaping UPLOAD_DIR. Backslashes are normalised first so that
        # Windows-style paths are reduced the same way.
        raw_name = (file.filename or "").replace("\\", "/")
        safe_name = Path(raw_name).name
        if safe_name in ("", ".", ".."):
            continue
        file_path = Path(UPLOAD_DIR) / safe_name
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        uploaded_files.append(safe_name)
    return {"message": "파일 업로드 성공", "files": uploaded_files}
@app.get("/convert/")
async def convert_files():
    """Queue every supported file in UPLOAD_DIR for conversion.

    Files whose extension is txt, html, docx or pdf are described as a task
    (file name, extension, input path, output path), serialised to JSON and
    pushed onto the ``task_queue`` Redis list for a worker to process.

    Returns:
        A dict with a status message and the list of enqueued tasks.
    """
    supported_extensions = {"txt", "html", "docx", "pdf"}
    enqueued_tasks = []
    for file in os.listdir(UPLOAD_DIR):
        input_path = os.path.join(UPLOAD_DIR, file)
        # os.listdir also returns sub-directories; only regular files can
        # be converted.
        if not os.path.isfile(input_path):
            continue
        # Path.suffix is empty for a name without a dot, so a file that is
        # literally called "pdf" is not mistaken for a PDF (splitting on
        # "." would return the whole name as the extension).
        file_ext = Path(file).suffix.lstrip(".").lower()
        if file_ext not in supported_extensions:
            continue
        task = {
            "filename": file,
            "extension": file_ext,
            "input_path": input_path,
            "output_path": os.path.join(OUTPUT_DIR, f"{Path(file).stem}.md"),
        }
        # The task is stored in the queue as a JSON string.
        redis_client.lpush("task_queue", json.dumps(task))
        enqueued_tasks.append(task)
    return {"message": "작업이 큐에 추가되었습니다.", "tasks": enqueued_tasks}
@app.get("/download/{filename}")
async def download_file(filename: str):
    """Return the text of a converted Markdown file from OUTPUT_DIR.

    Args:
        filename: Bare name of the file inside OUTPUT_DIR.

    Returns:
        The file content decoded as UTF-8, or an error dict when the name
        is not a plain file name or no such file exists.
    """
    not_found = {"error": "파일이 존재하지 않습니다."}
    # Refuse anything that is not a bare file name so the lookup cannot
    # leave OUTPUT_DIR.
    if filename in ("", ".", "..") or "/" in filename or "\\" in filename:
        return not_found
    file_path = Path(OUTPUT_DIR) / filename
    # is_file() rather than exists(): a directory exists but cannot be read
    # as text.
    if not file_path.is_file():
        return not_found
    return file_path.read_text(encoding="utf-8")

56
workspace/worker.py Normal file
View File

@@ -0,0 +1,56 @@
import json
import time
import convert_obj_to_md
import redis
# Directory names for uploaded and converted files; neither constant is
# referenced by the code visible in this file.
UPLOAD_DIR = "data"
OUTPUT_DIR = "converted"
# Redis client the worker pops tasks from; decode_responses=True makes
# replies come back as str rather than bytes.
redis_client = redis.StrictRedis(host="localhost", port=6379, decode_responses=True)
def process_task(task):
    """Run the converter that matches the task's file extension.

    Args:
        task: Mapping with the keys ``"extension"``, ``"input_path"`` and
            ``"output_path"``.

    Returns:
        None. An unsupported extension is reported on stdout and the task
        is skipped.
    """
    converters = {
        "txt": convert_obj_to_md.convert_txt_to_md,
        # The /convert/ endpoint enqueues html files, so they need an entry
        # here; without it every html task is reported as unsupported.
        "html": convert_obj_to_md.convert_html_to_md,
        "hwp": convert_obj_to_md.convert_hwp_to_md,
        "docx": convert_obj_to_md.convert_docx_to_md,
        "pdf": convert_obj_to_md.convert_pdf_to_md,
        "ppt": convert_obj_to_md.convert_ppt_to_md,
        "excel": convert_obj_to_md.convert_excel_to_md,
    }
    file_ext = task.get("extension")
    converter = converters.get(file_ext)
    if converter is None:
        print(f"지원하지 않는 파일 형식: {file_ext}")
        return
    converter(task.get("input_path"), task.get("output_path"))
def worker():
    """Poll the Redis task queue forever and convert each task that arrives.

    When the queue is empty the loop reports it and waits five seconds
    before polling again. Errors raised while decoding or processing a
    task are printed and the loop continues with the next task.
    """
    while True:
        # Tasks are taken from the right end of the "task_queue" list.
        payload = redis_client.rpop("task_queue")
        if not payload:
            print("큐에 작업이 없습니다. 5초 후 재시도...")
            time.sleep(5)
            continue
        try:
            task = json.loads(payload)
            print(f"작업 처리 중: {task}")
            process_task(task)
        except Exception as exc:
            print(f"작업 처리 중 에러: {exc}")
# Start the polling loop only when this file is executed as a script.
if __name__ == "__main__":
    print("Redis 워커 시작!")
    worker()