도커라이징
This commit is contained in:
@@ -5,48 +5,30 @@ from markitdown import MarkItDown
|
||||
def convert_hwp_to_md(input_path: str, output_path: str) -> None:
    """Extract the text of an HWP document and write it to ``output_path``.

    The file at ``input_path`` is loaded with ``HWPLoader``. Each loaded item
    contributes its ``page_content`` attribute, or ``str(item)`` when it has
    no such attribute. The pieces are joined with newlines and written out
    as UTF-8.

    Args:
        input_path: Path handed to ``HWPLoader``.
        output_path: Destination file; opened in ``"w"`` mode, so it is
            created or overwritten.

    Returns:
        None.
    """
    loader = HWPLoader(input_path)
    docs = loader.load()

    docs_as_text = []
    for doc in docs:
        try:
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
            # Round-trip through UTF-8 with errors="ignore" so that broken
            # characters which cannot be encoded are dropped.
            text = text.encode("utf-8", "ignore").decode("utf-8")
        except Exception as e:
            # Deliberate best effort: report the failing item and keep going
            # so one bad item does not abort the whole conversion.
            print(f"인코딩 변환 중 오류 발생: {e}")
        else:
            docs_as_text.append(text)

    # Write the joined text exactly once. Passing the raw ``docs`` list to
    # ``f.write`` would raise TypeError because ``write`` requires a str.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(docs_as_text))

    return None
|
||||
|
||||
|
||||
def convert_txt_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for text-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_html_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for HTML-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_docx_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for DOCX-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_pdf_to_md(input_path: str, output_path: str) -> None:
    """Delegate to :func:`convert_to_md` with the same arguments.

    Kept so that callers using the PDF-specific name continue to work.

    Args:
        input_path: Path of the document to convert.
        output_path: Destination file for the converted text.

    Returns:
        None.
    """
    return convert_to_md(input_path, output_path)


def convert_to_md(input_path: str, output_path: str) -> None:
    """Convert a document with MarkItDown and write the result as UTF-8 text.

    Args:
        input_path: Path passed to ``MarkItDown.convert``.
        output_path: Destination file; opened in ``"w"`` mode, so it is
            created or overwritten.

    Returns:
        None.
    """
    # NOTE(review): the endpoint literal looks like an unfilled placeholder;
    # confirm whether a real Document Intelligence endpoint must be supplied.
    md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
    result = md.convert(input_path)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)

    return None
|
||||
|
||||
|
||||
def convert_ppt_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for PPT-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_excel_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for Excel-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_csv_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for CSV-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_json_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for JSON-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
|
||||
def convert_img_to_md(input_path: str, output_path: str) -> None:
    """Placeholder for image-to-Markdown conversion (per the function name).

    Currently performs no work: neither argument is read and nothing is
    written.

    Args:
        input_path: Unused.
        output_path: Unused.

    Returns:
        None.
    """
    return None
|
||||
|
||||
Reference in New Issue
Block a user