Files
rag_data_parser/workspace/convert_obj_to_md.py
2025-02-12 17:46:01 +09:00

53 lines
1.2 KiB
Python

from langchain_teddynote.document_loaders import HWPLoader
from markitdown import MarkItDown
def convert_hwp_to_md(input_path: str, output_path: str):
    """Extract the text of an HWP file and write it to ``output_path``.

    Args:
        input_path: Path of the ``.hwp`` document handed to ``HWPLoader``.
        output_path: Path of the file to create (overwritten if present).

    Returns:
        None. The result is the file written at ``output_path``.
    """
    loader = HWPLoader(input_path)
    docs = loader.load()
    # ``load()`` yields a list of Document objects, not a string; writing the
    # list directly raises TypeError, so join the text of each document.
    text = "\n\n".join(doc.page_content for doc in docs)
    with open(output_path, "w", encoding="UTF-8") as f:
        f.write(text)
    return None
def convert_txt_to_md(input_path: str, output_path: str):
    """Stub for the text-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_html_to_md(input_path: str, output_path: str):
    """Stub for the HTML-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_docx_to_md(input_path: str, output_path: str):
    """Stub for the DOCX-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_pdf_to_md(
    input_path: str,
    output_path: str,
    *,
    docintel_endpoint: "str | None" = None,
):
    """Convert a PDF with MarkItDown and write the result to ``output_path``.

    Args:
        input_path: Path of the PDF handed to ``MarkItDown.convert``.
        output_path: Path of the file to create (overwritten if present).
        docintel_endpoint: Optional Azure Document Intelligence endpoint URL.
            When omitted or empty, MarkItDown is constructed without it.

    Returns:
        None. The result is the file written at ``output_path``.
    """
    # The previous code passed the literal README placeholder
    # "<document_intelligence_endpoint>" as the endpoint, which is not a URL.
    # Only forward an endpoint the caller actually supplied.
    if docintel_endpoint:
        md = MarkItDown(docintel_endpoint=docintel_endpoint)
    else:
        md = MarkItDown()
    result = md.convert(input_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)
    return None
def convert_ppt_to_md(input_path: str, output_path: str):
    """Stub for the PowerPoint-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_excel_to_md(input_path: str, output_path: str):
    """Stub for the Excel-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_csv_to_md(input_path: str, output_path: str):
    """Stub for the CSV-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_json_to_md(input_path: str, output_path: str):
    """Stub for the JSON-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return
def convert_img_to_md(input_path: str, output_path: str):
    """Stub for the image-to-Markdown converter; not implemented yet.

    Both arguments are ignored: nothing is read or written, and the
    function returns ``None``.
    """
    return