Files
rag_data_parser/workspace/convert_obj_to_md.py
2025-02-14 12:13:05 +09:00

33 lines
1.0 KiB
Python

from langchain_teddynote.document_loaders import HWPLoader
from markitdown import MarkItDown
def convert_hwp_to_md(input_path: str, output_path: str) -> None:
    """Load a file with ``HWPLoader`` and write its text to ``output_path``.

    The text of every loaded document is joined with newlines and written
    as UTF-8. The text is written as extracted; no Markdown-specific
    formatting is applied by this function.

    Args:
        input_path: Path of the file handed to ``HWPLoader``.
        output_path: Path of the text file to create or overwrite.
    """
    docs = HWPLoader(input_path).load()

    # Convert the list of Document objects into a list of strings.
    docs_as_text = []
    for doc in docs:
        try:
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
            # Round-trip through UTF-8 with errors ignored so characters that
            # cannot be encoded (e.g. lone surrogates) are dropped here rather
            # than making the file write below fail.
            text = text.encode("utf-8", "ignore").decode("utf-8")
        except Exception as e:
            # Best effort: report the problem and skip only this document.
            print(f"인코딩 변환 중 오류 발생: {e}")
        else:
            docs_as_text.append(text)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(docs_as_text))
def convert_to_md(input_path: str, output_path: str):
    """Convert the file at ``input_path`` with MarkItDown and save the result.

    The ``text_content`` of the conversion result is written to
    ``output_path`` as UTF-8, creating or overwriting that file.
    Returns ``None``.
    """
    converted = MarkItDown().convert(input_path)
    with open(output_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(converted.text_content)