from langchain_teddynote.document_loaders import HWPLoader
from markitdown import MarkItDown


def convert_hwp_to_md(input_path: str, output_path: str) -> None:
    """Extract the text of an HWP file and write it to ``output_path``.

    The file is loaded with ``HWPLoader``; the text of every loaded document
    is joined with newlines and written as UTF-8.

    Conversion is best-effort per document: a document whose text cannot be
    obtained or re-encoded is reported on stdout and left out of the output
    instead of aborting the whole conversion.

    Args:
        input_path: Path of the HWP file to read.
        output_path: Path of the text/Markdown file to create or overwrite.
    """
    loader = HWPLoader(input_path)
    docs = loader.load()

    docs_as_text = []
    for doc in docs:
        try:
            # Loaded items normally expose ``page_content``; fall back to the
            # string form for anything that does not.
            text = doc.page_content if hasattr(doc, "page_content") else str(doc)
            # Drop characters that cannot be encoded as UTF-8 (for example
            # lone surrogates) so the file write below cannot fail on them.
            text = text.encode("utf-8", "ignore").decode("utf-8")
        except Exception as e:  # deliberate best-effort: skip the bad document
            print(f"인코딩 변환 중 오류 발생: {e}")
        else:
            docs_as_text.append(text)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(docs_as_text))


def convert_to_md(input_path: str, output_path: str) -> None:
    """Convert a file with MarkItDown and write the result to ``output_path``.

    Args:
        input_path: Path of the source document handed to
            ``MarkItDown.convert``.
        output_path: Path of the file to create or overwrite; the converted
            ``text_content`` is written as UTF-8.
    """
    md = MarkItDown()
    result = md.convert(input_path)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.text_content)