import logging
from typing import List, Callable

from langchain_community.document_loaders import (
    PDFMinerLoader,
    PDFPlumberLoader,
    PyPDFium2Loader,
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
    CSVLoader,
    JSONLoader,
    UnstructuredMarkdownLoader,
    BSHTMLLoader,
    UnstructuredXMLLoader,
    DirectoryLoader,
)
from langchain_unstructured import UnstructuredLoader
from langchain_upstage import UpstageDocumentParseLoader
from llama_index.core.node_parser import (
    TokenTextSplitter,
    SentenceSplitter,
    SentenceWindowNodeParser,
    SemanticSplitterNodeParser,
    SemanticDoubleMergingSplitterNodeParser,
    SimpleFileNodeParser,
)
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    KonlpyTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

from autorag import LazyInit

logger = logging.getLogger("AutoRAG")

# Document loader classes available to the parsing stage, keyed by module name.
parse_modules = {
    # PDF
    "pdfminer": PDFMinerLoader,
    "pdfplumber": PDFPlumberLoader,
    "pypdfium2": PyPDFium2Loader,
    "pypdf": PyPDFLoader,
    "pymupdf": PyMuPDFLoader,
    "unstructuredpdf": UnstructuredPDFLoader,
    # Common File Types
    # 1. CSV
    "csv": CSVLoader,
    # 2. JSON
    "json": JSONLoader,
    # 3. Markdown
    "unstructuredmarkdown": UnstructuredMarkdownLoader,
    # 4. HTML
    "bshtml": BSHTMLLoader,
    # 5. XML
    "unstructuredxml": UnstructuredXMLLoader,
    # 6. All files
    "directory": DirectoryLoader,
    "unstructured": UnstructuredLoader,
    "upstagedocumentparse": UpstageDocumentParseLoader,
}

# Text splitter classes available to the chunking stage, keyed by module name.
chunk_modules = {
    # LlamaIndex
    # Token
    "token": TokenTextSplitter,
    # Sentence
    "sentence": SentenceSplitter,
    # Window
    "sentencewindow": SentenceWindowNodeParser,
    # Semantic
    "semantic_llama_index": SemanticSplitterNodeParser,
    "semanticdoublemerging": SemanticDoubleMergingSplitterNodeParser,
    # Simple
    "simplefile": SimpleFileNodeParser,
    # LangChain
    # Token
    "sentencetransformerstoken": SentenceTransformersTokenTextSplitter,
    # Character
    "recursivecharacter": RecursiveCharacterTextSplitter,
    "character": CharacterTextSplitter,
    # Sentence
    "konlpy": KonlpyTextSplitter,
}


def split_by_sentence_kiwi() -> Callable[[str], List[str]]:
    """Build a Korean sentence-splitting function backed by kiwipiepy's Kiwi analyzer."""
    try:
        from kiwipiepy import Kiwi
    except ImportError:
        raise ImportError(
            "You need to install kiwipiepy to use the 'kiwi' sentence splitter. "
            "Please install kiwipiepy by running 'pip install kiwipiepy', "
            "or install the Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
        )

    kiwi = Kiwi()

    def split(text: str) -> List[str]:
        # Kiwi returns Sentence objects; keep only the raw sentence text.
        return [sentence.text for sentence in kiwi.split_into_sents(text)]

    return split


sentence_splitter_modules = {"kiwi": LazyInit(split_by_sentence_kiwi)}
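

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not AutoRAG's own pipeline code):
    # the registries above are typically resolved by name elsewhere in AutoRAG.
    # "example.pdf" is a placeholder path, and chunk_size/chunk_overlap values
    # here are assumptions, not defaults from the source.
    documents = parse_modules["pdfminer"]("example.pdf").load()
    splitter = chunk_modules["token"](chunk_size=512, chunk_overlap=50)
    chunks = splitter.split_text(documents[0].page_content)
    logger.info("Parsed %d document(s) into %d chunk(s)", len(documents), len(chunks))

    # The kiwi splitter is wrapped in LazyInit so kiwipiepy is only imported on
    # first use; calling the factory yields the actual split function:
    #     split = split_by_sentence_kiwi()
    #     sentences = split("안녕하세요. 반갑습니다.")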