Fix Dockerfile build issue
This commit is contained in:
79
autorag/data/parse/base.py
Normal file
79
autorag/data/parse/base.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import functools
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from glob import glob
|
||||
from typing import Tuple, List, Optional
|
||||
import os
|
||||
|
||||
from autorag.utils import result_to_dataframe
|
||||
from autorag.data.utils.util import get_file_metadata
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def parser_node(func):
|
||||
@functools.wraps(func)
|
||||
@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
|
||||
def wrapper(
|
||||
data_path_glob: str,
|
||||
file_type: str,
|
||||
parse_method: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[List[str], List[str], List[int], List[datetime]]:
|
||||
logger.info(f"Running parser - {func.__name__} module...")
|
||||
|
||||
data_path_list = glob(data_path_glob)
|
||||
if not data_path_list:
|
||||
raise FileNotFoundError(f"data does not exits in {data_path_glob}")
|
||||
|
||||
assert file_type in [
|
||||
"pdf",
|
||||
"csv",
|
||||
"json",
|
||||
"md",
|
||||
"html",
|
||||
"xml",
|
||||
"all_files",
|
||||
], f"search type {file_type} is not supported"
|
||||
|
||||
# extract only files from data_path_list based on the file_type set in the YAML file
|
||||
data_paths = (
|
||||
[
|
||||
data_path
|
||||
for data_path in data_path_list
|
||||
if os.path.basename(data_path).split(".")[-1] == file_type
|
||||
]
|
||||
if file_type != "all_files"
|
||||
else data_path_list
|
||||
)
|
||||
|
||||
if func.__name__ == "langchain_parse":
|
||||
parse_method = parse_method.lower()
|
||||
if parse_method == "directory":
|
||||
path_split_list = data_path_glob.split("/")
|
||||
glob_path = path_split_list.pop()
|
||||
folder_path = "/".join(path_split_list)
|
||||
kwargs.update({"glob": glob_path, "path": folder_path})
|
||||
result = func(
|
||||
data_path_list=data_paths, parse_method=parse_method, **kwargs
|
||||
)
|
||||
else:
|
||||
result = func(
|
||||
data_path_list=data_paths, parse_method=parse_method, **kwargs
|
||||
)
|
||||
elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
|
||||
result = func(data_path_list=data_paths, **kwargs)
|
||||
else:
|
||||
raise ValueError(f"Unsupported module_type: {func.__name__}")
|
||||
result = _add_last_modified_datetime(result)
|
||||
return result
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def _add_last_modified_datetime(result):
|
||||
last_modified_datetime_lst = list(
|
||||
map(lambda x: get_file_metadata(x)["last_modified_datetime"], result[1])
|
||||
)
|
||||
result_with_dates = result + (last_modified_datetime_lst,)
|
||||
return result_with_dates
|
||||
Reference in New Issue
Block a user