Fix Dockerfile build issue
This commit is contained in:
0
autorag/data/utils/__init__.py
Normal file
0
autorag/data/utils/__init__.py
Normal file
103
autorag/data/utils/util.py
Normal file
103
autorag/data/utils/util.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import mimetypes
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Callable
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
from langchain_core.documents import Document
|
||||
from llama_index.core.schema import NodeRelationship
|
||||
|
||||
from autorag.schema import Module
|
||||
from autorag.utils.util import make_combinations, explode
|
||||
|
||||
|
||||
def get_file_metadata(file_path: str) -> Dict:
|
||||
"""Get some handy metadate from filesystem.
|
||||
|
||||
Args:
|
||||
file_path: str: file path in str
|
||||
"""
|
||||
return {
|
||||
"file_path": file_path,
|
||||
"file_name": os.path.basename(file_path),
|
||||
"file_type": mimetypes.guess_type(file_path)[0],
|
||||
"file_size": os.path.getsize(file_path),
|
||||
"creation_datetime": datetime.fromtimestamp(
|
||||
Path(file_path).stat().st_ctime
|
||||
).strftime("%Y-%m-%d"),
|
||||
"last_modified_datetime": datetime.fromtimestamp(
|
||||
Path(file_path).stat().st_mtime
|
||||
).strftime("%Y-%m-%d"),
|
||||
"last_accessed_datetime": datetime.fromtimestamp(
|
||||
Path(file_path).stat().st_atime
|
||||
).strftime("%Y-%m-%d"),
|
||||
}
|
||||
|
||||
|
||||
def add_essential_metadata(metadata: Dict) -> Dict:
|
||||
if "last_modified_datetime" not in metadata:
|
||||
metadata["last_modified_datetime"] = datetime.now()
|
||||
return metadata
|
||||
|
||||
|
||||
def corpus_df_to_langchain_documents(corpus_df: pd.DataFrame) -> List[Document]:
|
||||
page_contents = corpus_df["contents"].tolist()
|
||||
ids = corpus_df["doc_id"].tolist()
|
||||
metadatas = corpus_df["metadata"].tolist()
|
||||
return list(
|
||||
map(
|
||||
lambda x: Document(page_content=x[0], metadata={"filename": x[1], **x[2]}),
|
||||
zip(page_contents, ids, metadatas),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict:
|
||||
if "last_modified_datetime" not in metadata:
|
||||
metadata["last_modified_datetime"] = datetime.now()
|
||||
|
||||
if "prev_id" not in metadata:
|
||||
if NodeRelationship.PREVIOUS in relationships:
|
||||
prev_node = relationships.get(NodeRelationship.PREVIOUS, None)
|
||||
if prev_node:
|
||||
metadata["prev_id"] = prev_node.node_id
|
||||
|
||||
if "next_id" not in metadata:
|
||||
if NodeRelationship.NEXT in relationships:
|
||||
next_node = relationships.get(NodeRelationship.NEXT, None)
|
||||
if next_node:
|
||||
metadata["next_id"] = next_node.node_id
|
||||
return metadata
|
||||
|
||||
|
||||
def load_yaml(yaml_path: str):
|
||||
if not os.path.exists(yaml_path):
|
||||
raise ValueError(f"YAML file {yaml_path} does not exist.")
|
||||
with open(yaml_path, "r", encoding="utf-8") as stream:
|
||||
try:
|
||||
yaml_dict = yaml.safe_load(stream)
|
||||
except yaml.YAMLError as exc:
|
||||
raise ValueError(f"YAML file {yaml_path} could not be loaded.") from exc
|
||||
return yaml_dict["modules"]
|
||||
|
||||
|
||||
def get_param_combinations(modules: List[Dict]) -> Tuple[List[Callable], List[Dict]]:
|
||||
module_callable_list, module_params_list = [], []
|
||||
for module in modules:
|
||||
module_instance = Module.from_dict(module)
|
||||
module_params_list.append(module_instance.module_param)
|
||||
module_callable_list.append(module_instance.module)
|
||||
|
||||
combinations = list(map(make_combinations, module_params_list))
|
||||
module_list, combination_list = explode(module_callable_list, combinations)
|
||||
return module_list, combination_list
|
||||
|
||||
|
||||
def get_start_end_idx(original_text: str, search_str: str) -> Tuple[int, int]:
|
||||
start_idx = original_text.find(search_str)
|
||||
if start_idx == -1:
|
||||
return 0, 0
|
||||
end_idx = start_idx + len(search_str)
|
||||
return start_idx, end_idx - 1
|
||||
Reference in New Issue
Block a user