Fix Dockerfile build issue

This commit is contained in:
kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

View File

103
autorag/data/utils/util.py Normal file
View File

@@ -0,0 +1,103 @@
import mimetypes
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Callable
import pandas as pd
import yaml
from langchain_core.documents import Document
from llama_index.core.schema import NodeRelationship
from autorag.schema import Module
from autorag.utils.util import make_combinations, explode
def get_file_metadata(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    The file is stat-ed exactly once so that the size and all three
    timestamps come from the same snapshot of the file.

    Args:
        file_path: Path of the file to inspect, as a string.

    Returns:
        A dict with the keys ``file_path``, ``file_name``, ``file_type``
        (MIME type guessed from the file name, or ``None`` when unknown),
        ``file_size`` (in bytes) and ``creation_datetime``,
        ``last_modified_datetime``, ``last_accessed_datetime`` formatted as
        ``YYYY-MM-DD`` strings in local time.

    Raises:
        OSError: If the file cannot be stat-ed (for example, it is missing).
    """
    stat_result = Path(file_path).stat()

    def _as_date(timestamp: float) -> str:
        # Timestamps are rendered at day precision, in local time.
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.st_size,
        # NOTE: st_ctime is the creation time on Windows, but the time of the
        # last metadata change on most Unix systems.
        "creation_datetime": _as_date(stat_result.st_ctime),
        "last_modified_datetime": _as_date(stat_result.st_mtime),
        "last_accessed_datetime": _as_date(stat_result.st_atime),
    }
def add_essential_metadata(metadata: Dict) -> Dict:
    """Ensure ``metadata`` carries a ``last_modified_datetime`` entry.

    The dict is modified in place and also returned. An existing value is
    left untouched; a missing one is filled with the current local time as a
    ``datetime`` object.

    Args:
        metadata: Metadata dict to complete.

    Returns:
        The same ``metadata`` dict that was passed in.
    """
    if "last_modified_datetime" in metadata:
        return metadata

    metadata["last_modified_datetime"] = datetime.now()
    return metadata
def corpus_df_to_langchain_documents(corpus_df: pd.DataFrame) -> List[Document]:
    """Convert the rows of a corpus DataFrame into langchain ``Document`` objects.

    Args:
        corpus_df: DataFrame that has ``contents``, ``doc_id`` and
            ``metadata`` columns. Each ``metadata`` cell must be a mapping.

    Returns:
        One ``Document`` per row, in row order. The row's ``contents`` becomes
        ``page_content``; the row's ``doc_id`` is stored under the
        ``"filename"`` metadata key, and the row's own metadata entries are
        merged on top (so a ``"filename"`` key in the row metadata wins).
    """
    texts = corpus_df["contents"].tolist()
    doc_ids = corpus_df["doc_id"].tolist()
    row_metadata = corpus_df["metadata"].tolist()

    return [
        Document(page_content=text, metadata={"filename": doc_id, **meta})
        for text, doc_id, meta in zip(texts, doc_ids, row_metadata)
    ]
def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict:
    """Fill in missing essential metadata for a llama-index text node.

    ``metadata`` is modified in place and also returned. Keys that are
    already present are never overwritten.

    Args:
        metadata: Metadata dict of the node.
        relationships: Mapping keyed by ``NodeRelationship``; the values for
            ``PREVIOUS`` / ``NEXT`` are expected to expose ``node_id``.

    Returns:
        The same ``metadata`` dict, with ``last_modified_datetime`` set to the
        current local time if it was missing, and ``prev_id`` / ``next_id``
        set from the related nodes when those relationships hold a truthy
        value.
    """
    if "last_modified_datetime" not in metadata:
        metadata["last_modified_datetime"] = datetime.now()

    if "prev_id" not in metadata:
        # A missing relationship and a falsy one are treated the same way.
        previous_node = relationships.get(NodeRelationship.PREVIOUS)
        if previous_node:
            metadata["prev_id"] = previous_node.node_id

    if "next_id" not in metadata:
        following_node = relationships.get(NodeRelationship.NEXT)
        if following_node:
            metadata["next_id"] = following_node.node_id

    return metadata
def load_yaml(yaml_path: str):
    """Load a YAML config file and return its ``modules`` section.

    Args:
        yaml_path: Path to the YAML file.

    Returns:
        The value stored under the top-level ``"modules"`` key.

    Raises:
        ValueError: If the file does not exist or cannot be parsed as YAML.
    """
    if not os.path.exists(yaml_path):
        raise ValueError(f"YAML file {yaml_path} does not exist.")

    with open(yaml_path, encoding="utf-8") as handle:
        try:
            parsed = yaml.safe_load(handle)
        except yaml.YAMLError as err:
            # Surface parser failures as ValueError, keeping the cause chained.
            raise ValueError(f"YAML file {yaml_path} could not be loaded.") from err

    return parsed["modules"]
def get_param_combinations(modules: List[Dict]) -> Tuple[List[Callable], List[Dict]]:
    """Expand module configs into parallel lists of callables and parameter sets.

    Each module dict is turned into a ``Module`` via ``Module.from_dict``. Its
    ``module_param`` is passed through ``make_combinations``, and the
    resulting per-module combinations are flattened together with the module
    callables by ``explode``.

    Args:
        modules: Module configuration dicts.

    Returns:
        A ``(module_list, combination_list)`` tuple as produced by ``explode``.
    """
    instances = [Module.from_dict(spec) for spec in modules]
    param_grids = [make_combinations(inst.module_param) for inst in instances]
    callables = [inst.module for inst in instances]

    exploded_modules, exploded_params = explode(callables, param_grids)
    return exploded_modules, exploded_params
def get_start_end_idx(original_text: str, search_str: str) -> Tuple[int, int]:
    """Locate the first occurrence of ``search_str`` in ``original_text``.

    Args:
        original_text: Text to search in.
        search_str: Substring to look for.

    Returns:
        ``(start, end)`` where ``end`` is the index of the last matched
        character (inclusive). ``(0, 0)`` is returned when the substring is
        not found.
    """
    try:
        first = original_text.index(search_str)
    except ValueError:
        # Not found: callers get the (0, 0) sentinel rather than an exception.
        return 0, 0

    last_inclusive = first + len(search_str) - 1
    return first, last_inclusive