import mimetypes
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Callable

import pandas as pd
import yaml
from langchain_core.documents import Document
from llama_index.core.schema import NodeRelationship

from autorag.schema import Module
from autorag.utils.util import make_combinations, explode


def get_file_metadata(file_path: str) -> Dict:
    """Get some handy metadata from the filesystem.

    Args:
        file_path: str: file path in str

    Returns:
        A dict with the keys ``file_path``, ``file_name``, ``file_type``
        (MIME type guessed from the file name, ``None`` when unknown),
        ``file_size`` (bytes) and ``creation_datetime`` /
        ``last_modified_datetime`` / ``last_accessed_datetime`` formatted
        as ``"%Y-%m-%d"`` strings in local time.

    Raises:
        OSError: if the file cannot be stat'ed (e.g. it does not exist).
    """
    # Stat once so every field describes the same snapshot of the file,
    # instead of issuing a separate syscall per field.
    file_stat = Path(file_path).stat()

    def _as_date(timestamp: float) -> str:
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")

    return {
        "file_path": file_path,
        "file_name": os.path.basename(file_path),
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": file_stat.st_size,
        # NOTE: st_ctime is platform dependent (creation time on Windows,
        # last metadata change on most Unix systems).
        "creation_datetime": _as_date(file_stat.st_ctime),
        "last_modified_datetime": _as_date(file_stat.st_mtime),
        "last_accessed_datetime": _as_date(file_stat.st_atime),
    }


def add_essential_metadata(metadata: Dict) -> Dict:
    """Ensure ``metadata`` has a ``last_modified_datetime`` entry.

    When the key is missing it is set to ``datetime.now()``. The dict is
    modified in place and the same object is returned.
    """
    if "last_modified_datetime" not in metadata:
        metadata["last_modified_datetime"] = datetime.now()
    return metadata


def corpus_df_to_langchain_documents(corpus_df: pd.DataFrame) -> List[Document]:
    """Convert a corpus dataframe into a list of langchain ``Document``.

    Reads the ``contents``, ``doc_id`` and ``metadata`` columns. Each row
    becomes one ``Document`` whose ``page_content`` is the row's contents and
    whose metadata is the row's metadata plus a ``filename`` entry holding
    the ``doc_id``.
    """
    page_contents = corpus_df["contents"].tolist()
    ids = corpus_df["doc_id"].tolist()
    metadatas = corpus_df["metadata"].tolist()
    return [
        # The row metadata is unpacked last, so a "filename" key already
        # present in it takes precedence over the doc_id.
        Document(page_content=content, metadata={"filename": doc_id, **metadata})
        for content, doc_id, metadata in zip(page_contents, ids, metadatas)
    ]


def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict:
    """Fill in essential metadata for a llama_index text node.

    Adds ``last_modified_datetime`` (``datetime.now()``) when missing, and
    ``prev_id`` / ``next_id`` taken from the ``node_id`` of the
    ``NodeRelationship.PREVIOUS`` / ``NodeRelationship.NEXT`` entries of
    ``relationships`` when those entries exist and are truthy. Keys already
    present in ``metadata`` are never overwritten.

    The dict is modified in place and the same object is returned.
    """
    if "last_modified_datetime" not in metadata:
        metadata["last_modified_datetime"] = datetime.now()

    if "prev_id" not in metadata:
        # A single .get() covers both "key absent" and "value is falsy".
        prev_node = relationships.get(NodeRelationship.PREVIOUS)
        if prev_node:
            metadata["prev_id"] = prev_node.node_id

    if "next_id" not in metadata:
        next_node = relationships.get(NodeRelationship.NEXT)
        if next_node:
            metadata["next_id"] = next_node.node_id

    return metadata


def load_yaml(yaml_path: str):
    """Load a YAML file and return the value stored under its ``modules`` key.

    Args:
        yaml_path: path to the YAML file.

    Raises:
        ValueError: if the path does not exist or the YAML cannot be parsed.
        KeyError: if the parsed mapping has no ``modules`` key.
    """
    if not os.path.exists(yaml_path):
        raise ValueError(f"YAML file {yaml_path} does not exist.")
    with open(yaml_path, "r", encoding="utf-8") as stream:
        try:
            yaml_dict = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            raise ValueError(f"YAML file {yaml_path} could not be loaded.") from exc
    return yaml_dict["modules"]


def get_param_combinations(modules: List[Dict]) -> Tuple[List[Callable], List[Dict]]:
    """Expand module configs into parallel lists of callables and parameters.

    Each dict in ``modules`` is turned into a ``Module`` via
    ``Module.from_dict``. Its ``module_param`` is passed through
    ``make_combinations``, and the callables together with their combinations
    are then passed to ``explode``.

    Returns:
        The ``(module_list, combination_list)`` pair produced by ``explode``.
    """
    module_callable_list, module_params_list = [], []
    for module in modules:
        module_instance = Module.from_dict(module)
        module_params_list.append(module_instance.module_param)
        module_callable_list.append(module_instance.module)

    combinations = [make_combinations(params) for params in module_params_list]
    module_list, combination_list = explode(module_callable_list, combinations)
    return module_list, combination_list


def get_start_end_idx(original_text: str, search_str: str) -> Tuple[int, int]:
    """Locate the first occurrence of ``search_str`` in ``original_text``.

    Returns:
        ``(start_idx, end_idx)`` where ``end_idx`` is inclusive, i.e. the
        index of the last matched character. ``(0, 0)`` is returned when
        ``search_str`` is not found. An empty ``search_str`` matches at
        index 0 and therefore yields ``(0, -1)``.
    """
    start_idx = original_text.find(search_str)
    if start_idx == -1:
        return 0, 0
    end_idx = start_idx + len(search_str)
    # Convert the exclusive end offset into an inclusive index.
    return start_idx, end_idx - 1