Fix Dockerfile build issue
autorag/evaluator.py  (normal file, 560 lines added)
@@ -0,0 +1,560 @@
import glob
import json
import logging
import os
import shutil
from datetime import datetime
from itertools import chain
from typing import List, Dict, Optional
from rich.progress import Progress, BarColumn, TimeElapsedColumn

import pandas as pd
import yaml

from autorag.node_line import run_node_line
from autorag.nodes.retrieval.base import get_bm25_pkl_name
from autorag.nodes.retrieval.bm25 import bm25_ingest
from autorag.nodes.retrieval.vectordb import (
    vectordb_ingest,
    filter_exist_ids,
    filter_exist_ids_from_retrieval_gt,
)
from autorag.schema import Node
from autorag.schema.node import (
    module_type_exists,
    extract_values_from_nodes,
    extract_values_from_nodes_strategy,
)
from autorag.utils import (
    cast_qa_dataset,
    cast_corpus_dataset,
    validate_qa_from_corpus_dataset,
)
from autorag.utils.util import (
    load_summary_file,
    explode,
    load_yaml_config,
    get_event_loop,
)
from autorag.vectordb import load_all_vectordb_from_yaml

logger = logging.getLogger("AutoRAG")

ascii_art = """
|
||||
_ _____ _____
|
||||
/\ | | | __ \ /\ / ____|
|
||||
/ \ _ _| |_ ___ | |__) | / \ | | __
|
||||
/ /\ \| | | | __/ _ \| _ / / /\ \| | |_ |
|
||||
/ ____ \ |_| | || (_) | | \ \ / ____ \ |__| |
|
||||
/_/ \_\__,_|\__\___/|_| \_\/_/ \_\_____|
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class Evaluator:
    def __init__(
        self,
        qa_data_path: str,
        corpus_data_path: str,
        project_dir: Optional[str] = None,
    ):
        """
        Initialize an Evaluator object.

        :param qa_data_path: The path to the QA dataset.
            Must be a parquet file.
        :param corpus_data_path: The path to the corpus dataset.
            Must be a parquet file.
        :param project_dir: The path to the project directory.
            Default is the current directory.
        """
        # validate data paths
        if not os.path.exists(qa_data_path):
            raise ValueError(f"QA data path {qa_data_path} does not exist.")
        if not os.path.exists(corpus_data_path):
            raise ValueError(f"Corpus data path {corpus_data_path} does not exist.")
        if not qa_data_path.endswith(".parquet"):
            raise ValueError(f"QA data path {qa_data_path} is not a parquet file.")
        if not corpus_data_path.endswith(".parquet"):
            raise ValueError(
                f"Corpus data path {corpus_data_path} is not a parquet file."
            )
        self.qa_data_path = qa_data_path
        self.corpus_data_path = corpus_data_path
        self.qa_data = pd.read_parquet(qa_data_path, engine="pyarrow")
        self.corpus_data = pd.read_parquet(corpus_data_path, engine="pyarrow")
        self.qa_data = cast_qa_dataset(self.qa_data)
        self.corpus_data = cast_corpus_dataset(self.corpus_data)
        self.project_dir = project_dir if project_dir is not None else os.getcwd()
        if not os.path.exists(self.project_dir):
            os.makedirs(self.project_dir)

        validate_qa_from_corpus_dataset(self.qa_data, self.corpus_data)

        # copy dataset to the project directory
        if not os.path.exists(os.path.join(self.project_dir, "data")):
            os.makedirs(os.path.join(self.project_dir, "data"))
        qa_path_in_project = os.path.join(self.project_dir, "data", "qa.parquet")
        if not os.path.exists(qa_path_in_project):
            self.qa_data.to_parquet(qa_path_in_project, index=False)
        corpus_path_in_project = os.path.join(
            self.project_dir, "data", "corpus.parquet"
        )
        if not os.path.exists(corpus_path_in_project):
            self.corpus_data.to_parquet(corpus_path_in_project, index=False)

    def start_trial(
        self, yaml_path: str, skip_validation: bool = False, full_ingest: bool = True
    ):
        """
        Start an AutoRAG trial.
        A trial is one experiment to optimize the RAG pipeline.
        It consists of ingesting corpus data, running all nodes and modules, evaluating them, and finding the optimal modules.

        :param yaml_path: The config YAML path.
        :param skip_validation: If True, it skips the validation step.
            The validation step checks that the input config YAML file is well formatted
            and that there are no problems with the system settings.
            Default is False.
        :param full_ingest: If True, it checks whether the whole corpus data from corpus.parquet already exists in the Vector DB.
            If your corpus is huge and you don't want to check the whole vector DB, set it to False.
        :return: None
        """
        # Make Resources directory
        os.makedirs(os.path.join(self.project_dir, "resources"), exist_ok=True)

        if not skip_validation:
            logger.info(ascii_art)
            logger.info(
                "Start validating the input data and config YAML file first. "
                "If you want to skip this, pass the --skip_validation flag or "
                "set `skip_validation=True` in the start_trial function."
            )
            from autorag.validator import Validator  # resolve circular import

            validator = Validator(
                qa_data_path=self.qa_data_path, corpus_data_path=self.corpus_data_path
            )
            validator.validate(yaml_path)

        os.environ["PROJECT_DIR"] = self.project_dir

        trial_name = self.__get_new_trial_name()
        self.__make_trial_dir(trial_name)

        # copy YAML file to the trial directory
        shutil.copy(
            yaml_path, os.path.join(self.project_dir, trial_name, "config.yaml")
        )
        yaml_dict = load_yaml_config(yaml_path)
        vectordb = yaml_dict.get("vectordb", [])

        vectordb_config_path = os.path.join(
            self.project_dir, "resources", "vectordb.yaml"
        )
        with open(vectordb_config_path, "w") as f:
            yaml.safe_dump({"vectordb": vectordb}, f)

        node_lines = self._load_node_lines(yaml_path)
        self.__ingest_bm25_full(node_lines)

        with Progress(
            "[progress.description]{task.description}",
            BarColumn(),
            "[progress.percentage]{task.percentage:>3.0f}%",
            "[progress.bar]{task.completed}/{task.total}",
            TimeElapsedColumn(),
        ) as progress:
            # Ingest VectorDB corpus
            if any(
                list(
                    map(
                        lambda nodes: module_type_exists(nodes, "vectordb"),
                        node_lines.values(),
                    )
                )
            ):
                task_ingest = progress.add_task("[cyan]Ingesting VectorDB...", total=1)

                loop = get_event_loop()
                loop.run_until_complete(self.__ingest_vectordb(yaml_path, full_ingest))

                progress.update(task_ingest, completed=1)

            trial_summary_df = pd.DataFrame(
                columns=[
                    "node_line_name",
                    "node_type",
                    "best_module_filename",
                    "best_module_name",
                    "best_module_params",
                    "best_execution_time",
                ]
            )
            task_eval = progress.add_task(
                "[cyan]Evaluating...", total=sum(map(len, node_lines.values()))
            )

            for i, (node_line_name, node_line) in enumerate(node_lines.items()):
                node_line_dir = os.path.join(
                    self.project_dir, trial_name, node_line_name
                )
                os.makedirs(node_line_dir, exist_ok=False)
                if i == 0:
                    previous_result = self.qa_data
                logger.info(f"Running node line {node_line_name}...")
                previous_result = run_node_line(
                    node_line, node_line_dir, previous_result, progress, task_eval
                )

                trial_summary_df = self._append_node_line_summary(
                    node_line_name, node_line_dir, trial_summary_df
                )

        trial_summary_df.to_csv(
            os.path.join(self.project_dir, trial_name, "summary.csv"), index=False
        )

        logger.info("Evaluation complete.")

    def __ingest_bm25_full(self, node_lines: Dict[str, List[Node]]):
        if any(
            list(
                map(
                    lambda nodes: module_type_exists(nodes, "bm25"), node_lines.values()
                )
            )
        ):
            logger.info("Embedding BM25 corpus...")
            bm25_tokenizer_list = list(
                chain.from_iterable(
                    map(
                        lambda nodes: self._find_bm25_tokenizer(nodes),
                        node_lines.values(),
                    )
                )
            )

            if len(bm25_tokenizer_list) == 0:
                bm25_tokenizer_list = ["porter_stemmer"]
            for bm25_tokenizer in bm25_tokenizer_list:
                bm25_dir = os.path.join(
                    self.project_dir, "resources", get_bm25_pkl_name(bm25_tokenizer)
                )
                if not os.path.exists(os.path.dirname(bm25_dir)):
                    os.makedirs(os.path.dirname(bm25_dir))
                # ingest anyway because bm25 supports updating with new corpus data
                bm25_ingest(bm25_dir, self.corpus_data, bm25_tokenizer=bm25_tokenizer)
            logger.info("BM25 corpus embedding complete.")

    def __get_new_trial_name(self) -> str:
        trial_json_path = os.path.join(self.project_dir, "trial.json")
        if not os.path.exists(trial_json_path):
            return "0"
        with open(trial_json_path, "r") as f:
            trial_json = json.load(f)
        return str(int(trial_json[-1]["trial_name"]) + 1)

    def __make_trial_dir(self, trial_name: str):
        trial_json_path = os.path.join(self.project_dir, "trial.json")
        if os.path.exists(trial_json_path):
            with open(trial_json_path, "r") as f:
                trial_json = json.load(f)
        else:
            trial_json = []

        trial_json.append(
            {
                "trial_name": trial_name,
                "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
        )
        os.makedirs(os.path.join(self.project_dir, trial_name))
        with open(trial_json_path, "w") as f:
            json.dump(trial_json, f, indent=4)

    @staticmethod
    def _load_node_lines(yaml_path: str) -> Dict[str, List[Node]]:
        yaml_dict = load_yaml_config(yaml_path)
        node_lines = yaml_dict["node_lines"]
        node_line_dict = {}
        for node_line in node_lines:
            node_line_dict[node_line["node_line_name"]] = list(
                map(lambda node: Node.from_dict(node), node_line["nodes"])
            )
        return node_line_dict

    def restart_trial(self, trial_path: str):
        logger.info(ascii_art)
        os.environ["PROJECT_DIR"] = self.project_dir
        # Check if trial_path exists
        if not os.path.exists(trial_path):
            raise ValueError(f"Trial path {trial_path} does not exist.")
        # Check if trial is completed
        if os.path.exists(os.path.join(trial_path, "summary.csv")):
            raise ValueError(f"Trial path {trial_path} is already completed.")

        # Extract node lines from config.yaml
        yaml_path = os.path.join(trial_path, "config.yaml")
        node_lines = self._load_node_lines(yaml_path)

        node_line_names = list(node_lines.keys())
        nodes = list(node_lines.values())
        node_names = list(
            map(lambda node: list(map(lambda n: n.node_type, node)), nodes)
        )

        # If the first node line folder hasn't even been created, proceed to start_trial
        if not os.path.exists(os.path.join(trial_path, node_line_names[0])):
            self.start_trial(yaml_path)
            return None

        # Find conflict node line and node
        conflict_line_name, conflict_node_name = self.__find_conflict_point(
            trial_path, node_line_names, node_lines
        )
        node_dir = os.path.join(trial_path, conflict_line_name, conflict_node_name)
        if os.path.exists(node_dir):
            shutil.rmtree(node_dir)

        # Set remain_nodes and remain_lines
        remain_nodes, completed_node_names, remain_lines, remain_line_names = (
            self._set_remain_nodes_and_lines(
                node_line_names,
                nodes,
                node_names,
                conflict_node_name,
                conflict_line_name,
            )
        )
        # Set previous_result
        previous_result = self.__set_previous_result(
            node_line_names, node_names, trial_path, conflict_node_name
        )

        # Run Node
        if remain_nodes:
            conflict_line_dir = os.path.join(trial_path, conflict_line_name)
            summary_lst = []
            # Get already-run node summaries and append to summary_lst
            for completed_node_name in completed_node_names:
                summary_lst = self._append_node_summary(
                    conflict_line_dir, completed_node_name, summary_lst
                )
            for node in remain_nodes:
                previous_result = node.run(previous_result, conflict_line_dir)
                summary_lst = self._append_node_summary(
                    conflict_line_dir, node.node_type, summary_lst
                )
            pd.DataFrame(summary_lst).to_csv(
                os.path.join(conflict_line_dir, "summary.csv"), index=False
            )

        # Run node line
        trial_summary_df = pd.DataFrame(
            columns=[
                "node_line_name",
                "node_type",
                "best_module_filename",
                "best_module_name",
                "best_module_params",
                "best_execution_time",
            ]
        )
        completed_line_names = node_line_names[
            : node_line_names.index(conflict_line_name)
        ]
        # Get already-run node lines' summaries and append to trial_summary_df
        if completed_line_names:
            for line_name in completed_line_names:
                node_line_dir = os.path.join(trial_path, line_name)
                trial_summary_df = self._append_node_line_summary(
                    line_name, node_line_dir, trial_summary_df
                )
        if remain_lines:
            for node_line_name, node_line in zip(remain_line_names, remain_lines):
                node_line_dir = os.path.join(trial_path, node_line_name)
                if not os.path.exists(node_line_dir):
                    os.makedirs(node_line_dir)
                logger.info(f"Running node line {node_line_name}...")
                previous_result = run_node_line(
                    node_line, node_line_dir, previous_result
                )
                trial_summary_df = self._append_node_line_summary(
                    node_line_name, node_line_dir, trial_summary_df
                )
        trial_summary_df.to_csv(os.path.join(trial_path, "summary.csv"), index=False)

        logger.info("Evaluation complete.")

    def __find_conflict_point(
        self,
        trial_path: str,
        node_line_names: List[str],
        node_lines: Dict[str, List[Node]],
    ) -> tuple[str, str]:
        for node_line_name in node_line_names:
            node_line_dir = os.path.join(trial_path, node_line_name)
            if not os.path.exists(node_line_dir):
                return node_line_name, node_lines[node_line_name][0].node_type

            if not os.path.exists(os.path.join(node_line_dir, "summary.csv")):
                conflict_node_name = self._find_conflict_node_name(
                    node_line_dir, node_lines[node_line_name]
                )
                return node_line_name, conflict_node_name

        raise ValueError(f"No error node line found in {trial_path}.")

    @staticmethod
    def _find_conflict_node_name(node_line_dir: str, node_line: List[Node]) -> str:
        for node in node_line:
            node_dir = os.path.join(node_line_dir, node.node_type)
            if not os.path.exists(node_dir) or not os.path.exists(
                os.path.join(node_dir, "summary.csv")
            ):
                return node.node_type
        raise TypeError("No conflict node name found.")

    def __set_previous_result(
        self,
        node_line_names: List[str],
        node_names: List[List[str]],
        trial_path: str,
        conflict_node_name: str,
    ):
        exploded_node_line, exploded_node = explode(node_line_names, node_names)
        conflict_node_index = exploded_node.index(conflict_node_name)
        # Set previous_result
        if conflict_node_index == 0:
            previous_result = self.qa_data
        else:
            previous_node_line = exploded_node_line[conflict_node_index - 1]
            previous_node = exploded_node[conflict_node_index - 1]

            previous_node_dir = os.path.join(
                trial_path, previous_node_line, previous_node
            )
            best_file_pattern = f"{previous_node_dir}/best_*.parquet"
            previous_result = pd.read_parquet(
                glob.glob(best_file_pattern)[0], engine="pyarrow"
            )
        return previous_result

    @staticmethod
    def _set_remain_nodes_and_lines(
        node_line_names: List[str],
        nodes: List[List[Node]],
        node_names: List[List[str]],
        conflict_node_name: str,
        conflict_node_line_name: str,
    ):
        conflict_node_line_index = node_line_names.index(conflict_node_line_name)
        full_conflict_node_line_nodes = nodes[conflict_node_line_index]
        full_conflict_node_line_node_names = node_names[conflict_node_line_index]

        if conflict_node_name == full_conflict_node_line_node_names[0]:
            remain_nodes = None
            completed_node_names = None
            remain_node_lines = nodes[conflict_node_line_index:]
            remain_node_line_names = node_line_names[conflict_node_line_index:]
        else:
            conflict_node_index = full_conflict_node_line_node_names.index(
                conflict_node_name
            )
            remain_nodes = full_conflict_node_line_nodes[conflict_node_index:]
            completed_node_names = full_conflict_node_line_node_names[
                :conflict_node_index
            ]
            if conflict_node_line_index + 1 >= len(node_line_names):
                remain_node_lines = None
                remain_node_line_names = None
            else:
                remain_node_lines = nodes[conflict_node_line_index + 1 :]
                remain_node_line_names = node_line_names[conflict_node_line_index + 1 :]
        return (
            remain_nodes,
            completed_node_names,
            remain_node_lines,
            remain_node_line_names,
        )

    @staticmethod
    def _append_node_line_summary(
        node_line_name: str, node_line_dir: str, trial_summary_df: pd.DataFrame
    ):
        summary_df = load_summary_file(
            os.path.join(node_line_dir, "summary.csv"),
            dict_columns=["best_module_params"],
        )
        summary_df = summary_df.assign(node_line_name=node_line_name)
        summary_df = summary_df[list(trial_summary_df.columns)]
        if len(trial_summary_df) <= 0:
            trial_summary_df = summary_df
        else:
            trial_summary_df = pd.concat(
                [trial_summary_df, summary_df], ignore_index=True
            )
        return trial_summary_df

    @staticmethod
    def _append_node_summary(
        node_line_dir: str, node_name: str, summary_lst: List[Dict]
    ):
        node_summary_df = load_summary_file(
            os.path.join(node_line_dir, node_name, "summary.csv")
        )
        best_node_row = node_summary_df.loc[node_summary_df["is_best"]]
        summary_lst.append(
            {
                "node_type": node_name,
                "best_module_filename": best_node_row["filename"].values[0],
                "best_module_name": best_node_row["module_name"].values[0],
                "best_module_params": best_node_row["module_params"].values[0],
                "best_execution_time": best_node_row["execution_time"].values[0],
            }
        )
        return summary_lst

    @staticmethod
    def _find_bm25_tokenizer(nodes: List[Node]):
        bm25_tokenizer_list = extract_values_from_nodes(nodes, "bm25_tokenizer")
        strategy_tokenizer_list = list(
            chain.from_iterable(
                extract_values_from_nodes_strategy(nodes, "bm25_tokenizer")
            )
        )
        return list(set(bm25_tokenizer_list + strategy_tokenizer_list))

    @staticmethod
    def _find_embedding_model(nodes: List[Node]):
        embedding_models_list = extract_values_from_nodes(nodes, "embedding_model")
        retrieval_module_dicts = extract_values_from_nodes_strategy(
            nodes, "retrieval_modules"
        )
        for retrieval_modules in retrieval_module_dicts:
            vectordb_modules = list(
                filter(lambda x: x["module_type"] == "vectordb", retrieval_modules)
            )
            embedding_models_list.extend(
                list(map(lambda x: x.get("embedding_model", None), vectordb_modules))
            )
        embedding_models_list = list(
            filter(lambda x: x is not None, embedding_models_list)
        )
        return list(set(embedding_models_list))

    async def __ingest_vectordb(self, yaml_path, full_ingest: bool):
        vectordb_list = load_all_vectordb_from_yaml(yaml_path, self.project_dir)
        if full_ingest is True:
            # get the target ingest corpus from the whole corpus
            for vectordb in vectordb_list:
                target_corpus = await filter_exist_ids(vectordb, self.corpus_data)
                await vectordb_ingest(vectordb, target_corpus)
        else:
            # get the target ingest corpus from the retrieval gt only
            for vectordb in vectordb_list:
                target_corpus = await filter_exist_ids_from_retrieval_gt(
                    vectordb, self.qa_data, self.corpus_data
                )
                await vectordb_ingest(vectordb, target_corpus)
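
For context, here is a minimal usage sketch of the Evaluator API added in this file. The dataset paths, project directory, and config file name below are illustrative placeholders, not part of this commit.

from autorag.evaluator import Evaluator

# Placeholder paths for illustration; both datasets must be parquet files.
evaluator = Evaluator(
    qa_data_path="./data/qa.parquet",
    corpus_data_path="./data/corpus.parquet",
    project_dir="./my_project",  # created if it does not exist
)

# Run one optimization trial from a config YAML.
# Validation runs first unless skip_validation=True.
evaluator.start_trial("./config.yaml", skip_validation=False, full_ingest=True)

# A trial that stopped partway (no summary.csv yet) can be resumed
# from the node where it failed, e.g. the first trial directory "0":
# evaluator.restart_trial("./my_project/0")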