import logging
import os
import pathlib
import uuid
from copy import deepcopy
from typing import Optional, Dict, List

import pandas as pd
import yaml

from autorag.support import get_support_modules
from autorag.utils.util import load_summary_file, load_yaml_config

logger = logging.getLogger("AutoRAG")

def extract_node_line_names(config_dict: Dict) -> List[str]:
    """
    Extract node line names in the order given by the config dictionary.

    :param config_dict: The YAML configuration dict for the pipeline.
        You can load this from trial_folder/config.yaml.
    :return: The list of node line names,
        in the order they appear in the pipeline.
    """
    return [node_line["node_line_name"] for node_line in config_dict["node_lines"]]

def extract_node_strategy(config_dict: Dict) -> Dict:
    """
    Extract node strategies from the given config dictionary.
    The return value is a dictionary mapping each node type to its strategy.

    :param config_dict: The YAML configuration dict for the pipeline.
        You can load this from trial_folder/config.yaml.
    :return: Dictionary whose keys are node types and whose values are strategy dicts.
    """
    return {
        node["node_type"]: node.get("strategy", {})
        for node_line in config_dict["node_lines"]
        for node in node_line["nodes"]
    }

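# Illustrative sketch (not part of the library API): how the two extractors above
# behave on a minimal config dict. The node line, module, and metric names here
# are hypothetical placeholders.
#
# >>> sample_config = {
# ...     "node_lines": [
# ...         {
# ...             "node_line_name": "retrieve_node_line",
# ...             "nodes": [
# ...                 {
# ...                     "node_type": "retrieval",
# ...                     "strategy": {"metrics": ["retrieval_f1"]},
# ...                     "modules": [{"module_type": "bm25"}],
# ...                 }
# ...             ],
# ...         }
# ...     ]
# ... }
# >>> extract_node_line_names(sample_config)
# ['retrieve_node_line']
# >>> extract_node_strategy(sample_config)
# {'retrieval': {'metrics': ['retrieval_f1']}}
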
def summary_df_to_yaml(summary_df: pd.DataFrame, config_dict: Dict) -> Dict:
    """
    Convert a trial summary dataframe to a config YAML dict.

    :param summary_df: The trial summary dataframe of the evaluated trial.
    :param config_dict: The YAML configuration dict for the pipeline.
        You can load this from trial_folder/config.yaml.
    :return: Dictionary with the contents of the config YAML file.
        You can save this dictionary to a YAML file.
    """
    # summary_df columns : 'node_line_name', 'node_type', 'best_module_filename',
    # 'best_module_name', 'best_module_params', 'best_execution_time'
    node_line_names = extract_node_line_names(config_dict)
    node_strategies = extract_node_strategy(config_dict)
    strategy_df = pd.DataFrame(
        {
            "node_type": list(node_strategies.keys()),
            "strategy": list(node_strategies.values()),
        }
    )
    summary_df = summary_df.merge(strategy_df, on="node_type", how="left")
    # Use an ordered categorical so the node lines keep their original pipeline
    # order after sorting and grouping.
    summary_df["categorical_node_line_name"] = pd.Categorical(
        summary_df["node_line_name"], categories=node_line_names, ordered=True
    )
    summary_df = summary_df.sort_values(by="categorical_node_line_name")
    grouped = summary_df.groupby("categorical_node_line_name", observed=False)

    node_lines = [
        {
            "node_line_name": node_line_name,
            "nodes": [
                {
                    "node_type": row["node_type"],
                    "strategy": row["strategy"],
                    "modules": [
                        {
                            "module_type": row["best_module_name"],
                            **row["best_module_params"],
                        }
                    ],
                }
                for _, row in node_line.iterrows()
            ],
        }
        for node_line_name, node_line in grouped
    ]
    return {"node_lines": node_lines}

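# Illustrative sketch (hypothetical values): a summary row such as
#   node_line_name='retrieve_node_line', node_type='retrieval',
#   best_module_name='bm25', best_module_params={'top_k': 5}
# combined with the strategy from config_dict becomes
#   {'node_lines': [{'node_line_name': 'retrieve_node_line',
#                    'nodes': [{'node_type': 'retrieval',
#                               'strategy': {...},
#                               'modules': [{'module_type': 'bm25', 'top_k': 5}]}]}]}
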
def extract_best_config(trial_path: str, output_path: Optional[str] = None) -> Dict:
    """
    Extract the optimal pipeline from an evaluated trial.

    :param trial_path: The path to the trial directory from which you want to extract the pipeline.
        The trial must already be evaluated.
    :param output_path: The output path where the pipeline YAML file will be saved.
        Must be a .yaml or .yml file.
        If None, no YAML file is saved and only the dict is returned.
        Default is None.
    :return: The dictionary of the extracted pipeline.
    """
    summary_path = os.path.join(trial_path, "summary.csv")
    if not os.path.exists(summary_path):
        raise ValueError(f"summary.csv does not exist in {trial_path}.")
    trial_summary_df = load_summary_file(
        summary_path, dict_columns=["best_module_params"]
    )
    config_yaml_path = os.path.join(trial_path, "config.yaml")
    with open(config_yaml_path, "r") as f:
        config_dict = yaml.safe_load(f)
    yaml_dict = summary_df_to_yaml(trial_summary_df, config_dict)
    yaml_dict["vectordb"] = extract_vectordb_config(trial_path)
    if output_path is not None:
        with open(output_path, "w") as f:
            yaml.safe_dump(yaml_dict, f)
    return yaml_dict

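# Usage sketch (paths are hypothetical):
#
# >>> best = extract_best_config("./project_dir/0", output_path="./best.yaml")
# >>> best.keys()
# dict_keys(['node_lines', 'vectordb'])
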
def extract_vectordb_config(trial_path: str) -> List[Dict]:
    """
    Load the vector DB settings from resources/vectordb.yaml in the project directory
    (the parent of the trial directory).
    Falls back to the default Chroma setting when the file has no 'vectordb' entry.
    """
    project_dir = pathlib.PurePath(os.path.realpath(trial_path)).parent
    vectordb_config_path = os.path.join(project_dir, "resources", "vectordb.yaml")
    if not os.path.exists(vectordb_config_path):
        raise ValueError(f"vectordb.yaml does not exist at {vectordb_config_path}.")
    with open(vectordb_config_path, "r") as f:
        vectordb_dict = yaml.safe_load(f)
    result = vectordb_dict.get("vectordb", [])
    if len(result) != 0:
        return result
    # Return the default setting of Chroma.
    return [
        {
            "name": "default",
            "db_type": "chroma",
            "client_type": "persistent",
            "embedding_model": "openai",
            "collection_name": "openai",
            "path": os.path.join(project_dir, "resources", "chroma"),
        }
    ]

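# Expected shape of resources/vectordb.yaml, sketched from the code above
# (the values mirror the fallback entry; anything else is illustrative):
#
# vectordb:
#   - name: default
#     db_type: chroma
#     client_type: persistent
#     embedding_model: openai
#     collection_name: openai
#     path: ./resources/chroma
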
class BaseRunner:
    def __init__(self, config: Dict, project_dir: Optional[str] = None):
        self.config = config
        project_dir = os.getcwd() if project_dir is None else project_dir
        os.environ["PROJECT_DIR"] = project_dir

        # Initialize the modules. Each node must contain exactly one module,
        # so the runner can execute the pipeline as a simple sequence.
        node_lines = deepcopy(self.config["node_lines"])
        self.module_instances = []
        self.module_params = []
        for node_line in node_lines:
            for node in node_line["nodes"]:
                if len(node["modules"]) != 1:
                    raise ValueError(
                        "The number of modules in a node must be 1 to use the runner. "
                        "Please use the extract_best_config method to extract a YAML file "
                        "from an evaluated trial."
                    )
                module = node["modules"][0]
                module_type = module.pop("module_type")
                module_params = module
                module_instance = get_support_modules(module_type)(
                    project_dir=project_dir,
                    **module_params,
                )
                self.module_instances.append(module_instance)
                self.module_params.append(module_params)

    @classmethod
    def from_yaml(cls, yaml_path: str, project_dir: Optional[str] = None):
        """
        Load a Runner from a YAML file.
        The YAML file must be one extracted from an evaluated trial using the extract_best_config method.

        :param yaml_path: The path of the YAML file.
        :param project_dir: The path of the project directory.
            Default is the current directory.
        :return: Initialized Runner.
        """
        config = load_yaml_config(yaml_path)
        return cls(config, project_dir=project_dir)

    @classmethod
    def from_trial_folder(cls, trial_path: str):
        """
        Load a Runner from an evaluated trial folder.
        The trial must already be evaluated using the Evaluator class.
        This sets project_dir to the parent directory of the trial folder.

        :param trial_path: The path of the trial folder.
        :return: Initialized Runner.
        """
        config = extract_best_config(trial_path)
        return cls(config, project_dir=os.path.dirname(trial_path))

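# Usage sketch (paths are hypothetical): Runner, defined below, inherits both
# constructors, and the two are equivalent once a best config has been extracted.
#
# >>> runner = Runner.from_yaml("./best.yaml", project_dir="./project_dir")
# >>> runner = Runner.from_trial_folder("./project_dir/0")
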
class Runner(BaseRunner):
    def run(self, query: str, result_column: str = "generated_texts"):
        """
        Run the pipeline with a query.
        The loaded pipeline must start with a single query,
        so the first module of the pipeline must be a `query_expansion` or `retrieval` module.

        :param query: The query of the user.
        :param result_column: The result column name for the answer.
            Default is `generated_texts`, which is the output of the `generation` module.
        :return: The result of the pipeline.
        """
        previous_result = pd.DataFrame(
            {
                "qid": str(uuid.uuid4()),
                "query": [query],
                "retrieval_gt": [[]],
                "generation_gt": [""],
            }
        )  # pseudo qa data for execution
        for module_instance, module_param in zip(
            self.module_instances, self.module_params
        ):
            new_result = module_instance.pure(
                previous_result=previous_result, **module_param
            )
            # Drop columns that the new result overwrites, then append the new
            # columns so the next module sees the latest values.
            duplicated_columns = previous_result.columns.intersection(
                new_result.columns
            )
            drop_previous_result = previous_result.drop(columns=duplicated_columns)
            previous_result = pd.concat([drop_previous_result, new_result], axis=1)

        return previous_result[result_column].tolist()[0]

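# End-to-end usage sketch (the path and query are hypothetical):
#
# >>> runner = Runner.from_trial_folder("./project_dir/0")
# >>> answer = runner.run("What is AutoRAG?")
# >>> print(answer)  # the `generated_texts` output of the final generation module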