Fix Dockerfile build issue
autorag/nodes/promptmaker/__init__.py (new file, +3)
@@ -0,0 +1,3 @@
from .long_context_reorder import LongContextReorder
from .window_replacement import WindowReplacement
from .fstring import Fstring
autorag/nodes/promptmaker/base.py (new file, +34)
@@ -0,0 +1,34 @@
import logging
from abc import ABCMeta
from pathlib import Path
from typing import Union

import pandas as pd

from autorag.schema.base import BaseModule

logger = logging.getLogger("AutoRAG")


class BasePromptMaker(BaseModule, metaclass=ABCMeta):
    def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
        logger.info(
            f"Initialize prompt maker node - {self.__class__.__name__} module..."
        )

    def __del__(self):
        logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.")

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(f"Running prompt maker node - {self.__class__.__name__} module...")
        # get query and retrieved contents from previous_result
        assert (
            "query" in previous_result.columns
        ), "previous_result must have query column."
        assert (
            "retrieved_contents" in previous_result.columns
        ), "previous_result must have retrieved_contents column."
        query = previous_result["query"].tolist()
        retrieved_contents = previous_result["retrieved_contents"].tolist()
        prompt = kwargs.pop("prompt")
        return query, retrieved_contents, prompt
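For orientation (not part of this commit), the sketch below shows how a concrete module is expected to build on BasePromptMaker: cast_to_run validates the previous_result columns and returns the parallel query and retrieved_contents lists plus the prompt template. The class name and its formatting logic are hypothetical.

# Hypothetical sketch, not part of the commit: a minimal custom prompt maker
# built on the BasePromptMaker added above. Assumes AutoRAG is installed and
# that result_to_dataframe / cast_to_run behave as shown in this diff.
import pandas as pd

from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe


class UpperCasePromptMaker(BasePromptMaker):  # hypothetical example class
    @result_to_dataframe(["prompts"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        query, retrieved_contents, prompt = self.cast_to_run(
            previous_result, *args, **kwargs
        )
        # build one prompt per row, upper-casing the joined contents for illustration
        return [
            prompt.format(query=q, retrieved_contents="\n\n".join(c).upper())
            for q, c in zip(query, retrieved_contents)
        ]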
autorag/nodes/promptmaker/fstring.py (new file, +49)
@@ -0,0 +1,49 @@
from typing import List

import pandas as pd

from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe


class Fstring(BasePromptMaker):
    @result_to_dataframe(["prompts"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        query, retrieved_contents, prompt = self.cast_to_run(
            previous_result, *args, **kwargs
        )
        return self._pure(prompt, query, retrieved_contents)

    def _pure(
        self, prompt: str, queries: List[str], retrieved_contents: List[List[str]]
    ) -> List[str]:
        """
        Make a prompt using f-string from a query and retrieved_contents.
        You must set a prompt or a list of prompts in the config YAML file like this:

        .. Code:: yaml
            nodes:
            - node_type: prompt_maker
              modules:
              - module_type: fstring
                prompt: [Answer this question: {query} \n\n {retrieved_contents},
                Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]

        :param prompt: A prompt string.
        :param queries: List of query strings.
        :param retrieved_contents: List of retrieved contents.
        :return: Prompts that are made by f-string.
        """

        def fstring_row(
            _prompt: str, _query: str, _retrieved_contents: List[str]
        ) -> str:
            contents_str = "\n\n".join(_retrieved_contents)
            return _prompt.format(query=_query, retrieved_contents=contents_str)

        return list(
            map(
                lambda x: fstring_row(prompt, x[0], x[1]),
                zip(queries, retrieved_contents),
            )
        )
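A standalone illustration (made-up sample data, not from the commit) of what fstring_row produces for a single row: the retrieved contents are joined with blank lines and substituted into the template.

# Illustrative only: mirrors fstring_row above for one query.
prompt = "Answer this question: {query} \n\n {retrieved_contents}"
query = "Who wrote the paper?"
retrieved_contents = ["Passage one.", "Passage two."]

result = prompt.format(
    query=query, retrieved_contents="\n\n".join(retrieved_contents)
)
print(result)  # prints the question followed by the two passages separated by blank lines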
autorag/nodes/promptmaker/long_context_reorder.py (new file, +83)
@@ -0,0 +1,83 @@
import logging
from typing import List

import numpy as np
import pandas as pd

from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe

logger = logging.getLogger("AutoRAG")


class LongContextReorder(BasePromptMaker):
    @result_to_dataframe(["prompts"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        query, retrieved_contents, prompt = self.cast_to_run(
            previous_result, *args, **kwargs
        )
        assert (
            "retrieve_scores" in previous_result.columns
        ), "previous_result must have retrieve_scores column."
        retrieve_scores = previous_result["retrieve_scores"].tolist()
        return self._pure(prompt, query, retrieved_contents, retrieve_scores)

    def _pure(
        self,
        prompt: str,
        queries: List[str],
        retrieved_contents: List[List[str]],
        retrieve_scores: List[List[float]],
    ) -> List[str]:
        """
        Models struggle to access significant details found
        in the center of extended contexts. A study
        (https://arxiv.org/abs/2307.03172) observed that the best
        performance typically arises when crucial data is positioned
        at the start or end of the input context. Additionally,
        as the input context lengthens, performance drops notably, even
        in models designed for long contexts.

        .. Code:: yaml
            nodes:
            - node_type: prompt_maker
              modules:
              - module_type: long_context_reorder
                prompt: [Answer this question: {query} \n\n {retrieved_contents},
                Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]

        :param prompt: A prompt string.
        :param queries: List of query strings.
        :param retrieved_contents: List of retrieved contents.
        :param retrieve_scores: List of retrieve scores.
        :return: Prompts that are made by long context reorder.
        """

        def long_context_reorder_row(
            _prompt: str,
            _query: str,
            _retrieved_contents: List[str],
            _retrieve_scores: List[float],
        ) -> str:
            if isinstance(_retrieved_contents, np.ndarray):
                _retrieved_contents = _retrieved_contents.tolist()
            if not len(_retrieved_contents) == len(_retrieve_scores):
                logger.info("If you use a summarizer, the reorder will not proceed.")
                return _prompt.format(
                    query=_query, retrieved_contents="\n\n".join(_retrieved_contents)
                )
            content_scores = list(zip(_retrieved_contents, _retrieve_scores))
            sorted_content_scores = sorted(
                content_scores, key=lambda x: x[1], reverse=True
            )
            content_result, score_result = zip(*sorted_content_scores)
            _retrieved_contents.append(content_result[0])
            contents_str = "\n\n".join(_retrieved_contents)
            return _prompt.format(query=_query, retrieved_contents=contents_str)

        return list(
            map(
                lambda x: long_context_reorder_row(prompt, x[0], x[1], x[2]),
                zip(queries, retrieved_contents, retrieve_scores),
            )
        )
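A standalone illustration (made-up passages and scores, not from the commit) of what long_context_reorder_row does when contents and scores align: the passages stay in their original order and the highest-scoring passage is appended once more at the end of the context.

# Illustrative only: mirrors the reordering step in long_context_reorder_row above.
contents = ["low passage", "high passage", "mid passage"]
scores = [0.1, 0.9, 0.5]

content_scores = sorted(zip(contents, scores), key=lambda x: x[1], reverse=True)
best_content = content_scores[0][0]    # "high passage"
reordered = contents + [best_content]  # original order, plus the best passage repeated at the end
print("\n\n".join(reordered))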
autorag/nodes/promptmaker/run.py (new file, +280)
@@ -0,0 +1,280 @@
import os
import pathlib
from copy import deepcopy
from typing import List, Dict, Optional, Union

import pandas as pd
import tokenlog

from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils import validate_qa_dataset
from autorag.utils.util import make_combinations, explode, split_dataframe


def run_prompt_maker_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    """
    Run prompt maker node.
    With this function, you can select the best prompt maker module.
    By default, when only one module is given, the evaluation is skipped.
    If you want to select the best prompt among modules, you can use strategies.
    When you use them, you must pass 'generator_modules' and their parameters in strategies,
    because generator modules and generation metrics are used to evaluate this module.
    It is recommended to use a single module and parameter set for evaluation,
    but you can use multiple modules and parameter sets.
    When you don't set a generator module in strategies, the default generator module is used.
    The default generator module is llama_index_llm with the openai gpt-3.5-turbo model.

    :param modules: Prompt maker module classes to run.
    :param module_params: Prompt maker module parameters.
    :param previous_result: Previous result dataframe.
        Could be query expansion's best result or qa data.
    :param node_line_dir: This node line's directory.
    :param strategies: Strategies for prompt maker node.
    :return: The best result dataframe.
        It contains previous result columns and the prompt maker's result column, which is 'prompts'.
    """
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    node_dir = os.path.join(node_line_dir, "prompt_maker")
    if not os.path.exists(node_dir):
        os.makedirs(node_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent

    # run modules
    results, execution_times = zip(
        *map(
            lambda task: measure_speed(
                task[0].run_evaluator,
                project_dir=project_dir,
                previous_result=previous_result,
                **task[1],
            ),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    # get average token usage
    token_usages = []
    for i, result in enumerate(results):
        token_logger = tokenlog.getLogger(
            f"prompt_maker_{i}", strategies.get("tokenizer", "gpt2")
        )
        token_logger.query_batch(result["prompts"].tolist())
        token_usages.append(token_logger.get_token_usage() / len(result))

    # save results to folder
    filepaths = list(
        map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(
        map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
    )  # execute save to parquet
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    # make summary file
    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
            "average_prompt_token": token_usages,
        }
    )

    metric_names, metric_params = cast_metrics(strategies.get("metrics"))

    # Run evaluation when there is more than one module.
    if len(modules) > 1:
        # pop general keys from strategies (e.g. metrics, speed_threshold)
        general_key = ["metrics", "speed_threshold", "token_threshold", "tokenizer"]
        general_strategy = dict(
            filter(lambda x: x[0] in general_key, strategies.items())
        )
        extra_strategy = dict(
            filter(lambda x: x[0] not in general_key, strategies.items())
        )

        # first, filter by threshold if it is enabled.
        if general_strategy.get("speed_threshold") is not None:
            results, filenames = filter_by_threshold(
                results, average_times, general_strategy["speed_threshold"], filenames
            )

        # Calculate tokens and save to summary
        if general_strategy.get("token_threshold") is not None:
            results, filenames = filter_by_threshold(
                results, token_usages, general_strategy["token_threshold"], filenames
            )

        # run metrics before filtering
        if metric_names is None or len(metric_names) <= 0:
            raise ValueError(
                "You must set at least one metric for prompt maker evaluation."
            )

        # get generator modules from strategy
        generator_callables, generator_params = make_generator_callable_params(
            extra_strategy
        )

        # get generation_gt
        qa_data = pd.read_parquet(
            os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
        )
        validate_qa_dataset(qa_data)
        generation_gt = qa_data["generation_gt"].tolist()
        generation_gt = list(map(lambda x: x.tolist(), generation_gt))

        metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]

        all_prompts = []
        for result in results:
            all_prompts.extend(result["prompts"].tolist())

        evaluation_result_all = evaluate_one_prompt_maker_node(
            all_prompts,
            generator_callables,
            generator_params,
            metric_inputs * len(results),
            general_strategy["metrics"],
            project_dir,
            strategy_name=strategies.get("strategy", "mean"),
        )
        evaluation_results = split_dataframe(
            evaluation_result_all, chunk_size=len(results[0])
        )

        evaluation_df = pd.DataFrame(
            {
                "filename": filenames,
                **{
                    f"prompt_maker_{metric_name}": list(
                        map(lambda x: x[metric_name].mean(), evaluation_results)
                    )
                    for metric_name in metric_names
                },
            }
        )
        summary_df = pd.merge(
            on="filename", left=summary_df, right=evaluation_df, how="left"
        )

        best_result, best_filename = select_best(
            evaluation_results,
            metric_names,
            filenames,
            strategies.get("strategy", "mean"),
        )
        # change metric name columns to prompt_maker_metric_name
        best_result = best_result.rename(
            columns={
                metric_name: f"prompt_maker_{metric_name}"
                for metric_name in metric_names
            }
        )
        best_result = best_result.drop(columns=["generated_texts"])
    else:
        best_result, best_filename = results[0], filenames[0]

    # add 'is_best' column at summary file
    summary_df["is_best"] = summary_df["filename"] == best_filename

    best_result = pd.concat([previous_result, best_result], axis=1)

    # save files
    summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
    best_result.to_parquet(
        os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"),
        index=False,
    )

    return best_result


def make_generator_callable_params(strategy_dict: Dict):
    node_dict = deepcopy(strategy_dict)
    generator_module_list: Optional[List[Dict]] = node_dict.pop(
        "generator_modules", None
    )
    if generator_module_list is None:
        generator_module_list = [
            {
                "module_type": "llama_index_llm",
                "llm": "openai",
                "model": "gpt-3.5-turbo",
            }
        ]
    node_params = node_dict
    modules = list(
        map(
            lambda module_dict: get_support_modules(module_dict.pop("module_type")),
            generator_module_list,
        )
    )
    param_combinations = list(
        map(
            lambda module_dict: make_combinations({**module_dict, **node_params}),
            generator_module_list,
        )
    )
    return explode(modules, param_combinations)


def evaluate_one_prompt_maker_node(
    prompts: List[str],
    generator_classes: List,
    generator_params: List[Dict],
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
    project_dir,
    strategy_name: str,
) -> pd.DataFrame:
    input_df = pd.DataFrame({"prompts": prompts})
    generator_results = list(
        map(
            lambda x: x[0].run_evaluator(
                project_dir=project_dir, previous_result=input_df, **x[1]
            ),
            zip(generator_classes, generator_params),
        )
    )
    evaluation_results = list(
        map(
            lambda x: evaluate_generator_result(x[0], metric_inputs, metrics),
            zip(generator_results, generator_classes),
        )
    )
    metric_names = (
        list(map(lambda x: x["metric_name"], metrics))
        if isinstance(metrics[0], dict)
        else metrics
    )
    best_result, _ = select_best(
        evaluation_results, metric_names, strategy_name=strategy_name
    )
    best_result = pd.concat([input_df, best_result], axis=1)
    return best_result  # it has 'generated_texts' column


def evaluate_generator_result(
    result_df: pd.DataFrame,
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
    @evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
    def evaluate(df):
        return df["generated_texts"].tolist()

    return evaluate(result_df)
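As a reference (illustrative values, not from the commit), a strategies dict passed to run_prompt_maker_node could look like the sketch below. It only uses keys the function above reads; the metric names and threshold values are assumptions to adapt to your setup.

# Hypothetical strategies dict for run_prompt_maker_node.
strategies = {
    "metrics": ["bleu", "meteor", "rouge"],  # assumed metric names; check your AutoRAG version
    "speed_threshold": 10,                   # filter out modules slower than this per-row average
    "token_threshold": 4000,                 # filter out modules whose average prompt token count exceeds this
    "tokenizer": "gpt2",                     # default used by the token logger above
    "strategy": "mean",                      # how select_best aggregates metric scores
    "generator_modules": [
        {
            "module_type": "llama_index_llm",
            "llm": "openai",
            "model": "gpt-3.5-turbo",
        }
    ],
}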
autorag/nodes/promptmaker/window_replacement.py (new file, +85)
@@ -0,0 +1,85 @@
import logging
import os
from typing import List, Dict

import pandas as pd

from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe, fetch_contents

logger = logging.getLogger("AutoRAG")


class WindowReplacement(BasePromptMaker):
    def __init__(self, project_dir: str, *args, **kwargs):
        super().__init__(project_dir, *args, **kwargs)
        # load corpus
        data_dir = os.path.join(project_dir, "data")
        self.corpus_data = pd.read_parquet(
            os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
        )

    @result_to_dataframe(["prompts"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        query, retrieved_contents, prompt = self.cast_to_run(
            previous_result, *args, **kwargs
        )
        retrieved_ids = previous_result["retrieved_ids"].tolist()
        # get metadata from corpus
        retrieved_metadata = fetch_contents(
            self.corpus_data, retrieved_ids, column_name="metadata"
        )
        return self._pure(prompt, query, retrieved_contents, retrieved_metadata)

    def _pure(
        self,
        prompt: str,
        queries: List[str],
        retrieved_contents: List[List[str]],
        retrieved_metadata: List[List[Dict]],
    ) -> List[str]:
        """
        Replace retrieved_contents with a window to create a prompt
        (only available for a corpus chunked with the sentence window method).
        You must set a prompt or a list of prompts in the config YAML file like this:

        .. Code:: yaml
            nodes:
            - node_type: prompt_maker
              modules:
              - module_type: window_replacement
                prompt: [Answer this question: {query} \n\n {retrieved_contents},
                Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]

        :param prompt: A prompt string.
        :param queries: List of query strings.
        :param retrieved_contents: List of retrieved contents.
        :param retrieved_metadata: List of retrieved metadata.
        :return: Prompts that are made by window_replacement.
        """

        def window_replacement_row(
            _prompt: str,
            _query: str,
            _retrieved_contents,
            _retrieved_metadata: List[Dict],
        ) -> str:
            window_list = []
            for content, metadata in zip(_retrieved_contents, _retrieved_metadata):
                if "window" in metadata:
                    window_list.append(metadata["window"])
                else:
                    window_list.append(content)
                    logger.info(
                        "Only available for corpus chunked with Sentence window method. "
                        "window_replacement will not proceed."
                    )
            contents_str = "\n\n".join(window_list)
            return _prompt.format(query=_query, retrieved_contents=contents_str)

        return list(
            map(
                lambda x: window_replacement_row(prompt, x[0], x[1], x[2]),
                zip(queries, retrieved_contents, retrieved_metadata),
            )
        )
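A standalone illustration (made-up chunks and metadata, not from the commit) of the replacement step above: a chunk whose metadata carries a 'window' entry, as produced by sentence-window chunking, is swapped for that window text, while chunks without one fall back to their original content.

# Illustrative only: mirrors window_replacement_row above for one row.
contents = ["short sentence A", "short sentence B"]
metadata = [
    {"window": "sentence before. short sentence A. sentence after."},
    {},  # no window -> fall back to the original content
]

window_list = [
    meta["window"] if "window" in meta else content
    for content, meta in zip(contents, metadata)
]
print("\n\n".join(window_list))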