Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions
--- a/autorag/nodes/retrieval/hybrid_rrf.py
+++ b/autorag/nodes/retrieval/hybrid_rrf.py
@@ -0,0 +1,128 @@
+import os
+from pathlib import Path
+from typing import List, Tuple, Union
+
+import pandas as pd
+
+from autorag.nodes.retrieval.base import HybridRetrieval
+from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
+
+
+class HybridRRF(HybridRetrieval):
+	def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
+		return hybrid_rrf(ids, scores, top_k, weight, rrf_k)
+
+	@classmethod
+	def run_evaluator(
+		cls,
+		project_dir: Union[str, Path],
+		previous_result: pd.DataFrame,
+		*args,
+		**kwargs,
+	):
+		if "ids" in kwargs and "scores" in kwargs:
+			data_dir = os.path.join(project_dir, "data")
+			corpus_df = pd.read_parquet(
+				os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
+			)
+
+			params = pop_params(hybrid_rrf, kwargs)
+			assert (
+				"ids" in params and "scores" in params and "top_k" in params
+			), "ids, scores, and top_k must be specified."
+
+			@result_to_dataframe(
+				["retrieved_contents", "retrieved_ids", "retrieve_scores"]
+			)
+			def __rrf(**rrf_params):
+				ids, scores = hybrid_rrf(**rrf_params)
+				contents = fetch_contents(corpus_df, ids)
+				return contents, ids, scores
+
+			return __rrf(**params)
+		else:
+			assert (
+				"target_modules" in kwargs and "target_module_params" in kwargs
+			), "target_modules and target_module_params must be specified if there is not ids and scores."
+			instance = cls(project_dir, *args, **kwargs)
+			result = instance.pure(previous_result, *args, **kwargs)
+			del instance
+			return result
+
+
+def hybrid_rrf(
+	ids: Tuple,
+	scores: Tuple,
+	top_k: int,
+	weight: int = 60,
+	rrf_k: int = -1,
+) -> Tuple[List[List[str]], List[List[float]]]:
+	"""
+	Hybrid RRF function.
+	RRF (Rank Reciprocal Fusion) is a method to fuse multiple retrieval results.
+	It is common to fuse dense retrieval and sparse retrieval results using RRF.
+	To use this function, you must input ids and scores as tuple.
+	It is more unique than other retrieval modules because it does not really execute retrieval but just fuses
+	the results of other retrieval functions.
+	So you have to run more than two retrieval modules before running this function.
+	And collect ids and scores result from each retrieval module.
+	Make it as a tuple and input it to this function.
+
+	:param ids: The tuple of ids that you want to fuse.
+	    The length of this must be the same as the length of scores.
+	:param scores: The retrieve scores that you want to fuse.
+	    The length of this must be the same as the length of ids.
+	:param top_k: The number of passages to be retrieved.
+	:param weight: Hyperparameter for RRF.
+	    It was originally rrf_k value.
+	    Default is 60.
+	    For more information, please visit our documentation.
+	:param rrf_k: (Deprecated) Hyperparameter for RRF.
+	    It was originally rrf_k value. Will remove at a further version.
+	:return: The tuple of ids and fused scores that are fused by RRF.
+	"""
+	assert len(ids) == len(scores), "The length of ids and scores must be the same."
+	assert len(ids) > 1, "You must input more than one retrieval results."
+	assert top_k > 0, "top_k must be greater than 0."
+	assert weight > 0, "rrf_k must be greater than 0."
+
+	if rrf_k != -1:
+		weight = int(rrf_k)
+	else:
+		weight = int(weight)
+
+	id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
+	score_df = pd.DataFrame(
+		{f"score_{i}": score_list for i, score_list in enumerate(scores)}
+	)
+	df = pd.concat([id_df, score_df], axis=1)
+
+	def rrf_pure_apply(row):
+		ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
+		scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
+		return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))
+
+	df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
+	return df["rrf_id"].tolist(), df["rrf_score"].tolist()
+
+
+def rrf_pure(
+	ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
+) -> Tuple[List[str], List[float]]:
+	df = pd.concat(
+		[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
+	)
+	rank_df = df.rank(ascending=False, method="min")
+	rank_df = rank_df.fillna(0)
+	rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
+	rank_df = rank_df.sort_values(by="rrf", ascending=False)
+	return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]
+
+
+def rrf_calculate(row, rrf_k):
+	result = 0
+	for r in row:
+		if r == 0:
+			continue
+		result += 1 / (r + rrf_k)
+	return result