Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

@@ -0,0 +1,4 @@
from .bm25 import BM25
from .hybrid_cc import HybridCC
from .hybrid_rrf import HybridRRF
from .vectordb import VectorDB

@@ -0,0 +1,127 @@
import abc
import logging
import os
from typing import List, Union, Tuple
import pandas as pd
from autorag.schema import BaseModule
from autorag.support import get_support_modules
from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset
from autorag.utils.util import pop_params
logger = logging.getLogger("AutoRAG")
class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, *args, **kwargs):
logger.info(f"Initialize retrieval node - {self.__class__.__name__}")
self.resources_dir = os.path.join(project_dir, "resources")
data_dir = os.path.join(project_dir, "data")
# fetch data from corpus_data
self.corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
def __del__(self):
logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running retrieval node - {self.__class__.__name__} module...")
validate_qa_dataset(previous_result)
# find queries columns & type cast queries
assert (
"query" in previous_result.columns
), "previous_result must have query column."
if "queries" not in previous_result.columns:
previous_result["queries"] = previous_result["query"]
previous_result.loc[:, "queries"] = previous_result["queries"].apply(
cast_queries
)
queries = previous_result["queries"].tolist()
return queries
class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
def __init__(
self, project_dir: str, target_modules, target_module_params, *args, **kwargs
):
super().__init__(project_dir)
self.target_modules = list(
map(
lambda x, y: get_support_modules(x)(
**y,
project_dir=project_dir,
),
target_modules,
target_module_params,
)
)
self.target_module_params = target_module_params
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
result_dfs: List[pd.DataFrame] = list(
map(
lambda x, y: x.pure(
**y,
previous_result=previous_result,
),
self.target_modules,
self.target_module_params,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs)
)
scores = tuple(
map(
lambda df: df["retrieve_scores"].apply(list).tolist(),
result_dfs,
)
)
_pure_params = pop_params(self._pure, kwargs)
if "ids" in _pure_params or "scores" in _pure_params:
raise ValueError(
"With specifying ids or scores, you must use HybridRRF.run_evaluator instead."
)
ids, scores = self._pure(ids=ids, scores=scores, **_pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def cast_queries(queries: Union[str, List[str]]) -> List[str]:
if isinstance(queries, str):
return [queries]
elif isinstance(queries, List):
return queries
else:
raise ValueError(f"queries must be str or list, but got {type(queries)}")
def evenly_distribute_passages(
ids: List[List[str]], scores: List[List[float]], top_k: int
) -> Tuple[List[str], List[float]]:
assert len(ids) == len(scores), "ids and scores must have the same length."
query_cnt = len(ids)
avg_len = top_k // query_cnt
remainder = top_k % query_cnt
new_ids = []
new_scores = []
for i in range(query_cnt):
if i < remainder:
new_ids.extend(ids[i][: avg_len + 1])
new_scores.extend(scores[i][: avg_len + 1])
else:
new_ids.extend(ids[i][:avg_len])
new_scores.extend(scores[i][:avg_len])
return new_ids, new_scores
def get_bm25_pkl_name(bm25_tokenizer: str):
bm25_tokenizer = bm25_tokenizer.replace("/", "")
return f"bm25_{bm25_tokenizer}.pkl"

@@ -0,0 +1,365 @@
import asyncio
import os
import pickle
import re
from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional
import numpy as np
import pandas as pd
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords
from nltk import PorterStemmer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from autorag.nodes.retrieval.base import (
evenly_distribute_passages,
BaseRetrieval,
get_bm25_pkl_name,
)
from autorag.utils import validate_corpus_dataset, fetch_contents
from autorag.utils.util import (
get_event_loop,
normalize_string,
result_to_dataframe,
pop_params,
)
def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]:
try:
from kiwipiepy import Kiwi, Token
except ImportError:
raise ImportError(
"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
"Please install kiwipiepy by running 'pip install kiwipiepy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
texts = list(map(lambda x: x.strip().lower(), texts))
kiwi = Kiwi()
tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts)
return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list]
def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Kkma
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Kkma()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_ko_okt(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Okt
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Okt()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]:
def tokenize_remove_stopword(text: str, stemmer) -> List[str]:
text = text.lower()
words = list(simple_extract_keywords(text))
return [stemmer.stem(word) for word in words]
stemmer = PorterStemmer()
tokenized_list: List[List[str]] = list(
map(lambda x: tokenize_remove_stopword(x, stemmer), texts)
)
return tokenized_list
def tokenize_space(texts: List[str]) -> List[List[str]]:
def tokenize_space_text(text: str) -> List[str]:
text = normalize_string(text)
return re.split(r"\s+", text.strip())
return list(map(tokenize_space_text, texts))
def load_bm25_corpus(bm25_path: str) -> Dict:
if bm25_path is None:
return {}
with open(bm25_path, "rb") as f:
bm25_corpus = pickle.load(f)
return bm25_corpus
def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]:
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install SudachiPy to use 'sudachipy' tokenizer. "
"Please install SudachiPy by running 'pip install sudachipy'."
)
# Initialize SudachiPy with the default tokenizer
tokenizer_obj = dictionary.Dictionary(dict="core").create()
# Choose the tokenizer mode: NORMAL, SEARCH, A
mode = tokenizer.Tokenizer.SplitMode.A
# Tokenize the input texts
tokenized_list = []
for text in texts:
tokens = tokenizer_obj.tokenize(text, mode)
tokenized_list.append([token.surface() for token in tokens])
return tokenized_list
BM25_TOKENIZER = {
"porter_stemmer": tokenize_porter_stemmer,
"ko_kiwi": tokenize_ko_kiwi,
"space": tokenize_space,
"ko_kkma": tokenize_ko_kkma,
"ko_okt": tokenize_ko_okt,
"sudachipy": tokenize_ja_sudachipy,
}
class BM25(BaseRetrieval):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize BM25 module.
(Retrieval)
:param project_dir: The project directory path.
:param bm25_tokenizer: The tokenizer name that is used for BM25.
It supports 'porter_stemmer', 'space', 'ko_kiwi', 'ko_kkma', 'ko_okt', 'sudachipy',
or any huggingface tokenizer name usable with `AutoTokenizer`.
Default is 'porter_stemmer'.
:param kwargs: The optional arguments.
"""
super().__init__(project_dir)
# check if bm25_path and file exist
bm25_tokenizer = kwargs.get("bm25_tokenizer", None)
if bm25_tokenizer is None:
bm25_tokenizer = "porter_stemmer"
bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer))
assert (
bm25_path is not None
), "bm25_path must be specified for using bm25 retrieval."
assert os.path.exists(
bm25_path
), f"bm25_path {bm25_path} does not exist. Please ingest first."
self.bm25_corpus = load_bm25_corpus(bm25_path)
assert (
"tokens" in self.bm25_corpus and "passage_id" in self.bm25_corpus
), "bm25_corpus must contain tokens and passage_id. Please check that you ingested the bm25 corpus correctly."
self.tokenizer = select_bm25_tokenizer(bm25_tokenizer)
assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, (
f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. "
f"You need to ingest again. Delete bm25 pkl file and re-ingest it."
)
self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"])
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, *args, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
BM25 retrieval function.
You have to load a pickle file that is already ingested.
:param queries: 2-d list of query strings.
Each element of the list is the list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: A 2-d list of passage ids retrieved from BM25 and a 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
"""
if ids is not None:
score_result = list(
map(
lambda query_list, id_list: get_bm25_scores(
query_list,
id_list,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
),
queries,
ids,
)
)
return ids, score_result
# run async bm25_pure function
tasks = [
bm25_pure(
input_queries,
top_k,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
)
for input_queries in queries
]
loop = get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
async def bm25_pure(
queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict
) -> Tuple[List[str], List[float]]:
"""
Async BM25 retrieval function.
It is used to run BM25 retrieval asynchronously, row by row.
:param queries: A list of query strings.
:param top_k: The number of passages to be retrieved.
:param tokenizer: A tokenizer that will be used to tokenize queries.
:param bm25_api: A bm25 api instance that will be used to retrieve passages.
:param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus.
Its data structure looks like this:
.. Code:: python
{
"tokens": [], # 2d list of tokens
"passage_id": [], # 2d list of passage_id. Type must be str.
}
:return: The tuple contains a list of passage ids that retrieved from bm25 and its scores.
"""
# Query tokenization is kept synchronous because the number of queries per row is small, so making it async would only add overhead.
tokenized_queries = tokenize(queries, tokenizer)
id_result = []
score_result = []
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
sorted_scores = sorted(scores, reverse=True)
top_n_index = np.argsort(scores)[::-1][:top_k]
ids = [bm25_corpus["passage_id"][i] for i in top_n_index]
id_result.append(ids)
score_result.append(sorted_scores[:top_k])
# make a total result to top_k
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
def get_bm25_scores(
queries: List[str],
ids: List[str],
tokenizer,
bm25_api: BM25Okapi,
bm25_corpus: Dict,
) -> List[float]:
if len(ids) == 0 or not bool(ids):
return []
tokenized_queries = tokenize(queries, tokenizer)
result_dict = {id_: [] for id_ in ids}
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
for i, id_ in enumerate(ids):
result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)])
result_df = pd.DataFrame(result_dict)
return result_df.max(axis=0).tolist()
def tokenize(queries: List[str], tokenizer) -> List[List[Union[int, str]]]:
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_queries = tokenizer(queries).input_ids
else:
tokenized_queries = tokenizer(queries)
return tokenized_queries
def bm25_ingest(
corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer"
):
if not corpus_path.endswith(".pkl"):
raise ValueError(f"Corpus path {corpus_path} is not a pickle file.")
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Initialize bm25_corpus
bm25_corpus = pd.DataFrame()
# Load the BM25 corpus if it exists and get the passage ids
if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
with open(corpus_path, "rb") as r:
corpus = pickle.load(r)
bm25_corpus = pd.DataFrame.from_dict(corpus)
duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)]
new_passage = corpus_data[
~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"])
]
else:
new_passage = corpus_data
if not new_passage.empty:
tokenizer = select_bm25_tokenizer(bm25_tokenizer)
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids
else:
tokenized_corpus = tokenizer(new_passage["contents"].tolist())
new_bm25_corpus = pd.DataFrame(
{
"tokens": tokenized_corpus,
"passage_id": new_passage["doc_id"].tolist(),
}
)
if not bm25_corpus.empty:
bm25_corpus_updated = pd.concat(
[bm25_corpus, new_bm25_corpus], ignore_index=True
)
bm25_dict = bm25_corpus_updated.to_dict("list")
else:
bm25_dict = new_bm25_corpus.to_dict("list")
# add tokenizer name to bm25_dict
bm25_dict["tokenizer_name"] = bm25_tokenizer
with open(corpus_path, "wb") as w:
pickle.dump(bm25_dict, w)
def select_bm25_tokenizer(
bm25_tokenizer: str,
) -> Callable[[List[str]], List[List[Union[int, str]]]]:
if bm25_tokenizer in list(BM25_TOKENIZER.keys()):
return BM25_TOKENIZER[bm25_tokenizer]
return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False)
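
A small sketch of the core scoring step inside bm25_pure, using rank_bm25 directly with the space tokenizer defined above (toy corpus; the import path is assumed from the package layout implied by the imports in this commit):

from rank_bm25 import BM25Okapi
from autorag.nodes.retrieval.bm25 import tokenize_space

# Toy corpus tokenized the same way bm25_ingest would with the 'space' tokenizer.
corpus_tokens = tokenize_space(["the cat sat", "a dog barked", "cats and dogs"])
bm25 = BM25Okapi(corpus_tokens)

query_tokens = tokenize_space(["cat sat"])[0]
scores = bm25.get_scores(query_tokens)  # one score per corpus passage
# bm25_pure then argsorts these scores in descending order and keeps the top_k passage ids.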

@@ -0,0 +1,214 @@
import os
from pathlib import Path
from typing import Tuple, List, Union
import numpy as np
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
def normalize_mm(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
max_value = np.max(arr)
min_value = np.min(arr)
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
def normalize_tmm(scores: List[float], fixed_min_value: float):
arr = np.array(scores)
max_value = np.max(arr)
norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value)
return norm_score
def normalize_z(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
norm_score = (arr - mean_value) / std_value
return norm_score
def normalize_dbsf(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
min_value = mean_value - 3 * std_value
max_value = mean_value + 3 * std_value
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
normalize_method_dict = {
"mm": normalize_mm,
"tmm": normalize_tmm,
"z": normalize_z,
"dbsf": normalize_dbsf,
}
class HybridCC(HybridRetrieval):
def _pure(
self,
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
):
return hybrid_cc(
ids,
scores,
top_k,
weight,
normalize_method,
semantic_theoretical_min_value,
lexical_theoretical_min_value,
)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_cc, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __cc(**cc_params):
ids, scores = hybrid_cc(**cc_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __cc(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_cc(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid CC function.
CC (convex combination) is a method to fuse lexical and semantic retrieval results.
It is a method that first normalizes the scores of each retrieval result,
and then combines them with the given weights.
Unlike other retrieval modules, it does not actually execute retrieval;
it only fuses the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function,
and collect the ids and scores result from each retrieval module.
Make them a tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
The semantic retrieval ids must be the first index.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
The semantic retrieval scores must be the first index.
:param top_k: The number of passages to be retrieved.
:param normalize_method: The normalization method to use.
There are several normalization methods that you can use with the hybrid cc method.
AutoRAG supports the following:
- `mm`: Min-max scaling
- `tmm`: Theoretical min-max scaling
- `z`: z-score normalization
- `dbsf`: 3-sigma normalization
:param weight: The weight value. If the weight is 1.0, the weight for the
semantic module is 1.0 and the weight for the lexical module is 0.0.
:param semantic_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
theoretical minimum value yourself. Default is -1.
:param lexical_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
theoretical minimum value yourself. Default is 0.
:return: The tuple of ids and fused scores that are fused by CC.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight >= 0, "The weight must be greater than or equal to 0."
assert weight <= 1, "The weight must be less than or equal to 1."
df = pd.DataFrame(
{
"semantic_ids": ids[0],
"lexical_ids": ids[1],
"semantic_score": scores[0],
"lexical_score": scores[1],
}
)
def cc_pure_apply(row):
return fuse_per_query(
row["semantic_ids"],
row["lexical_ids"],
row["semantic_score"],
row["lexical_score"],
normalize_method=normalize_method,
weight=weight,
top_k=top_k,
semantic_theoretical_min_value=semantic_theoretical_min_value,
lexical_theoretical_min_value=lexical_theoretical_min_value,
)
# fixed weight
df[["cc_id", "cc_score"]] = df.apply(
lambda row: cc_pure_apply(row), axis=1, result_type="expand"
)
return df["cc_id"].tolist(), df["cc_score"].tolist()
def fuse_per_query(
semantic_ids: List[str],
lexical_ids: List[str],
semantic_scores: List[float],
lexical_scores: List[float],
normalize_method: str,
weight: float,
top_k: int,
semantic_theoretical_min_value: float,
lexical_theoretical_min_value: float,
):
normalize_func = normalize_method_dict[normalize_method]
norm_semantic_scores = normalize_func(
semantic_scores, semantic_theoretical_min_value
)
norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value)
ids = [semantic_ids, lexical_ids]
scores = [norm_semantic_scores, norm_lexical_scores]
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
df.columns = ["semantic", "lexical"]
df = df.fillna(0)
df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1)
df = df.sort_values(by="weighted_sum", ascending=False)
return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()

@@ -0,0 +1,128 @@
import os
from pathlib import Path
from typing import List, Tuple, Union
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
class HybridRRF(HybridRetrieval):
def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
return hybrid_rrf(ids, scores, top_k, weight, rrf_k)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_rrf, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __rrf(**rrf_params):
ids, scores = hybrid_rrf(**rrf_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __rrf(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_rrf(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: int = 60,
rrf_k: int = -1,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid RRF function.
RRF (Reciprocal Rank Fusion) is a method to fuse multiple retrieval results.
It is common to fuse dense retrieval and sparse retrieval results using RRF.
To use this function, you must input ids and scores as tuples.
Unlike other retrieval modules, it does not actually execute retrieval;
it only fuses the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function,
and collect the ids and scores result from each retrieval module.
Make them a tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
:param top_k: The number of passages to be retrieved.
:param weight: Hyperparameter for RRF.
It was originally rrf_k value.
Default is 60.
For more information, please visit our documentation.
:param rrf_k: (Deprecated) Hyperparameter for RRF.
It was originally the rrf_k value. It will be removed in a future version.
:return: The tuple of ids and fused scores that are fused by RRF.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight > 0, "weight (originally rrf_k) must be greater than 0."
if rrf_k != -1:
weight = int(rrf_k)
else:
weight = int(weight)
id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
score_df = pd.DataFrame(
{f"score_{i}": score_list for i, score_list in enumerate(scores)}
)
df = pd.concat([id_df, score_df], axis=1)
def rrf_pure_apply(row):
ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))
df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
return df["rrf_id"].tolist(), df["rrf_score"].tolist()
def rrf_pure(
ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
) -> Tuple[List[str], List[float]]:
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
rank_df = df.rank(ascending=False, method="min")
rank_df = rank_df.fillna(0)
rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
rank_df = rank_df.sort_values(by="rrf", ascending=False)
return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]
def rrf_calculate(row, rrf_k):
result = 0
for r in row:
if r == 0:
continue
result += 1 / (r + rrf_k)
return result
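
A matching sketch for hybrid_rrf (same assumption about the import path): only the ranks matter, and each passage's fused score is the sum of 1 / (rank + weight) over the rankings it appears in.

from autorag.nodes.retrieval.hybrid_rrf import hybrid_rrf

ids = (
    [["d1", "d2", "d3"]],  # semantic ranking, one query
    [["d2", "d1", "d4"]],  # lexical ranking, one query
)
scores = (
    [[0.9, 0.8, 0.1]],     # used only to derive ranks
    [[12.0, 7.0, 1.0]],
)
fused_ids, fused_scores = hybrid_rrf(ids, scores, top_k=3, weight=60)
# e.g. "d1" is rank 1 semantically and rank 2 lexically,
# so its RRF score is 1/(1 + 60) + 1/(2 + 60).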

@@ -0,0 +1,544 @@
import logging
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Tuple, Union
import numpy as np
import pandas as pd
from autorag.evaluation import evaluate_retrieval
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils.util import get_best_row, to_list, apply_recursive
logger = logging.getLogger("AutoRAG")
semantic_module_names = ["vectordb", "VectorDB"]
lexical_module_names = ["bm25", "BM25"]
hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"]
def run_retrieval_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among retrieval node results.
:param modules: Retrieval modules to run.
:param module_params: Retrieval module parameters.
:param previous_result: Previous result dataframe.
Could be query expansion's best result or qa data.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for retrieval node.
:return: The best result dataframe.
It contains previous result columns and retrieval node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
save_dir = os.path.join(node_line_dir, "retrieval") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]:
"""
Run input modules and parameters.
:param input_modules: Input modules
:param input_module_params: Input module parameters
:return: First, it returns a list of result dataframes.
Second, it returns a list of average execution times per row.
"""
result, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(input_modules, input_module_params),
)
)
average_times = list(map(lambda x: x / len(result[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
result,
)
)
return result, average_times
def save_and_summary(
input_modules,
input_module_params,
result_list,
execution_time_list,
filename_start: int,
):
"""
Save the result and make summary file
:param input_modules: Input modules
:param input_module_params: Input module parameters
:param result_list: Result list
:param execution_time_list: Execution times
:param filename_start: The first filename to use
:return: The summary dataframe of the saved results.
"""
# save results to folder
filepaths = list(
map(
lambda x: os.path.join(save_dir, f"{x}.parquet"),
range(filename_start, filename_start + len(input_modules)),
)
)
list(
map(
lambda x: x[0].to_parquet(x[1], index=False),
zip(result_list, filepaths),
)
) # execute save to parquet
filename_list = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filename_list,
"module_name": list(map(lambda module: module.__name__, input_modules)),
"module_params": input_module_params,
"execution_time": execution_time_list,
**{
metric: list(map(lambda result: result[metric].mean(), result_list))
for metric in strategies.get("metrics")
},
}
)
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return summary_df
def find_best(results, average_times, filenames):
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
return selected_result, selected_filename
filename_first = 0
# run semantic modules
logger.info("Running retrieval node - semantic retrieval module...")
if any([module.__name__ in semantic_module_names for module in modules]):
semantic_modules, semantic_module_params = zip(
*filter(
lambda x: x[0].__name__ in semantic_module_names,
zip(modules, module_params),
)
)
semantic_results, semantic_times = run(semantic_modules, semantic_module_params)
semantic_summary_df = save_and_summary(
semantic_modules,
semantic_module_params,
semantic_results,
semantic_times,
filename_first,
)
semantic_selected_result, semantic_selected_filename = find_best(
semantic_results, semantic_times, semantic_summary_df["filename"].tolist()
)
semantic_summary_df["is_best"] = (
semantic_summary_df["filename"] == semantic_selected_filename
)
filename_first += len(semantic_modules)
else:
(
semantic_selected_filename,
semantic_summary_df,
semantic_results,
semantic_times,
) = None, pd.DataFrame(), [], []
# run lexical modules
logger.info("Running retrieval node - lexical retrieval module...")
if any([module.__name__ in lexical_module_names for module in modules]):
lexical_modules, lexical_module_params = zip(
*filter(
lambda x: x[0].__name__ in lexical_module_names,
zip(modules, module_params),
)
)
lexical_results, lexical_times = run(lexical_modules, lexical_module_params)
lexical_summary_df = save_and_summary(
lexical_modules,
lexical_module_params,
lexical_results,
lexical_times,
filename_first,
)
lexical_selected_result, lexical_selected_filename = find_best(
lexical_results, lexical_times, lexical_summary_df["filename"].tolist()
)
lexical_summary_df["is_best"] = (
lexical_summary_df["filename"] == lexical_selected_filename
)
filename_first += len(lexical_modules)
else:
(
lexical_selected_filename,
lexical_summary_df,
lexical_results,
lexical_times,
) = None, pd.DataFrame(), [], []
logger.info("Running retrieval node - hybrid retrieval module...")
# Next, run hybrid retrieval
if any([module.__name__ in hybrid_module_names for module in modules]):
hybrid_modules, hybrid_module_params = zip(
*filter(
lambda x: x[0].__name__ in hybrid_module_names,
zip(modules, module_params),
)
)
if all(
["target_module_params" in x for x in hybrid_module_params]
): # for Runner.run
# If target_module_params are already given, run hybrid retrieval directly
hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
else: # for Evaluator
# get id and score
ids_scores = get_ids_and_scores(
save_dir,
[semantic_selected_filename, lexical_selected_filename],
semantic_summary_df,
lexical_summary_df,
previous_result,
)
hybrid_module_params = list(
map(lambda x: {**x, **ids_scores}, hybrid_module_params)
)
# optimize each modules
real_hybrid_times = [
get_hybrid_execution_times(semantic_summary_df, lexical_summary_df)
] * len(hybrid_module_params)
hybrid_times = real_hybrid_times.copy()
hybrid_results = []
for module, module_param in zip(hybrid_modules, hybrid_module_params):
module_result_df, module_best_weight = optimize_hybrid(
module,
module_param,
strategies,
metric_inputs,
project_dir,
previous_result,
)
module_param["weight"] = module_best_weight
hybrid_results.append(module_result_df)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
hybrid_summary_df["execution_time"] = hybrid_times
best_semantic_summary_row = semantic_summary_df.loc[
semantic_summary_df["is_best"]
].iloc[0]
best_lexical_summary_row = lexical_summary_df.loc[
lexical_summary_df["is_best"]
].iloc[0]
target_modules = (
best_semantic_summary_row["module_name"],
best_lexical_summary_row["module_name"],
)
target_module_params = (
best_semantic_summary_row["module_params"],
best_lexical_summary_row["module_params"],
)
hybrid_summary_df = edit_summary_df_params(
hybrid_summary_df, target_modules, target_module_params
)
else:
if any([module.__name__ in hybrid_module_names for module in modules]):
logger.warning(
"You must at least one semantic module and lexical module for hybrid evaluation."
"Passing hybrid module."
)
_, hybrid_summary_df, hybrid_results, hybrid_times = (
None,
pd.DataFrame(),
[],
[],
)
summary = pd.concat(
[semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True
)
results = semantic_results + lexical_results + hybrid_results
average_times = semantic_times + lexical_times + hybrid_times
filenames = summary["filename"].tolist()
# filter by strategies
selected_result, selected_filename = find_best(results, average_times, filenames)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add summary.csv 'is_best' column
summary["is_best"] = summary["filename"] == selected_filename
# save the result files
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return best_result
def evaluate_retrieval_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
"""
Evaluate retrieval node from retrieval node result dataframe.
:param result_df: The result dataframe from a retrieval node.
:param metric_inputs: List of metric input schema for AutoRAG.
:param metrics: Metric list from input strategies.
:return: Return result_df with metrics columns.
The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
"""
@evaluate_retrieval(
metric_inputs=metric_inputs,
metrics=metrics,
)
def evaluate_this_module(df: pd.DataFrame):
return (
df["retrieved_contents"].tolist(),
df["retrieved_ids"].tolist(),
df["retrieve_scores"].tolist(),
)
return evaluate_this_module(result_df)
def edit_summary_df_params(
summary_df: pd.DataFrame, target_modules, target_module_params
) -> pd.DataFrame:
def delete_ids_scores(x):
del x["ids"]
del x["scores"]
return x
summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
summary_df["new_params"] = [
{"target_modules": target_modules, "target_module_params": target_module_params}
] * len(summary_df)
summary_df["module_params"] = summary_df.apply(
lambda row: {**row["module_params"], **row["new_params"]}, axis=1
)
summary_df = summary_df.drop(columns=["new_params"])
return summary_df
def get_ids_and_scores(
node_dir: str,
filenames: List[str],
semantic_summary_df: pd.DataFrame,
lexical_summary_df: pd.DataFrame,
previous_result,
) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
project_dir = pathlib.PurePath(node_dir).parent.parent.parent
best_results_df = list(
map(
lambda filename: pd.read_parquet(
os.path.join(node_dir, filename), engine="pyarrow"
),
filenames,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
)
scores = tuple(
map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
)
# search non-duplicate ids
semantic_ids = deepcopy(ids[0])
lexical_ids = deepcopy(ids[1])
def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
"""
Get, for each query, the ids from compare_ids that do not appear in target_ids.
For example, to get the ids that are missing from semantic_ids, pass semantic_ids as target_ids.
"""
result_ids = []
assert len(target_ids) == len(compare_ids)
for target_id_list, compare_id_list in zip(target_ids, compare_ids):
query_duplicated = list(set(compare_id_list) - set(target_id_list))
duplicate_list = query_duplicated if len(query_duplicated) != 0 else []
result_ids.append(duplicate_list)
return result_ids
lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids)
semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids)
new_id_tuple = (
[a + b for a, b in zip(semantic_ids, semantic_target_ids)],
[a + b for a, b in zip(lexical_ids, lexical_target_ids)],
)
# search non-duplicate ids' scores
new_semantic_scores = get_scores_by_ids(
semantic_target_ids, semantic_summary_df, project_dir, previous_result
)
new_lexical_scores = get_scores_by_ids(
lexical_target_ids, lexical_summary_df, project_dir, previous_result
)
new_score_tuple = (
[a + b for a, b in zip(scores[0], new_semantic_scores)],
[a + b for a, b in zip(scores[1], new_lexical_scores)],
)
return {
"ids": new_id_tuple,
"scores": new_score_tuple,
}
def get_scores_by_ids(
ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result
) -> List[List[float]]:
module_name = get_best_row(module_summary_df)["module_name"]
module_params = get_best_row(module_summary_df)["module_params"]
module = get_support_modules(module_name)
result_df = module.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
ids=ids,
**module_params,
)
return to_list(result_df["retrieve_scores"].tolist())
def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]:
return list(set(list1).symmetric_difference(set(list2)))
def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float:
lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][
"execution_time"
]
semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][
"execution_time"
]
return lexical_execution_time + semantic_execution_time
def optimize_hybrid(
hybrid_module_func: Callable,
hybrid_module_param: Dict,
strategy: Dict,
input_metrics: List[MetricInput],
project_dir,
previous_result,
):
if (
hybrid_module_func.__name__ == "HybridRRF"
or hybrid_module_func.__name__ == "hybrid_rrf"
):
weight_range = hybrid_module_param.pop("weight_range", (4, 80))
test_weight_size = weight_range[1] - weight_range[0] + 1
elif (
hybrid_module_func.__name__ == "HybridCC"
or hybrid_module_func.__name__ == "hybrid_cc"
):
weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0))
test_weight_size = hybrid_module_param.pop("test_weight_size", 101)
else:
raise ValueError("You must input hybrid module function at hybrid_module_func.")
weight_candidates = np.linspace(
weight_range[0], weight_range[1], test_weight_size
).tolist()
result_list = []
for weight_value in weight_candidates:
result_df = hybrid_module_func.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
weight=weight_value,
**hybrid_module_param,
)
result_list.append(result_df)
# evaluate here
if strategy.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result_list = list(
map(
lambda x: evaluate_retrieval_node(
x,
input_metrics,
strategy.get("metrics"),
),
result_list,
)
)
# select best result
best_result_df, best_weight = select_best(
result_list,
strategy.get("metrics"),
metadatas=weight_candidates,
strategy_name=strategy.get("strategy", "normalize_mean"),
)
return best_result_df, best_weight
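
The weight grids that optimize_hybrid sweeps can be reproduced with a couple of lines (a sketch of the defaults used above). Each candidate is passed to the module's run_evaluator, scored with the configured metrics, and select_best keeps the best-performing weight.

import numpy as np

# HybridRRF: every integer rrf_k in the default (4, 80) range -> 77 candidates.
rrf_range = (4, 80)
rrf_candidates = np.linspace(rrf_range[0], rrf_range[1], rrf_range[1] - rrf_range[0] + 1).tolist()

# HybridCC: 101 evenly spaced weights between 0.0 and 1.0 (step 0.01).
cc_candidates = np.linspace(0.0, 1.0, 101).tolist()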

@@ -0,0 +1,303 @@
import itertools
import logging
import os
from typing import List, Tuple, Optional
import numpy as np
import pandas as pd
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from autorag.evaluation.metric.util import (
calculate_l2_distance,
calculate_inner_product,
calculate_cosine_similarity,
)
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
from autorag.utils import (
validate_corpus_dataset,
cast_corpus_dataset,
cast_qa_dataset,
validate_qa_dataset,
)
from autorag.utils.util import (
get_event_loop,
process_batch,
openai_truncate_by_token,
flatten_apply,
result_to_dataframe,
pop_params,
fetch_contents,
empty_cuda_cache,
convert_inputs_to_list,
make_batch,
)
from autorag.vectordb import load_vectordb_from_yaml
from autorag.vectordb.base import BaseVectorStore
logger = logging.getLogger("AutoRAG")
class VectorDB(BaseRetrieval):
def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
"""
Initialize VectorDB retrieval node.
:param project_dir: The project directory path.
:param vectordb: The vectordb name.
You must configure the vectordb name in the config.yaml file.
If you don't configure, it uses the default vectordb.
:param kwargs: The optional arguments.
Not affected in the init method.
"""
super().__init__(project_dir)
vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
self.vector_store = load_vectordb_from_yaml(
vectordb_config_path, vectordb, project_dir
)
self.embedding_model = self.vector_store.embedding
def __del__(self):
del self.vector_store
del self.embedding_model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
embedding_batch: int = 128,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
VectorDB retrieval function.
You need a vector store collection that is already ingested,
and the embedding model that was used for the ingestion.
:param queries: 2-d list of query strings.
Each element of the list is the list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param embedding_batch: The number of queries to be processed in parallel.
This is used to prevent API error at the query embedding.
Default is 128.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: A 2-d list of passage ids retrieved from the vectordb and a 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
"""
# if ids are specified, compute their scores directly from the vector store
if ids is not None:
return self.__get_ids_scores(queries, ids, embedding_batch)
# run async vector_db_pure function
tasks = [
vectordb_pure(query_list, top_k, self.vector_store)
for query_list in queries
]
loop = get_event_loop()
results = loop.run_until_complete(
process_batch(tasks, batch_size=embedding_batch)
)
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
def __get_ids_scores(self, queries, ids, embedding_batch: int):
# truncate queries and embedding execution here.
openai_embedding_limit = 8000
if isinstance(self.embedding_model, OpenAIEmbedding):
queries = list(
map(
lambda query_list: openai_truncate_by_token(
query_list,
openai_embedding_limit,
self.embedding_model.model_name,
),
queries,
)
)
query_embeddings = flatten_apply(
run_query_embedding_batch,
queries,
embedding_model=self.embedding_model,
batch_size=embedding_batch,
)
loop = get_event_loop()
async def run_fetch(ids):
final_result = []
for id_list in ids:
if len(id_list) == 0:
final_result.append([])
else:
result = await self.vector_store.fetch(id_list)
final_result.append(result)
return final_result
content_embeddings = loop.run_until_complete(run_fetch(ids))
score_result = list(
map(
lambda query_embedding_list, content_embedding_list: get_id_scores(
query_embedding_list,
content_embedding_list,
similarity_metric=self.vector_store.similarity_metric,
),
query_embeddings,
content_embeddings,
)
)
return ids, score_result
async def vectordb_pure(
queries: List[str], top_k: int, vectordb: BaseVectorStore
) -> Tuple[List[str], List[float]]:
"""
Async VectorDB retrieval function.
It is used to run vector store retrieval asynchronously, row by row.
:param queries: A list of query strings.
:param top_k: The number of passages to be retrieved.
:param vectordb: The vector store instance.
:return: The tuple contains a list of passage ids that are retrieved from vectordb and a list of its scores.
"""
id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)
# Distribute passages evenly
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
async def filter_exist_ids(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Query the collection to check if IDs already exist
existed_bool_list = await vectordb.is_exist(ids=ids)
# Keep only the passages whose ids do not already exist in the vector store
new_passage = corpus_data[~pd.Series(existed_bool_list)]
return new_passage
async def filter_exist_ids_from_retrieval_gt(
vectordb: BaseVectorStore,
qa_data: pd.DataFrame,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
qa_data = cast_qa_dataset(qa_data)
validate_qa_dataset(qa_data)
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
retrieval_gt = (
qa_data["retrieval_gt"]
.apply(lambda x: list(itertools.chain.from_iterable(x)))
.tolist()
)
retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
retrieval_gt = list(set(retrieval_gt))
existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
add_ids = []
for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
if not is_exist:
add_ids.append(ret_gt)
new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
return new_passage
async def vectordb_ingest(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
):
"""
Ingest given corpus data to the vectordb.
It truncates corpus content to 8,000 tokens when the embedding model is OpenAIEmbedding.
Corpus content that is empty (whitespace only) is ignored,
and document ids that already exist in the collection are ignored as well.
:param vectordb: A vector store instance that you want to ingest into.
:param corpus_data: The corpus data that contains doc_id and contents columns.
"""
embedding_batch = vectordb.embedding_batch
if not corpus_data.empty:
new_contents = corpus_data["contents"].tolist()
new_ids = corpus_data["doc_id"].tolist()
content_batches = make_batch(new_contents, embedding_batch)
id_batches = make_batch(new_ids, embedding_batch)
for content_batch, id_batch in zip(content_batches, id_batches):
await vectordb.add(ids=id_batch, texts=content_batch)
def run_query_embedding_batch(
queries: List[str], embedding_model: BaseEmbedding, batch_size: int
) -> List[List[float]]:
result = []
for i in range(0, len(queries), batch_size):
batch = queries[i : i + batch_size]
embeddings = embedding_model.get_text_embedding_batch(batch)
result.extend(embeddings)
return result
@convert_inputs_to_list
def get_id_scores(  # Finds the scores that were not calculated when fusing results for hybrid retrieval
query_embeddings: List[
List[float]
],  # The embeddings of one user input query (possibly expanded into several queries)
content_embeddings: List[List[float]],
similarity_metric: str,
) -> List[
float
]:  # The highest score among the query embeddings for each content; length equals the number of contents.
"""
Calculate the highest similarity scores between query embeddings and content embeddings.
:param query_embeddings: A list of lists containing query embeddings.
:param content_embeddings: A list of lists containing content embeddings.
:param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
:return: A list of the highest similarity scores for each content embedding.
"""
metric_func_dict = {
"l2": lambda x, y: 1 - calculate_l2_distance(x, y),
"ip": calculate_inner_product,
"cosine": calculate_cosine_similarity,
}
metric_func = metric_func_dict[similarity_metric]
result = []
for content_embedding in content_embeddings:
scores = []
for query_embedding in query_embeddings:
scores.append(
metric_func(np.array(query_embedding), np.array(content_embedding))
)
result.append(max(scores))
return result
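
A small sketch of get_id_scores, which backs the hybrid-fusion path when scores for extra ids have to be back-filled (toy 2-dimensional embeddings; the import path is assumed from the module layout above):

from autorag.nodes.retrieval.vectordb import get_id_scores

query_embeddings = [[1.0, 0.0], [0.7, 0.7]]    # embeddings of one (expanded) query
content_embeddings = [[0.9, 0.1], [0.0, 1.0]]  # embeddings of two candidate passages
scores = get_id_scores(query_embeddings, content_embeddings, similarity_metric="cosine")
# One score per passage: the maximum similarity over all query embeddings.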