Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

@@ -0,0 +1,4 @@
from .bm25 import BM25
from .hybrid_cc import HybridCC
from .hybrid_rrf import HybridRRF
from .vectordb import VectorDB

@@ -0,0 +1,127 @@
import abc
import logging
import os
from typing import List, Union, Tuple
import pandas as pd
from autorag.schema import BaseModule
from autorag.support import get_support_modules
from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset
from autorag.utils.util import pop_params
logger = logging.getLogger("AutoRAG")
class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, *args, **kwargs):
logger.info(f"Initialize retrieval node - {self.__class__.__name__}")
self.resources_dir = os.path.join(project_dir, "resources")
data_dir = os.path.join(project_dir, "data")
# fetch data from corpus_data
self.corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
def __del__(self):
logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running retrieval node - {self.__class__.__name__} module...")
validate_qa_dataset(previous_result)
# find queries columns & type cast queries
assert (
"query" in previous_result.columns
), "previous_result must have query column."
if "queries" not in previous_result.columns:
previous_result["queries"] = previous_result["query"]
previous_result.loc[:, "queries"] = previous_result["queries"].apply(
cast_queries
)
queries = previous_result["queries"].tolist()
return queries
class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
def __init__(
self, project_dir: str, target_modules, target_module_params, *args, **kwargs
):
super().__init__(project_dir)
self.target_modules = list(
map(
lambda x, y: get_support_modules(x)(
**y,
project_dir=project_dir,
),
target_modules,
target_module_params,
)
)
self.target_module_params = target_module_params
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
result_dfs: List[pd.DataFrame] = list(
map(
lambda x, y: x.pure(
**y,
previous_result=previous_result,
),
self.target_modules,
self.target_module_params,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs)
)
scores = tuple(
map(
lambda df: df["retrieve_scores"].apply(list).tolist(),
result_dfs,
)
)
_pure_params = pop_params(self._pure, kwargs)
if "ids" in _pure_params or "scores" in _pure_params:
raise ValueError(
"With specifying ids or scores, you must use HybridRRF.run_evaluator instead."
)
ids, scores = self._pure(ids=ids, scores=scores, **_pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def cast_queries(queries: Union[str, List[str]]) -> List[str]:
if isinstance(queries, str):
return [queries]
elif isinstance(queries, List):
return queries
else:
raise ValueError(f"queries must be str or list, but got {type(queries)}")
def evenly_distribute_passages(
ids: List[List[str]], scores: List[List[float]], top_k: int
) -> Tuple[List[str], List[float]]:
assert len(ids) == len(scores), "ids and scores must have the same length."
query_cnt = len(ids)
avg_len = top_k // query_cnt
remainder = top_k % query_cnt
new_ids = []
new_scores = []
for i in range(query_cnt):
if i < remainder:
new_ids.extend(ids[i][: avg_len + 1])
new_scores.extend(scores[i][: avg_len + 1])
else:
new_ids.extend(ids[i][:avg_len])
new_scores.extend(scores[i][:avg_len])
return new_ids, new_scores
def get_bm25_pkl_name(bm25_tokenizer: str):
bm25_tokenizer = bm25_tokenizer.replace("/", "")
return f"bm25_{bm25_tokenizer}.pkl"

@@ -0,0 +1,365 @@
import asyncio
import os
import pickle
import re
from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional
import numpy as np
import pandas as pd
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords
from nltk import PorterStemmer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from autorag.nodes.retrieval.base import (
evenly_distribute_passages,
BaseRetrieval,
get_bm25_pkl_name,
)
from autorag.utils import validate_corpus_dataset, fetch_contents
from autorag.utils.util import (
get_event_loop,
normalize_string,
result_to_dataframe,
pop_params,
)
def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]:
try:
from kiwipiepy import Kiwi, Token
except ImportError:
raise ImportError(
"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
"Please install kiwipiepy by running 'pip install kiwipiepy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
texts = list(map(lambda x: x.strip().lower(), texts))
kiwi = Kiwi()
tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts)
return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list]
def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Kkma
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Kkma()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_ko_okt(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Okt
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Okt()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]:
def tokenize_remove_stopword(text: str, stemmer) -> List[str]:
text = text.lower()
words = list(simple_extract_keywords(text))
return [stemmer.stem(word) for word in words]
stemmer = PorterStemmer()
tokenized_list: List[List[str]] = list(
map(lambda x: tokenize_remove_stopword(x, stemmer), texts)
)
return tokenized_list
def tokenize_space(texts: List[str]) -> List[List[str]]:
def tokenize_space_text(text: str) -> List[str]:
text = normalize_string(text)
return re.split(r"\s+", text.strip())
return list(map(tokenize_space_text, texts))
def load_bm25_corpus(bm25_path: str) -> Dict:
if bm25_path is None:
return {}
with open(bm25_path, "rb") as f:
bm25_corpus = pickle.load(f)
return bm25_corpus
def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]:
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install SudachiPy to use 'sudachipy' tokenizer. "
"Please install SudachiPy by running 'pip install sudachipy'."
)
# Initialize SudachiPy with the default tokenizer
tokenizer_obj = dictionary.Dictionary(dict="core").create()
# Choose the tokenizer mode: NORMAL, SEARCH, A
mode = tokenizer.Tokenizer.SplitMode.A
# Tokenize the input texts
tokenized_list = []
for text in texts:
tokens = tokenizer_obj.tokenize(text, mode)
tokenized_list.append([token.surface() for token in tokens])
return tokenized_list
BM25_TOKENIZER = {
"porter_stemmer": tokenize_porter_stemmer,
"ko_kiwi": tokenize_ko_kiwi,
"space": tokenize_space,
"ko_kkma": tokenize_ko_kkma,
"ko_okt": tokenize_ko_okt,
"sudachipy": tokenize_ja_sudachipy,
}
class BM25(BaseRetrieval):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize BM25 module.
(Retrieval)
:param project_dir: The project directory path.
:param bm25_tokenizer: The tokenizer name that is used for BM25.
It supports 'porter_stemmer', 'space', 'ko_kiwi', 'ko_kkma', 'ko_okt', 'sudachipy',
or any huggingface tokenizer name usable with `AutoTokenizer`.
Default is 'porter_stemmer'.
:param kwargs: The optional arguments.
"""
super().__init__(project_dir)
# check if bm25_path and file exist
bm25_tokenizer = kwargs.get("bm25_tokenizer", None)
if bm25_tokenizer is None:
bm25_tokenizer = "porter_stemmer"
bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer))
assert (
bm25_path is not None
), "bm25_path must be specified for using bm25 retrieval."
assert os.path.exists(
bm25_path
), f"bm25_path {bm25_path} does not exist. Please ingest first."
self.bm25_corpus = load_bm25_corpus(bm25_path)
assert (
"tokens" in self.bm25_corpus and "passage_id" in self.bm25_corpus
), "bm25_corpus must contain tokens and passage_id. Please check that you ingested the bm25 corpus correctly."
self.tokenizer = select_bm25_tokenizer(bm25_tokenizer)
assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, (
f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. "
f"You need to ingest again. Delete bm25 pkl file and re-ingest it."
)
self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"])
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, *args, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
BM25 retrieval function.
You have to load a pickle file that is already ingested.
:param queries: 2-d list of query strings.
Each element of the list is the list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: A 2-d list of passage ids retrieved from BM25 and a 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
"""
if ids is not None:
score_result = list(
map(
lambda query_list, id_list: get_bm25_scores(
query_list,
id_list,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
),
queries,
ids,
)
)
return ids, score_result
# run async bm25_pure function
tasks = [
bm25_pure(
input_queries,
top_k,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
)
for input_queries in queries
]
loop = get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
async def bm25_pure(
queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict
) -> Tuple[List[str], List[float]]:
"""
Async BM25 retrieval function.
It is used to run BM25 retrieval asynchronously, row by row.
:param queries: A list of query strings.
:param top_k: The number of passages to be retrieved.
:param tokenizer: A tokenizer that will be used to tokenize queries.
:param bm25_api: A bm25 api instance that will be used to retrieve passages.
:param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus.
Its data structure looks like this:
.. Code:: python
{
"tokens": [], # 2d list of tokens
"passage_id": [], # 2d list of passage_id. Type must be str.
}
:return: The tuple contains a list of passage ids that retrieved from bm25 and its scores.
"""
# Query tokenization is kept synchronous because the number of queries per row is small, so making it async would only add overhead.
tokenized_queries = tokenize(queries, tokenizer)
id_result = []
score_result = []
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
sorted_scores = sorted(scores, reverse=True)
top_n_index = np.argsort(scores)[::-1][:top_k]
ids = [bm25_corpus["passage_id"][i] for i in top_n_index]
id_result.append(ids)
score_result.append(sorted_scores[:top_k])
# make a total result to top_k
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
def get_bm25_scores(
queries: List[str],
ids: List[str],
tokenizer,
bm25_api: BM25Okapi,
bm25_corpus: Dict,
) -> List[float]:
if len(ids) == 0 or not bool(ids):
return []
tokenized_queries = tokenize(queries, tokenizer)
result_dict = {id_: [] for id_ in ids}
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
for i, id_ in enumerate(ids):
result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)])
result_df = pd.DataFrame(result_dict)
return result_df.max(axis=0).tolist()
def tokenize(queries: List[str], tokenizer) -> List[List[Union[int, str]]]:
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_queries = tokenizer(queries).input_ids
else:
tokenized_queries = tokenizer(queries)
return tokenized_queries
def bm25_ingest(
corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer"
):
if not corpus_path.endswith(".pkl"):
raise ValueError(f"Corpus path {corpus_path} is not a pickle file.")
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Initialize bm25_corpus
bm25_corpus = pd.DataFrame()
# Load the BM25 corpus if it exists and get the passage ids
if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
with open(corpus_path, "rb") as r:
corpus = pickle.load(r)
bm25_corpus = pd.DataFrame.from_dict(corpus)
duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)]
new_passage = corpus_data[
~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"])
]
else:
new_passage = corpus_data
if not new_passage.empty:
tokenizer = select_bm25_tokenizer(bm25_tokenizer)
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids
else:
tokenized_corpus = tokenizer(new_passage["contents"].tolist())
new_bm25_corpus = pd.DataFrame(
{
"tokens": tokenized_corpus,
"passage_id": new_passage["doc_id"].tolist(),
}
)
if not bm25_corpus.empty:
bm25_corpus_updated = pd.concat(
[bm25_corpus, new_bm25_corpus], ignore_index=True
)
bm25_dict = bm25_corpus_updated.to_dict("list")
else:
bm25_dict = new_bm25_corpus.to_dict("list")
# add tokenizer name to bm25_dict
bm25_dict["tokenizer_name"] = bm25_tokenizer
with open(corpus_path, "wb") as w:
pickle.dump(bm25_dict, w)
def select_bm25_tokenizer(
bm25_tokenizer: str,
) -> Callable[[List[str]], List[List[Union[int, str]]]]:
if bm25_tokenizer in list(BM25_TOKENIZER.keys()):
return BM25_TOKENIZER[bm25_tokenizer]
return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False)
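
A small sketch of the core scoring step inside bm25_pure, using rank_bm25 directly with the space tokenizer defined above (toy corpus; the import path is assumed from the package layout implied by the imports in this commit):

from rank_bm25 import BM25Okapi
from autorag.nodes.retrieval.bm25 import tokenize_space

# Toy corpus tokenized the same way bm25_ingest would with the 'space' tokenizer.
corpus_tokens = tokenize_space(["the cat sat", "a dog barked", "cats and dogs"])
bm25 = BM25Okapi(corpus_tokens)

query_tokens = tokenize_space(["cat sat"])[0]
scores = bm25.get_scores(query_tokens)  # one score per corpus passage
# bm25_pure then argsorts these scores in descending order and keeps the top_k passage ids.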

@@ -0,0 +1,214 @@
import os
from pathlib import Path
from typing import Tuple, List, Union
import numpy as np
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
def normalize_mm(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
max_value = np.max(arr)
min_value = np.min(arr)
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
def normalize_tmm(scores: List[float], fixed_min_value: float):
arr = np.array(scores)
max_value = np.max(arr)
norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value)
return norm_score
def normalize_z(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
norm_score = (arr - mean_value) / std_value
return norm_score
def normalize_dbsf(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
min_value = mean_value - 3 * std_value
max_value = mean_value + 3 * std_value
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
normalize_method_dict = {
"mm": normalize_mm,
"tmm": normalize_tmm,
"z": normalize_z,
"dbsf": normalize_dbsf,
}
class HybridCC(HybridRetrieval):
def _pure(
self,
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
):
return hybrid_cc(
ids,
scores,
top_k,
weight,
normalize_method,
semantic_theoretical_min_value,
lexical_theoretical_min_value,
)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_cc, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __cc(**cc_params):
ids, scores = hybrid_cc(**cc_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __cc(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_cc(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid CC function.
CC (convex combination) is a method to fuse lexical and semantic retrieval results.
It is a method that first normalizes the scores of each retrieval result,
and then combines them with the given weights.
Unlike other retrieval modules, it does not actually execute retrieval;
it only fuses the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function,
and collect the ids and scores result from each retrieval module.
Make them a tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
The semantic retrieval ids must be the first index.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
The semantic retrieval scores must be the first index.
:param top_k: The number of passages to be retrieved.
:param normalize_method: The normalization method to use.
There are several normalization methods that you can use with the hybrid cc method.
AutoRAG supports the following:
- `mm`: Min-max scaling
- `tmm`: Theoretical min-max scaling
- `z`: z-score normalization
- `dbsf`: 3-sigma normalization
:param weight: The weight value. If the weight is 1.0, the weight for the
semantic module is 1.0 and the weight for the lexical module is 0.0.
:param semantic_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
theoretical minimum value yourself. Default is -1.
:param lexical_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
theoretical minimum value yourself. Default is 0.
:return: The tuple of ids and fused scores that are fused by CC.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight >= 0, "The weight must be greater than or equal to 0."
assert weight <= 1, "The weight must be less than or equal to 1."
df = pd.DataFrame(
{
"semantic_ids": ids[0],
"lexical_ids": ids[1],
"semantic_score": scores[0],
"lexical_score": scores[1],
}
)
def cc_pure_apply(row):
return fuse_per_query(
row["semantic_ids"],
row["lexical_ids"],
row["semantic_score"],
row["lexical_score"],
normalize_method=normalize_method,
weight=weight,
top_k=top_k,
semantic_theoretical_min_value=semantic_theoretical_min_value,
lexical_theoretical_min_value=lexical_theoretical_min_value,
)
# fixed weight
df[["cc_id", "cc_score"]] = df.apply(
lambda row: cc_pure_apply(row), axis=1, result_type="expand"
)
return df["cc_id"].tolist(), df["cc_score"].tolist()
def fuse_per_query(
semantic_ids: List[str],
lexical_ids: List[str],
semantic_scores: List[float],
lexical_scores: List[float],
normalize_method: str,
weight: float,
top_k: int,
semantic_theoretical_min_value: float,
lexical_theoretical_min_value: float,
):
normalize_func = normalize_method_dict[normalize_method]
norm_semantic_scores = normalize_func(
semantic_scores, semantic_theoretical_min_value
)
norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value)
ids = [semantic_ids, lexical_ids]
scores = [norm_semantic_scores, norm_lexical_scores]
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
df.columns = ["semantic", "lexical"]
df = df.fillna(0)
df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1)
df = df.sort_values(by="weighted_sum", ascending=False)
return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()

@@ -0,0 +1,128 @@
import os
from pathlib import Path
from typing import List, Tuple, Union
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
class HybridRRF(HybridRetrieval):
def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
return hybrid_rrf(ids, scores, top_k, weight, rrf_k)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_rrf, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __rrf(**rrf_params):
ids, scores = hybrid_rrf(**rrf_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __rrf(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_rrf(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: int = 60,
rrf_k: int = -1,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid RRF function.
RRF (Reciprocal Rank Fusion) is a method to fuse multiple retrieval results.
It is common to fuse dense retrieval and sparse retrieval results using RRF.
To use this function, you must input ids and scores as tuples.
Unlike other retrieval modules, it does not actually execute retrieval;
it only fuses the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function,
and collect the ids and scores result from each retrieval module.
Make them a tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
:param top_k: The number of passages to be retrieved.
:param weight: Hyperparameter for RRF.
It was originally rrf_k value.
Default is 60.
For more information, please visit our documentation.
:param rrf_k: (Deprecated) Hyperparameter for RRF.
It was originally the rrf_k value. It will be removed in a future version.
:return: The tuple of ids and fused scores that are fused by RRF.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight > 0, "weight (originally rrf_k) must be greater than 0."
if rrf_k != -1:
weight = int(rrf_k)
else:
weight = int(weight)
id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
score_df = pd.DataFrame(
{f"score_{i}": score_list for i, score_list in enumerate(scores)}
)
df = pd.concat([id_df, score_df], axis=1)
def rrf_pure_apply(row):
ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))
df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
return df["rrf_id"].tolist(), df["rrf_score"].tolist()
def rrf_pure(
ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
) -> Tuple[List[str], List[float]]:
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
rank_df = df.rank(ascending=False, method="min")
rank_df = rank_df.fillna(0)
rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
rank_df = rank_df.sort_values(by="rrf", ascending=False)
return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]
def rrf_calculate(row, rrf_k):
result = 0
for r in row:
if r == 0:
continue
result += 1 / (r + rrf_k)
return result
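
A matching sketch for hybrid_rrf (same assumption about the import path): only the ranks matter, and each passage's fused score is the sum of 1 / (rank + weight) over the rankings it appears in.

from autorag.nodes.retrieval.hybrid_rrf import hybrid_rrf

ids = (
    [["d1", "d2", "d3"]],  # semantic ranking, one query
    [["d2", "d1", "d4"]],  # lexical ranking, one query
)
scores = (
    [[0.9, 0.8, 0.1]],     # used only to derive ranks
    [[12.0, 7.0, 1.0]],
)
fused_ids, fused_scores = hybrid_rrf(ids, scores, top_k=3, weight=60)
# e.g. "d1" is rank 1 semantically and rank 2 lexically,
# so its RRF score is 1/(1 + 60) + 1/(2 + 60).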

@@ -0,0 +1,544 @@
import logging
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Tuple, Union
import numpy as np
import pandas as pd
from autorag.evaluation import evaluate_retrieval
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils.util import get_best_row, to_list, apply_recursive
logger = logging.getLogger("AutoRAG")
semantic_module_names = ["vectordb", "VectorDB"]
lexical_module_names = ["bm25", "BM25"]
hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"]
def run_retrieval_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among retrieval node results.
:param modules: Retrieval modules to run.
:param module_params: Retrieval module parameters.
:param previous_result: Previous result dataframe.
Could be query expansion's best result or qa data.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for retrieval node.
:return: The best result dataframe.
It contains previous result columns and retrieval node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
save_dir = os.path.join(node_line_dir, "retrieval") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]:
"""
Run input modules and parameters.
:param input_modules: Input modules
:param input_module_params: Input module parameters
:return: First, it returns a list of result dataframes.
Second, it returns a list of average execution times per row.
"""
result, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(input_modules, input_module_params),
)
)
average_times = list(map(lambda x: x / len(result[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
result,
)
)
return result, average_times
def save_and_summary(
input_modules,
input_module_params,
result_list,
execution_time_list,
filename_start: int,
):
"""
Save the result and make summary file
:param input_modules: Input modules
:param input_module_params: Input module parameters
:param result_list: Result list
:param execution_time_list: Execution times
:param filename_start: The first filename to use
:return: The summary dataframe of the saved results.
"""
# save results to folder
filepaths = list(
map(
lambda x: os.path.join(save_dir, f"{x}.parquet"),
range(filename_start, filename_start + len(input_modules)),
)
)
list(
map(
lambda x: x[0].to_parquet(x[1], index=False),
zip(result_list, filepaths),
)
) # execute save to parquet
filename_list = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filename_list,
"module_name": list(map(lambda module: module.__name__, input_modules)),
"module_params": input_module_params,
"execution_time": execution_time_list,
**{
metric: list(map(lambda result: result[metric].mean(), result_list))
for metric in strategies.get("metrics")
},
}
)
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return summary_df
def find_best(results, average_times, filenames):
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
return selected_result, selected_filename
filename_first = 0
# run semantic modules
logger.info("Running retrieval node - semantic retrieval module...")
if any([module.__name__ in semantic_module_names for module in modules]):
semantic_modules, semantic_module_params = zip(
*filter(
lambda x: x[0].__name__ in semantic_module_names,
zip(modules, module_params),
)
)
semantic_results, semantic_times = run(semantic_modules, semantic_module_params)
semantic_summary_df = save_and_summary(
semantic_modules,
semantic_module_params,
semantic_results,
semantic_times,
filename_first,
)
semantic_selected_result, semantic_selected_filename = find_best(
semantic_results, semantic_times, semantic_summary_df["filename"].tolist()
)
semantic_summary_df["is_best"] = (
semantic_summary_df["filename"] == semantic_selected_filename
)
filename_first += len(semantic_modules)
else:
(
semantic_selected_filename,
semantic_summary_df,
semantic_results,
semantic_times,
) = None, pd.DataFrame(), [], []
# run lexical modules
logger.info("Running retrieval node - lexical retrieval module...")
if any([module.__name__ in lexical_module_names for module in modules]):
lexical_modules, lexical_module_params = zip(
*filter(
lambda x: x[0].__name__ in lexical_module_names,
zip(modules, module_params),
)
)
lexical_results, lexical_times = run(lexical_modules, lexical_module_params)
lexical_summary_df = save_and_summary(
lexical_modules,
lexical_module_params,
lexical_results,
lexical_times,
filename_first,
)
lexical_selected_result, lexical_selected_filename = find_best(
lexical_results, lexical_times, lexical_summary_df["filename"].tolist()
)
lexical_summary_df["is_best"] = (
lexical_summary_df["filename"] == lexical_selected_filename
)
filename_first += len(lexical_modules)
else:
(
lexical_selected_filename,
lexical_summary_df,
lexical_results,
lexical_times,
) = None, pd.DataFrame(), [], []
logger.info("Running retrieval node - hybrid retrieval module...")
# Next, run hybrid retrieval
if any([module.__name__ in hybrid_module_names for module in modules]):
hybrid_modules, hybrid_module_params = zip(
*filter(
lambda x: x[0].__name__ in hybrid_module_names,
zip(modules, module_params),
)
)
if all(
["target_module_params" in x for x in hybrid_module_params]
): # for Runner.run
# If target_module_params are already given, run hybrid retrieval directly
hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
else: # for Evaluator
# get id and score
ids_scores = get_ids_and_scores(
save_dir,
[semantic_selected_filename, lexical_selected_filename],
semantic_summary_df,
lexical_summary_df,
previous_result,
)
hybrid_module_params = list(
map(lambda x: {**x, **ids_scores}, hybrid_module_params)
)
# optimize each modules
real_hybrid_times = [
get_hybrid_execution_times(semantic_summary_df, lexical_summary_df)
] * len(hybrid_module_params)
hybrid_times = real_hybrid_times.copy()
hybrid_results = []
for module, module_param in zip(hybrid_modules, hybrid_module_params):
module_result_df, module_best_weight = optimize_hybrid(
module,
module_param,
strategies,
metric_inputs,
project_dir,
previous_result,
)
module_param["weight"] = module_best_weight
hybrid_results.append(module_result_df)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
hybrid_summary_df["execution_time"] = hybrid_times
best_semantic_summary_row = semantic_summary_df.loc[
semantic_summary_df["is_best"]
].iloc[0]
best_lexical_summary_row = lexical_summary_df.loc[
lexical_summary_df["is_best"]
].iloc[0]
target_modules = (
best_semantic_summary_row["module_name"],
best_lexical_summary_row["module_name"],
)
target_module_params = (
best_semantic_summary_row["module_params"],
best_lexical_summary_row["module_params"],
)
hybrid_summary_df = edit_summary_df_params(
hybrid_summary_df, target_modules, target_module_params
)
else:
if any([module.__name__ in hybrid_module_names for module in modules]):
logger.warning(
"You must at least one semantic module and lexical module for hybrid evaluation."
"Passing hybrid module."
)
_, hybrid_summary_df, hybrid_results, hybrid_times = (
None,
pd.DataFrame(),
[],
[],
)
summary = pd.concat(
[semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True
)
results = semantic_results + lexical_results + hybrid_results
average_times = semantic_times + lexical_times + hybrid_times
filenames = summary["filename"].tolist()
# filter by strategies
selected_result, selected_filename = find_best(results, average_times, filenames)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add summary.csv 'is_best' column
summary["is_best"] = summary["filename"] == selected_filename
# save the result files
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return best_result
def evaluate_retrieval_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
"""
Evaluate retrieval node from retrieval node result dataframe.
:param result_df: The result dataframe from a retrieval node.
:param metric_inputs: List of metric input schema for AutoRAG.
:param metrics: Metric list from input strategies.
:return: Return result_df with metrics columns.
The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
"""
@evaluate_retrieval(
metric_inputs=metric_inputs,
metrics=metrics,
)
def evaluate_this_module(df: pd.DataFrame):
return (
df["retrieved_contents"].tolist(),
df["retrieved_ids"].tolist(),
df["retrieve_scores"].tolist(),
)
return evaluate_this_module(result_df)
def edit_summary_df_params(
summary_df: pd.DataFrame, target_modules, target_module_params
) -> pd.DataFrame:
def delete_ids_scores(x):
del x["ids"]
del x["scores"]
return x
summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
summary_df["new_params"] = [
{"target_modules": target_modules, "target_module_params": target_module_params}
] * len(summary_df)
summary_df["module_params"] = summary_df.apply(
lambda row: {**row["module_params"], **row["new_params"]}, axis=1
)
summary_df = summary_df.drop(columns=["new_params"])
return summary_df
def get_ids_and_scores(
node_dir: str,
filenames: List[str],
semantic_summary_df: pd.DataFrame,
lexical_summary_df: pd.DataFrame,
previous_result,
) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
project_dir = pathlib.PurePath(node_dir).parent.parent.parent
best_results_df = list(
map(
lambda filename: pd.read_parquet(
os.path.join(node_dir, filename), engine="pyarrow"
),
filenames,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
)
scores = tuple(
map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
)
# search non-duplicate ids
semantic_ids = deepcopy(ids[0])
lexical_ids = deepcopy(ids[1])
def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
"""
Get, for each query, the ids from compare_ids that do not appear in target_ids.
For example, to get the ids that are missing from semantic_ids, pass semantic_ids as target_ids.
"""
result_ids = []
assert len(target_ids) == len(compare_ids)
for target_id_list, compare_id_list in zip(target_ids, compare_ids):
query_duplicated = list(set(compare_id_list) - set(target_id_list))
duplicate_list = query_duplicated if len(query_duplicated) != 0 else []
result_ids.append(duplicate_list)
return result_ids
lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids)
semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids)
new_id_tuple = (
[a + b for a, b in zip(semantic_ids, semantic_target_ids)],
[a + b for a, b in zip(lexical_ids, lexical_target_ids)],
)
# search non-duplicate ids' scores
new_semantic_scores = get_scores_by_ids(
semantic_target_ids, semantic_summary_df, project_dir, previous_result
)
new_lexical_scores = get_scores_by_ids(
lexical_target_ids, lexical_summary_df, project_dir, previous_result
)
new_score_tuple = (
[a + b for a, b in zip(scores[0], new_semantic_scores)],
[a + b for a, b in zip(scores[1], new_lexical_scores)],
)
return {
"ids": new_id_tuple,
"scores": new_score_tuple,
}
def get_scores_by_ids(
ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result
) -> List[List[float]]:
module_name = get_best_row(module_summary_df)["module_name"]
module_params = get_best_row(module_summary_df)["module_params"]
module = get_support_modules(module_name)
result_df = module.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
ids=ids,
**module_params,
)
return to_list(result_df["retrieve_scores"].tolist())
def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]:
return list(set(list1).symmetric_difference(set(list2)))
def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float:
lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][
"execution_time"
]
semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][
"execution_time"
]
return lexical_execution_time + semantic_execution_time
def optimize_hybrid(
hybrid_module_func: Callable,
hybrid_module_param: Dict,
strategy: Dict,
input_metrics: List[MetricInput],
project_dir,
previous_result,
):
if (
hybrid_module_func.__name__ == "HybridRRF"
or hybrid_module_func.__name__ == "hybrid_rrf"
):
weight_range = hybrid_module_param.pop("weight_range", (4, 80))
test_weight_size = weight_range[1] - weight_range[0] + 1
elif (
hybrid_module_func.__name__ == "HybridCC"
or hybrid_module_func.__name__ == "hybrid_cc"
):
weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0))
test_weight_size = hybrid_module_param.pop("test_weight_size", 101)
else:
raise ValueError("You must input hybrid module function at hybrid_module_func.")
weight_candidates = np.linspace(
weight_range[0], weight_range[1], test_weight_size
).tolist()
result_list = []
for weight_value in weight_candidates:
result_df = hybrid_module_func.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
weight=weight_value,
**hybrid_module_param,
)
result_list.append(result_df)
# evaluate here
if strategy.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result_list = list(
map(
lambda x: evaluate_retrieval_node(
x,
input_metrics,
strategy.get("metrics"),
),
result_list,
)
)
# select best result
best_result_df, best_weight = select_best(
result_list,
strategy.get("metrics"),
metadatas=weight_candidates,
strategy_name=strategy.get("strategy", "normalize_mean"),
)
return best_result_df, best_weight
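
The weight grids that optimize_hybrid sweeps can be reproduced with a couple of lines (a sketch of the defaults used above). Each candidate is passed to the module's run_evaluator, scored with the configured metrics, and select_best keeps the best-performing weight.

import numpy as np

# HybridRRF: every integer rrf_k in the default (4, 80) range -> 77 candidates.
rrf_range = (4, 80)
rrf_candidates = np.linspace(rrf_range[0], rrf_range[1], rrf_range[1] - rrf_range[0] + 1).tolist()

# HybridCC: 101 evenly spaced weights between 0.0 and 1.0 (step 0.01).
cc_candidates = np.linspace(0.0, 1.0, 101).tolist()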

@@ -0,0 +1,303 @@
import itertools
import logging
import os
from typing import List, Tuple, Optional
import numpy as np
import pandas as pd
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from autorag.evaluation.metric.util import (
calculate_l2_distance,
calculate_inner_product,
calculate_cosine_similarity,
)
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
from autorag.utils import (
validate_corpus_dataset,
cast_corpus_dataset,
cast_qa_dataset,
validate_qa_dataset,
)
from autorag.utils.util import (
get_event_loop,
process_batch,
openai_truncate_by_token,
flatten_apply,
result_to_dataframe,
pop_params,
fetch_contents,
empty_cuda_cache,
convert_inputs_to_list,
make_batch,
)
from autorag.vectordb import load_vectordb_from_yaml
from autorag.vectordb.base import BaseVectorStore
logger = logging.getLogger("AutoRAG")
class VectorDB(BaseRetrieval):
def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
"""
Initialize VectorDB retrieval node.
:param project_dir: The project directory path.
:param vectordb: The vectordb name.
You must configure the vectordb name in the config.yaml file.
If you don't configure, it uses the default vectordb.
:param kwargs: The optional arguments.
Not affected in the init method.
"""
super().__init__(project_dir)
vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
self.vector_store = load_vectordb_from_yaml(
vectordb_config_path, vectordb, project_dir
)
self.embedding_model = self.vector_store.embedding
def __del__(self):
del self.vector_store
del self.embedding_model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
embedding_batch: int = 128,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
VectorDB retrieval function.
You need a vector store collection that is already ingested,
and the embedding model that was used for the ingestion.
:param queries: 2-d list of query strings.
Each element of the list is the list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param embedding_batch: The number of queries to be processed in parallel.
This is used to prevent API error at the query embedding.
Default is 128.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: A 2-d list of passage ids retrieved from the vectordb and a 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
"""
# if ids are specified, compute their scores directly from the vector store
if ids is not None:
return self.__get_ids_scores(queries, ids, embedding_batch)
# run async vector_db_pure function
tasks = [
vectordb_pure(query_list, top_k, self.vector_store)
for query_list in queries
]
loop = get_event_loop()
results = loop.run_until_complete(
process_batch(tasks, batch_size=embedding_batch)
)
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
def __get_ids_scores(self, queries, ids, embedding_batch: int):
# truncate queries and embedding execution here.
openai_embedding_limit = 8000
if isinstance(self.embedding_model, OpenAIEmbedding):
queries = list(
map(
lambda query_list: openai_truncate_by_token(
query_list,
openai_embedding_limit,
self.embedding_model.model_name,
),
queries,
)
)
query_embeddings = flatten_apply(
run_query_embedding_batch,
queries,
embedding_model=self.embedding_model,
batch_size=embedding_batch,
)
loop = get_event_loop()
async def run_fetch(ids):
final_result = []
for id_list in ids:
if len(id_list) == 0:
final_result.append([])
else:
result = await self.vector_store.fetch(id_list)
final_result.append(result)
return final_result
content_embeddings = loop.run_until_complete(run_fetch(ids))
score_result = list(
map(
lambda query_embedding_list, content_embedding_list: get_id_scores(
query_embedding_list,
content_embedding_list,
similarity_metric=self.vector_store.similarity_metric,
),
query_embeddings,
content_embeddings,
)
)
return ids, score_result
async def vectordb_pure(
queries: List[str], top_k: int, vectordb: BaseVectorStore
) -> Tuple[List[str], List[float]]:
"""
Async VectorDB retrieval function.
It is used to run vector store retrieval asynchronously, row by row.
:param queries: A list of query strings.
:param top_k: The number of passages to be retrieved.
:param vectordb: The vector store instance.
:return: The tuple contains a list of passage ids that are retrieved from vectordb and a list of its scores.
"""
id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)
# Distribute passages evenly
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
async def filter_exist_ids(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Query the collection to check if IDs already exist
existed_bool_list = await vectordb.is_exist(ids=ids)
# Keep only the passages whose ids do not already exist in the vector store
new_passage = corpus_data[~pd.Series(existed_bool_list)]
return new_passage
async def filter_exist_ids_from_retrieval_gt(
vectordb: BaseVectorStore,
qa_data: pd.DataFrame,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
qa_data = cast_qa_dataset(qa_data)
validate_qa_dataset(qa_data)
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
retrieval_gt = (
qa_data["retrieval_gt"]
.apply(lambda x: list(itertools.chain.from_iterable(x)))
.tolist()
)
retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
retrieval_gt = list(set(retrieval_gt))
existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
add_ids = []
for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
if not is_exist:
add_ids.append(ret_gt)
new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
return new_passage
async def vectordb_ingest(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
):
"""
Ingest given corpus data to the vectordb.
It truncates corpus content to 8,000 tokens when the embedding model is OpenAIEmbedding.
Corpus content that is empty (whitespace only) is ignored,
and document ids that already exist in the collection are ignored as well.
:param vectordb: A vector store instance that you want to ingest into.
:param corpus_data: The corpus data that contains doc_id and contents columns.
"""
embedding_batch = vectordb.embedding_batch
if not corpus_data.empty:
new_contents = corpus_data["contents"].tolist()
new_ids = corpus_data["doc_id"].tolist()
content_batches = make_batch(new_contents, embedding_batch)
id_batches = make_batch(new_ids, embedding_batch)
for content_batch, id_batch in zip(content_batches, id_batches):
await vectordb.add(ids=id_batch, texts=content_batch)
def run_query_embedding_batch(
queries: List[str], embedding_model: BaseEmbedding, batch_size: int
) -> List[List[float]]:
result = []
for i in range(0, len(queries), batch_size):
batch = queries[i : i + batch_size]
embeddings = embedding_model.get_text_embedding_batch(batch)
result.extend(embeddings)
return result
@convert_inputs_to_list
def get_id_scores(  # Finds the scores that were not calculated when fusing results for hybrid retrieval
query_embeddings: List[
List[float]
],  # The embeddings of one user input query (possibly expanded into several queries)
content_embeddings: List[List[float]],
similarity_metric: str,
) -> List[
float
]:  # The highest score among the query embeddings for each content; length equals the number of contents.
"""
Calculate the highest similarity scores between query embeddings and content embeddings.
:param query_embeddings: A list of lists containing query embeddings.
:param content_embeddings: A list of lists containing content embeddings.
:param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
:return: A list of the highest similarity scores for each content embedding.
"""
metric_func_dict = {
"l2": lambda x, y: 1 - calculate_l2_distance(x, y),
"ip": calculate_inner_product,
"cosine": calculate_cosine_similarity,
}
metric_func = metric_func_dict[similarity_metric]
result = []
for content_embedding in content_embeddings:
scores = []
for query_embedding in query_embeddings:
scores.append(
metric_func(np.array(query_embedding), np.array(content_embedding))
)
result.append(max(scores))
return result
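
A small sketch of get_id_scores, which backs the hybrid-fusion path when scores for extra ids have to be back-filled (toy 2-dimensional embeddings; the import path is assumed from the module layout above):

from autorag.nodes.retrieval.vectordb import get_id_scores

query_embeddings = [[1.0, 0.0], [0.7, 0.7]]    # embeddings of one (expanded) query
content_embeddings = [[0.9, 0.1], [0.0, 1.0]]  # embeddings of two candidate passages
scores = get_id_scores(query_embeddings, content_embeddings, similarity_metric="cosine")
# One score per passage: the maximum similarity over all query embeddings.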