Fix Dockerfile build issue

kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

View File

@@ -0,0 +1,18 @@
from .cohere import CohereReranker
from .colbert import ColbertReranker
from .flag_embedding import FlagEmbeddingReranker
from .flag_embedding_llm import FlagEmbeddingLLMReranker
from .jina import JinaReranker
from .koreranker import KoReranker
from .monot5 import MonoT5
from .pass_reranker import PassReranker
from .rankgpt import RankGPT
from .sentence_transformer import SentenceTransformerReranker
from .time_reranker import TimeReranker
from .upr import Upr
from .openvino import OpenVINOReranker
from .voyageai import VoyageAIReranker
from .mixedbreadai import MixedbreadAIReranker
from .flashrank import FlashRankReranker
from .dragonkue2 import DragonKue2  # Added 250313 - 김용연

View File

@@ -0,0 +1,55 @@
import abc
import logging
from pathlib import Path
from typing import Union
import pandas as pd
from autorag.schema import BaseModule
from autorag.utils import validate_qa_dataset
logger = logging.getLogger("AutoRAG")
class BasePassageReranker(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
logger.info(
f"Initialize passage reranker node - {self.__class__.__name__} module..."
)
def __del__(self):
logger.info(
f"Deleting passage reranker node - {self.__class__.__name__} module..."
)
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running passage reranker node - {self.__class__.__name__} module..."
)
validate_qa_dataset(previous_result)
# find the query column
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
# find the retrieved_contents column
assert (
"retrieved_contents" in previous_result.columns
), "previous_result must have retrieved_contents column."
contents = previous_result["retrieved_contents"].tolist()
# find the retrieve_scores column
assert (
"retrieve_scores" in previous_result.columns
), "previous_result must have retrieve_scores column."
scores = previous_result["retrieve_scores"].tolist()
# find the retrieved_ids column
assert (
"retrieved_ids" in previous_result.columns
), "previous_result must have retrieved_ids column."
ids = previous_result["retrieved_ids"].tolist()
return queries, contents, scores, ids
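For orientation, here is a minimal sketch (not part of this commit) of how a concrete module builds on BasePassageReranker: cast_to_run validates the previous result and unpacks the four per-query lists, so a subclass only has to transform them. ReverseReranker is a made-up name for illustration.

import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import result_to_dataframe

class ReverseReranker(BasePassageReranker):
    """Toy module: invert the initial ranking (illustration only)."""

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        top_k = kwargs.pop("top_k")
        # previous_result must carry query, retrieved_contents,
        # retrieve_scores, and retrieved_ids columns (checked above)
        _, contents, scores, ids = self.cast_to_run(previous_result)
        # reverse each per-query ranking, then truncate to top_k
        contents = [c[::-1][:top_k] for c in contents]
        ids = [i[::-1][:top_k] for i in ids]
        scores = [s[::-1][:top_k] for s in scores]
        return contents, ids, scores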

View File

@@ -0,0 +1,119 @@
import os
from typing import List, Tuple
import cohere
import pandas as pd
from cohere import RerankResponseResultsItem
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
class CohereReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize Cohere rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for Cohere rerank.
You can set it in the environment variable COHERE_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "COHERE_API_KEY".
:param kwargs: Extra arguments; not used by this module.
"""
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("COHERE_API_KEY", None) if api_key is None else api_key
if api_key is None:
api_key = os.getenv("CO_API_KEY", None)
if api_key is None:
raise KeyError(
"Please set the API key for Cohere rerank in the environment variable COHERE_API_KEY "
"or directly set it on the config YAML file."
)
self.cohere_client = cohere.AsyncClientV2(api_key=api_key)
def __del__(self):
del self.cohere_client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
model = kwargs.pop("model", "rerank-v3.5")
return self._pure(queries, contents, scores, ids, top_k, batch, model)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
model: str = "rerank-v3.5",
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Cohere rerank models.
You can get the API key from https://cohere.com/rerank and set it in the environment variable COHERE_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param scores_list: The list of lists of scores retrieved from the initial ranking
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:param model: The model name for Cohere rerank.
You can choose between "rerank-v3.5", "rerank-english-v3.0", and "rerank-multilingual-v3.0".
Default is "rerank-v3.5".
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
# Run async cohere_rerank_pure function
tasks = [
cohere_rerank_pure(self.cohere_client, model, query, document, ids, top_k)
for query, document, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
content_result = list(map(lambda x: x[0], results))
id_result = list(map(lambda x: x[1], results))
score_result = list(map(lambda x: x[2], results))
return content_result, id_result, score_result
async def cohere_rerank_pure(
cohere_client: cohere.AsyncClientV2,
model: str,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with Cohere rerank models.
:param cohere_client: The Cohere AsyncClientV2 to use for reranking
:param model: The model name for Cohere rerank
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
rerank_results = await cohere_client.rerank(
model=model,
query=query,
documents=documents,
top_n=top_k,
return_documents=False,
)
results: List[RerankResponseResultsItem] = rerank_results.results
reranked_scores: List[float] = list(map(lambda x: x.relevance_score, results))
indices = list(map(lambda x: x.index, results))
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores
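A hedged usage sketch of the standalone coroutine above (illustration only, not from the diff; assumes a valid Cohere API key, with the module path inferred from the package __init__):

import asyncio
import cohere

from autorag.nodes.passagereranker.cohere import cohere_rerank_pure

async def main():
    client = cohere.AsyncClientV2(api_key="...")  # replace with a real key
    contents, ids, scores = await cohere_rerank_pure(
        client,
        model="rerank-v3.5",
        query="what is autorag?",
        documents=["AutoRAG is a RAG optimization tool.", "Unrelated text."],
        ids=["doc-1", "doc-2"],
        top_k=1,
    )
    print(contents, ids, scores)

asyncio.run(main())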

View File

@@ -0,0 +1,213 @@
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class ColbertReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "colbert-ir/colbertv2.0",
*args,
**kwargs,
):
"""
Initialize a colbert rerank model for reranking.
:param project_dir: The project directory
:param model_name: The model name for Colbert rerank.
You can choose a colbert model for reranking.
The default is "colbert-ir/colbertv2.0".
:param kwargs: Extra parameter for the model.
"""
super().__init__(project_dir)
try:
import torch
from transformers import AutoModel, AutoTokenizer
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
model_params = pop_params(AutoModel.from_pretrained, kwargs)
self.model = AutoModel.from_pretrained(model_name, **model_params).to(
self.device
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Colbert rerank models.
You can get more information about a Colbert model at https://huggingface.co/colbert-ir/colbertv2.0.
It uses a BERT-based model, so a CUDA GPU is recommended for faster reranking.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
# get query and content embeddings
query_embedding_list = get_colbert_embedding_batch(
queries, self.model, self.tokenizer, batch
)
content_embedding_list = flatten_apply(
get_colbert_embedding_batch,
contents_list,
model=self.model,
tokenizer=self.tokenizer,
batch_size=batch,
)
df = pd.DataFrame(
{
"ids": ids_list,
"query_embedding": query_embedding_list,
"contents": contents_list,
"content_embedding": content_embedding_list,
}
)
temp_df = df.explode("content_embedding")
temp_df["score"] = temp_df.apply(
lambda x: get_colbert_score(x["query_embedding"], x["content_embedding"]),
axis=1,
)
df["scores"] = (
temp_df.groupby(level=0, sort=False)["score"].apply(list).tolist()
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def get_colbert_embedding_batch(
input_strings: List[str], model, tokenizer, batch_size: int
) -> List[np.ndarray]:
try:
import torch
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
encoding = tokenizer(
input_strings,
return_tensors="pt",
padding=True,
truncation=True,
max_length=model.config.max_position_embeddings,
)
input_batches = slice_tokenizer_result(encoding, batch_size)
result_embedding = []
with torch.no_grad():
for encoding_batch in input_batches:
result_embedding.append(model(**encoding_batch).last_hidden_state)
total_tensor = torch.cat(
result_embedding, dim=0
) # shape [batch_size, token_length, embedding_dim]
tensor_results = list(total_tensor.chunk(total_tensor.size()[0]))
if torch.cuda.is_available():
return list(map(lambda x: x.detach().cpu().numpy(), tensor_results))
else:
return list(map(lambda x: x.detach().numpy(), tensor_results))
def slice_tokenizer_result(tokenizer_output, batch_size):
input_ids_batches = slice_tensor(tokenizer_output["input_ids"], batch_size)
attention_mask_batches = slice_tensor(
tokenizer_output["attention_mask"], batch_size
)
token_type_ids_batches = slice_tensor(
tokenizer_output.get("token_type_ids", None), batch_size
)
return [
{
"input_ids": input_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
for input_ids, attention_mask, token_type_ids in zip(
input_ids_batches, attention_mask_batches, token_type_ids_batches
)
]
def slice_tensor(input_tensor, batch_size):
try:
import torch
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
# Calculate the number of full batches
num_full_batches = input_tensor.size(0) // batch_size
# Slice the tensor into batches
tensor_list = [
input_tensor[i * batch_size : (i + 1) * batch_size]
for i in range(num_full_batches)
]
# Handle the last batch if it's smaller than batch_size
remainder = input_tensor.size(0) % batch_size
if remainder:
tensor_list.append(input_tensor[-remainder:])
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor_list = list(map(lambda x: x.to(device), tensor_list))
return tensor_list
def get_colbert_score(query_embedding: np.ndarray, content_embedding: np.ndarray) -> float:
if query_embedding.ndim == 3 and content_embedding.ndim == 3:
query_embedding = query_embedding.reshape(-1, query_embedding.shape[-1])
content_embedding = content_embedding.reshape(-1, content_embedding.shape[-1])
sim_matrix = np.dot(query_embedding, content_embedding.T) / (
np.linalg.norm(query_embedding, axis=1)[:, np.newaxis]
* np.linalg.norm(content_embedding, axis=1)
)
max_sim_scores = np.max(sim_matrix, axis=1)
return float(np.mean(max_sim_scores))
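The MaxSim scoring above can be checked by hand; a small sketch (illustration only) with made-up 2-token, 2-dimension embeddings:

import numpy as np
from autorag.nodes.passagereranker.colbert import get_colbert_score

query_embedding = np.array([[[1.0, 0.0], [0.0, 1.0]]])    # (1 batch, 2 tokens, 2 dims)
content_embedding = np.array([[[1.0, 0.0], [0.7, 0.7]]])
# each query token keeps its best cosine match; the matches are then averaged
score = get_colbert_score(query_embedding, content_embedding)
print(round(score, 4))  # 0.8536 = mean(1.0, 0.7071)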

View File

@@ -0,0 +1,138 @@
# Added reranker module_type 250313 - 김용연
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class DragonKue2(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
except ImportError:
raise ImportError("For using dragonkue2, please install torch first.")
model_path = "dragonkue/bge-reranker-v2-m3-ko"
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
self.model.eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using dragonkue/bge-reranker-v2-m3-ko.
bge-reranker-v2-m3-ko is a Korean-language reranker (https://huggingface.co/dragonkue/bge-reranker-v2-m3-ko).
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
scores_nps = flatten_apply(
dragonkue2_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
)
rerank_scores = list(
map(
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
)
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def dragonkue2_run_model(input_texts, model, tokenizer, device, batch_size: int):  # Added 250313 - 김용연
try:
import torch
except ImportError:
raise ImportError("For using drangonku2, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
inputs = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
max_length=512,
)
inputs = inputs.to(device)
with torch.no_grad():
scores = (
model(**inputs, return_dict=True)
.logits.view(
-1,
)
.float()
)
scores_np = scores.cpu().numpy()
results.extend(scores_np)
return results
def exp_normalize(x):
b = x.max()
y = np.exp(x - b)
return y / y.sum()
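exp_normalize is a numerically stable softmax: subtracting the max logit before exponentiating avoids overflow without changing the result. A quick standalone check (illustration only):

import numpy as np

def exp_normalize(x):  # same helper as above
    b = x.max()
    y = np.exp(x - b)
    return y / y.sum()

logits = np.array([1000.0, 999.0])  # naive np.exp(logits) would overflow
print(exp_normalize(logits))        # [0.7311, 0.2689] (rounded), identical to softmax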

View File

@@ -0,0 +1,112 @@
from typing import List, Tuple, Iterable
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class FlagEmbeddingReranker(BasePassageReranker):
def __init__(
self, project_dir, model_name: str = "BAAI/bge-reranker-large", *args, **kwargs
):
"""
Initialize the FlagEmbeddingReranker module.
:param project_dir: The project directory.
:param model_name: The name of the BAAI Reranker normal-model name.
Default is "BAAI/bge-reranker-large"
:param kwargs: Extra parameter for FlagEmbedding.FlagReranker
"""
super().__init__(project_dir)
try:
from FlagEmbedding import FlagReranker
except ImportError:
raise ImportError(
"FlagEmbeddingReranker requires the 'FlagEmbedding' package to be installed."
)
model_params = pop_params(FlagReranker.__init__, kwargs)
model_params.pop("model_name_or_path", None)
self.model = FlagReranker(model_name_or_path=model_name, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using BAAI normal-Reranker model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def flag_embedding_run_model(input_texts, model, batch_size: int):
try:
import torch
except ImportError:
raise ImportError("FlagEmbeddingReranker requires PyTorch to be installed.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
with torch.no_grad():
pred_scores = model.compute_score(sentence_pairs=batch_texts)
if batch_size == 1 or not isinstance(pred_scores, Iterable):
results.append(pred_scores)
else:
results.extend(pred_scores)
return results
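A hedged sketch of calling the module-level scorer directly (illustration only; assumes the FlagEmbedding package is installed and that the model downloads on first use):

from FlagEmbedding import FlagReranker

from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model

model = FlagReranker("BAAI/bge-reranker-large", use_fp16=True)
pairs = [
    ["what is autorag?", "AutoRAG is a RAG optimization tool."],
    ["what is autorag?", "Unrelated text."],
]
scores = flag_embedding_run_model(pairs, model=model, batch_size=2)
print(scores)  # higher score = more relevant pair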

View File

@@ -0,0 +1,101 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class FlagEmbeddingLLMReranker(BasePassageReranker):
def __init__(
self,
project_dir,
model_name: str = "BAAI/bge-reranker-v2-gemma",
*args,
**kwargs,
):
"""
Initialize the FlagEmbeddingLLMReranker module.
:param project_dir: The project directory.
:param model_name: The name of the BAAI Reranker LLM-based-model name.
Default is "BAAI/bge-reranker-v2-gemma"
:param kwargs: Extra parameters for FlagEmbedding.FlagLLMReranker
"""
super().__init__(project_dir)
try:
from FlagEmbedding import FlagLLMReranker
except ImportError:
raise ImportError(
"FlagEmbeddingLLMReranker requires the 'FlagEmbedding' package to be installed."
)
model_params = pop_params(FlagLLMReranker.__init__, kwargs)
model_params.pop("model_name_or_path", None)
self.model = FlagLLMReranker(model_name_or_path=model_name, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using BAAI LLM-based-Reranker model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)

View File

@@ -0,0 +1,245 @@
import json
from pathlib import Path
import pandas as pd
import numpy as np
import os
import zipfile
import requests
from tqdm import tqdm
import collections
from typing import List, Dict, Tuple
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
make_batch,
empty_cuda_cache,
)
model_url = "https://huggingface.co/prithivida/flashrank/resolve/main/{}.zip"
model_file_map = {
"ms-marco-TinyBERT-L-2-v2": "flashrank-TinyBERT-L-2-v2.onnx",
"ms-marco-MiniLM-L-12-v2": "flashrank-MiniLM-L-12-v2_Q.onnx",
"ms-marco-MultiBERT-L-12": "flashrank-MultiBERT-L12_Q.onnx",
"rank-T5-flan": "flashrank-rankt5_Q.onnx",
"ce-esci-MiniLM-L12-v2": "flashrank-ce-esci-MiniLM-L12-v2_Q.onnx",
"miniReranker_arabic_v1": "miniReranker_arabic_v1.onnx",
}
class FlashRankReranker(BasePassageReranker):
def __init__(
self, project_dir: str, model: str = "ms-marco-TinyBERT-L-2-v2", *args, **kwargs
):
"""
Initialize FlashRank rerank node.
:param project_dir: The project directory path.
:param model: The model name for FlashRank rerank.
You can get the list of available models from https://github.com/PrithivirajDamodaran/FlashRank.
Default is "ms-marco-TinyBERT-L-2-v2".
"rank_zephyr_7b_v1_full" is not supported due to a parallel inference issue.
:param kwargs: Extra arguments; not used by this module.
"""
super().__init__(project_dir)
try:
from tokenizers import Tokenizer
except ImportError:
raise ImportError(
"Tokenizer is not installed. Please install tokenizers to use FlashRank reranker."
)
cache_dir = kwargs.pop("cache_dir", "/tmp")
max_length = kwargs.pop("max_length", 512)
self.cache_dir: Path = Path(cache_dir)
self.model_dir: Path = self.cache_dir / model
self._prepare_model_dir(model)
model_file = model_file_map[model]
try:
import onnxruntime as ort
except ImportError:
raise ImportError(
"onnxruntime is not installed. Please install onnxruntime to use FlashRank reranker."
)
self.session = ort.InferenceSession(str(self.model_dir / model_file))
self.tokenizer: Tokenizer = self._get_tokenizer(max_length)
def __del__(self):
del self.session
del self.tokenizer
empty_cuda_cache()
super().__del__()
def _prepare_model_dir(self, model_name: str):
if not self.cache_dir.exists():
self.cache_dir.mkdir(parents=True, exist_ok=True)
if not self.model_dir.exists():
self._download_model_files(model_name)
def _download_model_files(self, model_name: str):
local_zip_file = self.cache_dir / f"{model_name}.zip"
formatted_model_url = model_url.format(model_name)
with requests.get(formatted_model_url, stream=True) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with (
open(local_zip_file, "wb") as f,
tqdm(
desc=local_zip_file.name,
total=total_size,
unit="iB",
unit_scale=True,
unit_divisor=1024,
) as bar,
):
for chunk in r.iter_content(chunk_size=8192):
size = f.write(chunk)
bar.update(size)
with zipfile.ZipFile(local_zip_file, "r") as zip_ref:
zip_ref.extractall(self.cache_dir)
os.remove(local_zip_file)
def _get_tokenizer(self, max_length: int = 512):
try:
from tokenizers import AddedToken, Tokenizer
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use FlashRank reranker."
)
config = json.load(open(str(self.model_dir / "config.json")))
tokenizer_config = json.load(
open(str(self.model_dir / "tokenizer_config.json"))
)
tokens_map = json.load(open(str(self.model_dir / "special_tokens_map.json")))
tokenizer = Tokenizer.from_file(str(self.model_dir / "tokenizer.json"))
tokenizer.enable_truncation(
max_length=min(tokenizer_config["model_max_length"], max_length)
)
tokenizer.enable_padding(
pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"]
)
for token in tokens_map.values():
if isinstance(token, str):
tokenizer.add_special_tokens([token])
elif isinstance(token, dict):
tokenizer.add_special_tokens([AddedToken(**token)])
vocab_file = self.model_dir / "vocab.txt"
if vocab_file.exists():
tokenizer.vocab = self._load_vocab(vocab_file)
tokenizer.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in tokenizer.vocab.items()]
)
return tokenizer
def _load_vocab(self, vocab_file: Path) -> Dict[str, int]:
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with FlashRank rerank models.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flashrank_run_model,
nested_list,
session=self.session,
batch_size=batch,
tokenizer=self.tokenizer,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def flashrank_run_model(input_texts, tokenizer, session, batch_size: int):
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in tqdm(batch_input_texts):
input_text = tokenizer.encode_batch(batch_texts)
input_ids = np.array([e.ids for e in input_text])
token_type_ids = np.array([e.type_ids for e in input_text])
attention_mask = np.array([e.attention_mask for e in input_text])
use_token_type_ids = token_type_ids is not None and not np.all(
token_type_ids == 0
)
onnx_input = {
"input_ids": input_ids.astype(np.int64),
"attention_mask": attention_mask.astype(np.int64),
}
if use_token_type_ids:
onnx_input["token_type_ids"] = token_type_ids.astype(np.int64)
outputs = session.run(None, onnx_input)
logits = outputs[0]
if logits.shape[1] == 1:
scores = 1 / (1 + np.exp(-logits.flatten()))
else:
exp_logits = np.exp(logits)
scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)
results.extend(scores)
return results
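flashrank_run_model handles both single-logit and two-logit ONNX heads; the two branches reduce to a sigmoid and a binary softmax. A small numpy check (illustration only):

import numpy as np

single = np.array([[2.0], [-1.0]])                    # one logit per pair -> sigmoid
print(1 / (1 + np.exp(-single.flatten())))            # [0.8808, 0.2689] (rounded)

double = np.array([[0.0, 2.0], [1.0, -1.0]])          # two logits -> P(relevant) via softmax
exp_logits = np.exp(double)
print(exp_logits[:, 1] / np.sum(exp_logits, axis=1))  # [0.8808, 0.1192] (rounded)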

View File

@@ -0,0 +1,115 @@
import os
from typing import List, Tuple
import aiohttp
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
JINA_API_URL = "https://api.jina.ai/v1/rerank"
class JinaReranker(BasePassageReranker):
def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
"""
Initialize Jina rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for Jina rerank.
You can set it in the environment variable JINAAI_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "JINAAI_API_KEY".
:param kwargs: Extra arguments; not used by this module.
"""
super().__init__(project_dir)
if api_key is None:
api_key = os.getenv("JINAAI_API_KEY", None)
if api_key is None:
raise ValueError(
"API key is not provided."
"You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
)
self.session = aiohttp.ClientSession(loop=get_event_loop())
self.session.headers.update(
{"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
)
def __del__(self):
self.session.close()
del self.session
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "jina-reranker-v1-base-en")
return self._pure(queries, contents, ids, top_k, model, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "jina-reranker-v1-base-en",
batch: int = 8,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Jina rerank models.
You can get the API key from https://jina.ai/reranker and set it in the environment variable JINAAI_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for Jina rerank.
You can choose between "jina-reranker-v1-base-en" and "jina-colbert-v1-en".
Default is "jina-reranker-v1-base-en".
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
jina_reranker_pure(
self.session, query, contents, ids, top_k=top_k, model=model
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def jina_reranker_pure(
session,
query: str,
contents: List[str],
ids: List[str],
top_k: int,
model: str = "jina-reranker-v1-base-en",
) -> Tuple[List[str], List[str], List[float]]:
async with session.post(
JINA_API_URL,
json={
"query": query,
"documents": contents,
"model": model,
"top_n": top_k,
},
) as resp:
resp_json = await resp.json()
if "results" not in resp_json:
raise RuntimeError(f"Invalid response from Jina API: {resp_json['detail']}")
results = resp_json["results"]
indices = list(map(lambda x: x["index"], results))
score_result = list(map(lambda x: x["relevance_score"], results))
id_result = list(map(lambda x: ids[x], indices))
content_result = list(map(lambda x: contents[x], indices))
return content_result, id_result, score_result

View File

@@ -0,0 +1,136 @@
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class KoReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
except ImportError:
raise ImportError("For using KoReranker, please install torch first.")
model_path = "Dongjin-kr/ko-reranker"
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
self.model.eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using ko-reranker.
ko-reranker is a Korean-language reranker (https://huggingface.co/Dongjin-kr/ko-reranker).
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
scores_nps = flatten_apply(
koreranker_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
)
rerank_scores = list(
map(
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
)
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def koreranker_run_model(input_texts, model, tokenizer, device, batch_size: int):
try:
import torch
except ImportError:
raise ImportError("For using KoReranker, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
inputs = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
max_length=512,
)
inputs = inputs.to(device)
with torch.no_grad():
scores = (
model(**inputs, return_dict=True)
.logits.view(
-1,
)
.float()
)
scores_np = scores.cpu().numpy()
results.extend(scores_np)
return results
def exp_normalize(x):
b = x.max()
y = np.exp(x - b)
return y / y.sum()

View File

@@ -0,0 +1,126 @@
import os
from typing import List, Tuple
import pandas as pd
from mixedbread_ai.client import AsyncMixedbreadAI
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
result_to_dataframe,
get_event_loop,
process_batch,
pop_params,
)
class MixedbreadAIReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
*args,
**kwargs,
):
"""
Initialize mixedbread-ai rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for MixedbreadAI rerank.
You can set it in the environment variable MXBAI_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "MXBAI_API_KEY".
:param kwargs: Extra arguments; not used by this module.
"""
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("MXBAI_API_KEY", None) if api_key is None else api_key
if api_key is None:
raise KeyError(
"Please set the API key for Mixedbread AI rerank in the environment variable MXBAI_API_KEY "
"or directly set it on the config YAML file."
)
self.client = AsyncMixedbreadAI(api_key=api_key)
def __del__(self):
del self.client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "mixedbread-ai/mxbai-rerank-large-v1")
rerank_params = pop_params(self.client.reranking, kwargs)
return self._pure(queries, contents, ids, top_k, model, batch, **rerank_params)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
batch: int = 8,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with mixedbread-ai rerank models.
You can get the API key from https://www.mixedbread.ai/api-reference#quick-start-guide and set it in the environment variable MXBAI_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for mixedbread-ai rerank.
You can choose between "mixedbread-ai/mxbai-rerank-large-v1", "mixedbread-ai/mxbai-rerank-base-v1" and "mixedbread-ai/mxbai-rerank-xsmall-v1".
Default is "mixedbread-ai/mxbai-rerank-large-v1".
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
mixedbreadai_rerank_pure(
self.client, query, contents, ids, top_k=top_k, model=model
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def mixedbreadai_rerank_pure(
client: AsyncMixedbreadAI,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with mixedbread-ai rerank models.
:param client: The mixedbread-ai client to use for reranking
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:param model: The model name for mixedbread-ai rerank.
You can choose between "mixedbread-ai/mxbai-rerank-large-v1" and "mixedbread-ai/mxbai-rerank-base-v1".
Default is "mixedbread-ai/mxbai-rerank-large-v1".
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
results = await client.reranking(
query=query,
input=documents,
top_k=top_k,
model=model,
)
reranked_scores: List[float] = list(map(lambda x: x.score, results.data))
reranked_scores_float = list(map(float, reranked_scores))
indices = list(map(lambda x: x.index, results.data))
reranked_contents = list(map(lambda x: documents[x], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores_float

View File

@@ -0,0 +1,190 @@
from itertools import chain
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
pop_params,
empty_cuda_cache,
)
prediction_tokens = {
"castorini/monot5-base-msmarco": ["▁false", "▁true"],
"castorini/monot5-base-msmarco-10k": ["▁false", "▁true"],
"castorini/monot5-large-msmarco": ["▁false", "▁true"],
"castorini/monot5-large-msmarco-10k": ["▁false", "▁true"],
"castorini/monot5-base-med-msmarco": ["▁false", "▁true"],
"castorini/monot5-3b-med-msmarco": ["▁false", "▁true"],
"castorini/monot5-3b-msmarco-10k": ["▁false", "▁true"],
"unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"],
"unicamp-dl/ptt5-base-pt-msmarco-10k-v2": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-pt-msmarco-100k-v2": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2": ["▁não", "▁sim"],
"unicamp-dl/mt5-base-en-pt-msmarco-v2": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-en-pt-msmarco-v1": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"],
"unicamp-dl/ptt5-base-pt-msmarco-10k-v1": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-pt-msmarco-100k-v1": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1": ["▁não", "▁sim"],
"unicamp-dl/mt5-3B-mmarco-en-pt": ["", "▁true"],
"unicamp-dl/mt5-13b-mmarco-100k": ["", "▁true"],
}
class MonoT5(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "castorini/monot5-3b-msmarco-10k",
*args,
**kwargs,
):
"""
Initialize the MonoT5 reranker.
:param project_dir: The project directory
:param model_name: The name of the MonoT5 model to use for reranking
Note: the default model name is 'castorini/monot5-3b-msmarco-10k'.
A '/' in the model name would break the path of the result file,
so pass the name with '_' in place of '/'; it is converted back internally.
:param kwargs: The extra arguments for the MonoT5 reranker
"""
super().__init__(project_dir)
try:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
except ImportError:
raise ImportError("For using MonoT5 Reranker, please install torch first.")
# replace '_' with '/'
if "_" in model_name:
model_name = model_name.replace("_", "/")
# Load the tokenizer and model from the pre-trained MonoT5 model
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
model_params = pop_params(T5ForConditionalGeneration.from_pretrained, kwargs)
self.model = T5ForConditionalGeneration.from_pretrained(
model_name, **model_params
).eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model.to(self.device)
token_false, token_true = prediction_tokens[model_name]
self.token_false_id = self.tokenizer.convert_tokens_to_ids(token_false)
self.token_true_id = self.tokenizer.convert_tokens_to_ids(token_true)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 3)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using MonoT5.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
# Build a MonoT5 input prompt from each (query, document) pair
nested_list = [
list(map(lambda x: [f"Query: {query} Document: {x}"], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
monot5_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
token_false_id=self.token_false_id,
token_true_id=self.token_true_id,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def monot5_run_model(
input_texts,
model,
batch_size: int,
tokenizer,
device,
token_false_id,
token_true_id,
):
try:
import torch
except ImportError:
raise ImportError("For using MonoT5 Reranker, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
flattened_batch_texts = list(chain.from_iterable(batch_texts))
input_encodings = tokenizer(
flattened_batch_texts,
padding=True,
truncation=True,
max_length=512,
return_tensors="pt",
).to(device)
with torch.no_grad():
outputs = model.generate(
input_ids=input_encodings["input_ids"],
attention_mask=input_encodings["attention_mask"],
output_scores=True,
return_dict_in_generate=True,
)
# Extract logits for the 'false' and 'true' tokens from the model's output
logits = outputs.scores[-1][:, [token_false_id, token_true_id]]
# Calculate the softmax probability of the 'true' token
probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
results.extend(probs.tolist())
return results
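MonoT5 scores each pair from the logits of its 'true' and 'false' tokens at the single generated step; the softmax over just those two logits is the relevance probability. A sketch of that final step with made-up logits (illustration only):

import torch

# pretend logits for [token_false_id, token_true_id], one row per input pair
logits = torch.tensor([[1.0, 3.0], [2.0, 0.5]])
probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
print([round(p, 4) for p in probs.tolist()])  # [0.8808, 0.1824]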

View File

@@ -0,0 +1,191 @@
from pathlib import Path
from typing import Any, List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
pop_params,
empty_cuda_cache,
)
class OpenVINOReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model: str = "BAAI/bge-reranker-large",
*args,
**kwargs,
):
super().__init__(project_dir)
try:
from huggingface_hub import HfApi
from transformers import AutoTokenizer
except ImportError as e:
raise ValueError(
"Could not import huggingface_hub python package. "
"Please install it with: "
"`pip install -U huggingface_hub`."
) from e
def require_model_export(
model_id: str, revision: Any = None, subfolder: Any = None
) -> bool:
model_dir = Path(model_id)
if subfolder is not None:
model_dir = model_dir / subfolder
if model_dir.is_dir():
return (
not (model_dir / "openvino_model.xml").exists()
or not (model_dir / "openvino_model.bin").exists()
)
hf_api = HfApi()
try:
model_info = hf_api.model_info(model_id, revision=revision or "main")
normalized_subfolder = (
None if subfolder is None else Path(subfolder).as_posix()
)
model_files = [
file.rfilename
for file in model_info.siblings
if normalized_subfolder is None
or file.rfilename.startswith(normalized_subfolder)
]
ov_model_path = (
"openvino_model.xml"
if subfolder is None
else f"{normalized_subfolder}/openvino_model.xml"
)
return (
ov_model_path not in model_files
or ov_model_path.replace(".xml", ".bin") not in model_files
)
except Exception:
return True
try:
from optimum.intel.openvino import OVModelForSequenceClassification
except ImportError:
raise ImportError(
"Please install optimum package to use OpenVINOReranker"
"pip install 'optimum[openvino,nncf]'"
)
model_kwargs = pop_params(
OVModelForSequenceClassification.from_pretrained, kwargs
)
if require_model_export(model):
# use remote model
self.model = OVModelForSequenceClassification.from_pretrained(
model, export=True, **model_kwargs
)
else:
# use local model
self.model = OVModelForSequenceClassification.from_pretrained(
model, **model_kwargs
)
self.tokenizer = AutoTokenizer.from_pretrained(model)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 3)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using an OpenVINO-accelerated cross-encoder.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
# Build (query, content) pairs for the cross-encoder input
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
openvino_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def openvino_run_model(
input_texts,
model,
batch_size: int,
tokenizer,
):
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
input_tensors = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
)
outputs = model(**input_tensors, return_dict=True)
if outputs[0].shape[1] > 1:
scores = outputs[0][:, 1]
else:
scores = outputs[0].flatten()
scores = list(map(float, (1 / (1 + np.exp(-np.array(scores))))))
results.extend(scores)
return results

View File

@@ -0,0 +1,31 @@
from typing import List
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
class PassReranker(BasePassageReranker):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
top_k = kwargs.pop("top_k")
_, contents_list, scores_list, ids_list = self.cast_to_run(previous_result)
return self._pure(contents_list, scores_list, ids_list, top_k)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
):
"""
Do not perform reranking.
Return the given top-k passages as is.
"""
contents_list = list(map(lambda x: x[:top_k], contents_list))
scores_list = list(map(lambda x: x[:top_k], scores_list))
ids_list = list(map(lambda x: x[:top_k], ids_list))
return contents_list, ids_list, scores_list
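PassReranker is the no-op baseline: it keeps the initial order and only truncates each per-query list to top_k. A tiny worked example (illustration only):

contents_list = [["a", "b", "c"]]
scores_list = [[0.9, 0.5, 0.1]]
ids_list = [["id-a", "id-b", "id-c"]]
top_k = 2
print([x[:top_k] for x in contents_list])  # [['a', 'b']]
print([x[:top_k] for x in scores_list])    # [[0.9, 0.5]]
print([x[:top_k] for x in ids_list])       # [['id-a', 'id-b']]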

View File

@@ -0,0 +1,170 @@
from typing import List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
from llama_index.core.base.llms.types import ChatMessage, ChatResponse
from llama_index.core.llms import LLM
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
from llama_index.core.utils import print_text
from llama_index.llms.openai import OpenAI
from autorag import generator_models
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
get_event_loop,
process_batch,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class RankGPT(BasePassageReranker):
def __init__(
self, project_dir: str, llm: Optional[Union[str, LLM]] = None, **kwargs
):
"""
Initialize the RankGPT reranker.
:param project_dir: The project directory
:param llm: The LLM model to use for RankGPT rerank.
It is a llama index model.
Default is the OpenAI model with gpt-4o-mini.
:param kwargs: The keyword arguments for the LLM model.
"""
super().__init__(project_dir)
if llm is None:
self.llm = OpenAI(model="gpt-4o-mini")
else:
if not isinstance(llm, LLM):
llm_class = generator_models[llm]
llm_param = pop_params(llm_class.__init__, kwargs)
self.llm = llm_class(**llm_param)
else:
self.llm = llm
def __del__(self):
del self.llm
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 1)
verbose = kwargs.get("verbose", False)
rankgpt_rerank_prompt = kwargs.get("rankgpt_rerank_prompt", None)
batch = kwargs.get("batch", 16)
return self._pure(
queries=queries,
contents_list=contents,
scores_list=scores,
ids_list=ids,
top_k=top_k,
verbose=verbose,
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
batch=batch,
)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
verbose: bool = False,
rankgpt_rerank_prompt: Optional[str] = None,
batch: int = 16,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank given context paragraphs using RankGPT.
Return pseudo scores, since the actual scores are not available on RankGPT.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param scores_list: The list of lists of scores retrieved from the initial ranking
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param verbose: Whether to print intermediate steps.
:param rankgpt_rerank_prompt: The prompt template for RankGPT rerank.
Default is RankGPT's default prompt.
:param batch: The number of queries to be processed in a batch.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
query_bundles = list(map(lambda query: QueryBundle(query_str=query), queries))
nodes_list = [
list(
map(
lambda x: NodeWithScore(node=TextNode(text=x[0]), score=x[1]),
zip(content_list, score_list),
)
)
for content_list, score_list in zip(contents_list, scores_list)
]
reranker = AsyncRankGPTRerank(
top_n=top_k,
llm=self.llm,
verbose=verbose,
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
)
tasks = [
reranker.async_postprocess_nodes(nodes, query, ids)
for nodes, query, ids in zip(nodes_list, query_bundles, ids_list)
]
loop = get_event_loop()
rerank_result = loop.run_until_complete(process_batch(tasks, batch_size=batch))
content_result = [
list(map(lambda x: x.node.text, res[0])) for res in rerank_result
]
score_result = [
np.linspace(1.0, 0.0, len(res[0])).tolist() for res in rerank_result
]
id_result = [res[1] for res in rerank_result]
del reranker
return content_result, id_result, score_result
class AsyncRankGPTRerank(RankGPTRerank):
async def async_run_llm(self, messages: Sequence[ChatMessage]) -> ChatResponse:
return await self.llm.achat(messages)
async def async_postprocess_nodes(
self,
nodes: List[NodeWithScore],
query_bundle: QueryBundle,
ids: Optional[List[str]] = None,
) -> Tuple[List[NodeWithScore], List[str]]:
if ids is None:
ids = [str(i) for i in range(len(nodes))]
items = {
"query": query_bundle.query_str,
"hits": [{"content": node.get_content()} for node in nodes],
}
messages = self.create_permutation_instruction(item=items)
permutation = await self.async_run_llm(messages=messages)
if permutation.message is not None and permutation.message.content is not None:
rerank_ranks = self._receive_permutation(
items, str(permutation.message.content)
)
if self.verbose:
print_text(f"After Reranking, new rank list for nodes: {rerank_ranks}")
initial_results: List[NodeWithScore] = []
id_results = []
for idx in rerank_ranks:
initial_results.append(
NodeWithScore(node=nodes[idx].node, score=nodes[idx].score)
)
id_results.append(ids[idx])
return initial_results[: self.top_n], id_results[: self.top_n]
else:
return nodes[: self.top_n], ids[: self.top_n]
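
A minimal usage sketch for the RankGPT module above, assuming an existing AutoRAG project directory and a retrieval-node result DataFrame; the path and variable names are illustrative, not taken from a real project:

reranker = RankGPT(project_dir="./my_project")  # defaults to OpenAI gpt-4o-mini
# previous_result must carry 'query', 'retrieved_contents',
# 'retrieved_ids', and 'retrieve_scores' columns.
reranked_df = reranker.pure(previous_result, top_k=3, batch=16)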

View File

@@ -0,0 +1,145 @@
import logging
import os
import pathlib
from typing import List, Dict
import pandas as pd
from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import apply_recursive, to_list
logger = logging.getLogger("AutoRAG")
def run_passage_reranker_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among passage reranker node results.
:param modules: Passage reranker modules to run.
:param module_params: Passage reranker module parameters.
:param previous_result: Previous result dataframe.
		Could be a retrieval or reranker module result.
		It must contain 'query', 'retrieved_contents', 'retrieved_ids', and 'retrieve_scores' columns.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for passage reranker node.
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
		You can skip evaluation when you use only one module with one set of module parameters.
:return: The best result dataframe with previous result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError(
"You must at least one metrics for passage_reranker evaluation."
)
results = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
results,
)
)
# save results to folder
save_dir = os.path.join(node_line_dir, "passage_reranker") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
filepaths = list(
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
**{
f"passage_reranker_{metric}": list(
map(lambda result: result[metric].mean(), results)
)
for metric in strategies.get("metrics")
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
# change metric name columns to passage_reranker_metric_name
selected_result = selected_result.rename(
columns={
metric_name: f"passage_reranker_{metric_name}"
for metric_name in strategies["metrics"]
}
)
# drop retrieval result columns in previous_result
previous_result = previous_result.drop(
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column to summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result
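
For orientation, a hedged sketch of invoking this runner directly; the module classes, parameters, and directory layout below are illustrative, not taken from a real config:

from autorag.nodes.passagereranker import CohereReranker, TimeReranker

best_df = run_passage_reranker_node(
	modules=[CohereReranker, TimeReranker],
	module_params=[{"top_k": 3}, {"top_k": 3}],
	previous_result=retrieval_result_df,  # output DataFrame of the retrieval node
	node_line_dir="./my_project/0/retrieve_node_line",
	strategies={
		"metrics": ["retrieval_f1", "retrieval_recall", "retrieval_precision"],
		"speed_threshold": 10,
	},
)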

View File

@@ -0,0 +1,129 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
flatten_apply,
make_batch,
select_top_k,
sort_by_scores,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class SentenceTransformerReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
*args,
**kwargs,
):
"""
Initialize the Sentence Transformer reranker node.
:param project_dir: The project directory
		:param model_name: The name of the Sentence Transformer cross-encoder model to use for reranking.
			Default is "cross-encoder/ms-marco-MiniLM-L-2-v2"
:param kwargs: The CrossEncoder parameters
"""
super().__init__(project_dir, *args, **kwargs)
try:
import torch
from sentence_transformers import CrossEncoder
except ImportError:
raise ImportError(
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
model_params = pop_params(CrossEncoder.__init__, kwargs)
self.model = CrossEncoder(model_name, device=self.device, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
"""
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
:param previous_result: The previous result
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: pd DataFrame containing the reranked contents, ids, and scores
"""
queries, contents_list, scores_list, ids_list = self.cast_to_run(
previous_result
)
top_k = kwargs.get("top_k", 1)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents_list, ids_list, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
sentence_transformer_run_model,
nested_list,
model=self.model,
batch_size=batch,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def sentence_transformer_run_model(input_texts, model, batch_size: int):
try:
import torch
except ImportError:
raise ImportError(
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
)
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
with torch.no_grad():
pred_scores = model.predict(sentences=batch_texts, apply_softmax=True)
results.extend(pred_scores.tolist())
return results
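
A hedged usage sketch for the cross-encoder reranker above; the project directory and result DataFrame are illustrative:

reranker = SentenceTransformerReranker(
	project_dir="./my_project",
	model_name="cross-encoder/ms-marco-MiniLM-L-2-v2",
)
reranked_df = reranker.pure(previous_result, top_k=5, batch=64)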

View File

@@ -0,0 +1 @@
from .tart import Tart

View File

@@ -0,0 +1,152 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from autorag.utils.util import empty_cuda_cache
class EncT5ForSequenceClassification(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder\.embed_tokens\.weight",
]
def __init__(self, config: T5Config, dropout=0.1):
super().__init__(config)
try:
from torch import nn
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
self.num_labels = config.num_labels
self.config = config
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = T5Stack(encoder_config, self.shared)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Model parallel
self.model_parallel = False
self.device_map = None
def parallelize(self, device_map=None):
try:
import torch
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.encoder.block))
self.encoder.parallelize(self.device_map)
self.classifier = self.classifier.to(self.encoder.first_device)
self.model_parallel = True
def deparallelize(self):
self.encoder.deparallelize()
self.encoder = self.encoder.to("cpu")
self.model_parallel = False
self.device_map = None
empty_cuda_cache()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
def get_encoder(self):
return self.encoder
def _prune_heads(self, heads_to_prune):
"""
		Prunes heads of the model.
		heads_to_prune: dict of {layer_num: list of heads to prune in this layer}.
		See base class PreTrainedModel.
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
try:
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
pooled_output = hidden_states[:, 0, :] # Take bos token (equiv. to <s>)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (
labels.dtype == torch.long or labels.dtype == torch.int
):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
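
A minimal sketch of loading this encoder-only classifier, mirroring how the Tart module below instantiates it (the checkpoint name is the one used there):

model = EncT5ForSequenceClassification.from_pretrained("facebook/tart-full-flan-t5-xl")
model.eval()  # inference only; TART scores passages with the two-class logits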

View File

@@ -0,0 +1,139 @@
from itertools import chain
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.nodes.passagereranker.tart.modeling_enc_t5 import (
EncT5ForSequenceClassification,
)
from autorag.nodes.passagereranker.tart.tokenization_enc_t5 import EncT5Tokenizer
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class Tart(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
except ImportError:
raise ImportError(
"torch is not installed. Please install torch first to use TART reranker."
)
model_name = "facebook/tart-full-flan-t5-xl"
self.model = EncT5ForSequenceClassification.from_pretrained(model_name)
self.tokenizer = EncT5Tokenizer.from_pretrained(model_name)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = self.model.to(self.device)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
instruction = kwargs.pop("instruction", "Find passage to answer given question")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, instruction, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
instruction: str = "Find passage to answer given question",
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using Tart.
		TART is an instruction-aware passage reranker (https://github.com/facebookresearch/tart).
		You can rerank the passages with a custom instruction using this module.
The default model is facebook/tart-full-flan-t5-xl.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param instruction: The instruction for reranking.
			Note: The default instruction, "Find passage to answer given question", comes from the TART paper.
			If you want to use a different instruction, you can change it through this parameter
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
[["{} [SEP] {}".format(instruction, query)] for _ in contents]
for query, contents in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
tart_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
contents_list=contents_list,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def tart_run_model(
input_texts, contents_list, model, batch_size: int, tokenizer, device
):
try:
import torch
import torch.nn.functional as F
except ImportError:
raise ImportError(
"torch is not installed. Please install torch first to use TART reranker."
)
flattened_texts = list(chain.from_iterable(input_texts))
flattened_contents = list(chain.from_iterable(contents_list))
batch_input_texts = make_batch(flattened_texts, batch_size)
batch_contents_list = make_batch(flattened_contents, batch_size)
results = []
for batch_texts, batch_contents in zip(batch_input_texts, batch_contents_list):
feature = tokenizer(
batch_texts,
batch_contents,
padding=True,
truncation=True,
return_tensors="pt",
).to(device)
with torch.no_grad():
pred_scores = model(**feature).logits
normalized_scores = [
float(score[1]) for score in F.softmax(pred_scores, dim=1)
]
results.extend(normalized_scores)
return results
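
A hedged usage sketch for the Tart module; top_k is required here since it is popped from kwargs without a default, and the project path is illustrative:

reranker = Tart(project_dir="./my_project")
reranked_df = reranker.pure(
	previous_result,
	top_k=3,
	instruction="Find passage to answer given question",
)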

View File

@@ -0,0 +1,112 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import Any, Dict, List, Optional
from transformers import T5Tokenizer
class EncT5Tokenizer(T5Tokenizer):
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__(
vocab_file=vocab_file,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=sp_model_kwargs,
**kwargs,
)
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True,
)
# normal case: some special tokens
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
bos = [self.bos_token_id]
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(bos + token_ids_0 + eos) * [0]
return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
else:
return (
[self.bos_token_id]
+ token_ids_0
+ [self.eos_token_id]
+ token_ids_1
+ [self.eos_token_id]
)
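
A small sketch of the sequence-pair layout this tokenizer builds; the input token ids below are illustrative placeholders, not real vocabulary ids:

tokenizer = EncT5Tokenizer.from_pretrained("facebook/tart-full-flan-t5-xl")
ids = tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])
# -> [bos_id, 10, 11, eos_id, 20, 21, eos_id], i.e. "<s> A </s> B </s>"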

View File

@@ -0,0 +1,72 @@
import os
from datetime import datetime
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe, fetch_contents
class TimeReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir, *args, **kwargs)
self.corpus_df = pd.read_parquet(
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
)
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents, scores, ids = self.cast_to_run(previous_result)
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
times = [
[time["last_modified_datetime"] for time in time_list]
for time_list in metadatas
]
top_k = kwargs.pop("top_k")
return self._pure(contents, scores, ids, top_k, times)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
time_list: List[List[datetime]],
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank the passages based on merely the datetime of the passage.
		It uses the 'last_modified_datetime' key in the corpus metadata,
		so each row's metadata in the corpus data file must contain {'last_modified_datetime': datetime.datetime}.
:param contents_list: The list of lists of contents
:param scores_list: The list of lists of scores from the initial ranking
:param ids_list: The list of lists of ids
:param top_k: The number of passages to be retrieved after reranking
:param time_list: The metadata list of lists of datetime.datetime
It automatically extracts the 'last_modified_datetime' key from the metadata in the corpus data.
:return: The reranked contents, ids, and scores
"""
def sort_row(contents, scores, ids, time, top_k):
combined = list(zip(contents, scores, ids, time))
combined.sort(key=lambda x: x[3], reverse=True)
sorted_contents, sorted_scores, sorted_ids, _ = zip(*combined)
return (
list(sorted_contents)[:top_k],
list(sorted_scores)[:top_k],
list(sorted_ids)[:top_k],
)
reranked_contents, reranked_scores, reranked_ids = zip(
*map(
sort_row,
contents_list,
scores_list,
ids_list,
time_list,
[top_k] * len(contents_list),
)
)
return list(reranked_contents), list(reranked_ids), list(reranked_scores)
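
A hedged sketch of the corpus row shape TimeReranker expects; the column names follow the usual AutoRAG corpus layout and the values are illustrative:

import datetime

import pandas as pd

corpus_df = pd.DataFrame(
	{
		"doc_id": ["doc-1"],
		"contents": ["passage text"],
		"metadata": [{"last_modified_datetime": datetime.datetime(2024, 3, 18)}],
	}
)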

View File

@@ -0,0 +1,160 @@
import logging
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
from autorag.utils.util import select_top_k, sort_by_scores, empty_cuda_cache
logger = logging.getLogger("AutoRAG")
class Upr(BasePassageReranker):
def __init__(
self,
project_dir: str,
use_bf16: bool = False,
prefix_prompt: str = "Passage: ",
suffix_prompt: str = "Please write a question based on this passage.",
*args,
**kwargs,
):
"""
Initialize the UPR reranker node.
:param project_dir: The project directory
:param use_bf16: Whether to use bfloat16 for the model. Default is False.
:param prefix_prompt: The prefix prompt for the language model that generates question for reranking.
Default is "Passage: ".
The prefix prompt serves as the initial context or instruction for the language model.
It sets the stage for what is expected in the output
:param suffix_prompt: The suffix prompt for the language model that generates question for reranking.
Default is "Please write a question based on this passage.".
The suffix prompt provides a cue or a closing instruction to the language model,
signaling how to conclude the generated text or what format to follow at the end.
:param kwargs: Extra arguments
"""
super().__init__(project_dir, *args, **kwargs)
self.scorer = UPRScorer(
suffix_prompt=suffix_prompt, prefix_prompt=prefix_prompt, use_bf16=use_bf16
)
def __del__(self):
del self.scorer
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
return self._pure(queries, contents, ids, top_k)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using UPR.
		UPR is an unsupervised passage reranker (https://github.com/DevSinghSachan/unsupervised-passage-reranking).
		The language model scores how likely the given query is as a question about each passage, and the passages are reranked by that likelihood.
The default model is t5-large.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:return: tuple of lists containing the reranked contents, ids, and scores
"""
df = pd.DataFrame(
{
"query": queries,
"contents": contents_list,
"ids": ids_list,
}
)
df["scores"] = df.apply(
lambda row: self.scorer.compute(
query=row["query"], contents=row["contents"]
),
axis=1,
)
df[["contents", "ids", "scores"]] = df.apply(
lambda x: sort_by_scores(x, reverse=False), axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
class UPRScorer:
def __init__(self, suffix_prompt: str, prefix_prompt: str, use_bf16: bool = False):
try:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
except ImportError:
raise ImportError(
"torch is not installed. Please install torch to use UPRReranker."
)
model_name = "t5-large"
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
self.model = T5ForConditionalGeneration.from_pretrained(
model_name, torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
).to(self.device)
self.suffix_prompt = suffix_prompt
self.prefix_prompt = prefix_prompt
def compute(self, query: str, contents: List[str]) -> List[float]:
try:
import torch
except ImportError:
raise ImportError(
"torch is not installed. Please install torch to use UPRReranker."
)
query_token = self.tokenizer(
query, max_length=128, truncation=True, return_tensors="pt"
)
prompts = list(
map(
lambda content: f"{self.prefix_prompt} {content} {self.suffix_prompt}",
contents,
)
)
prompt_token_outputs = self.tokenizer(
prompts,
padding="longest",
max_length=512,
pad_to_multiple_of=8,
truncation=True,
return_tensors="pt",
)
query_input_ids = torch.repeat_interleave(
query_token["input_ids"], len(contents), dim=0
).to(self.device)
with torch.no_grad():
logits = self.model(
input_ids=prompt_token_outputs["input_ids"].to(self.device),
attention_mask=prompt_token_outputs["attention_mask"].to(self.device),
labels=query_input_ids,
).logits
log_softmax = torch.nn.functional.log_softmax(logits, dim=-1)
nll = -log_softmax.gather(2, query_input_ids.unsqueeze(2)).squeeze(2)
avg_nll = torch.sum(nll, dim=1)
return avg_nll.tolist()
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
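
A minimal sketch of using the scorer directly, outside the node wrapper; the query and passages are illustrative, and note that lower scores (average negative log-likelihood) mean higher relevance:

scorer = UPRScorer(
	prefix_prompt="Passage: ",
	suffix_prompt="Please write a question based on this passage.",
	use_bf16=False,
)
nll_scores = scorer.compute(
	query="What does UPR rerank?",
	contents=["UPR reranks passages with a language model.", "An unrelated passage."],
)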

View File

@@ -0,0 +1,109 @@
import os
from typing import List, Tuple
import pandas as pd
import voyageai
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import result_to_dataframe, get_event_loop, process_batch
class VoyageAIReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("VOYAGE_API_KEY", None) if api_key is None else api_key
if api_key is None:
raise KeyError(
"Please set the API key for VoyageAI rerank in the environment variable VOYAGE_API_KEY "
"or directly set it on the config YAML file."
)
self.voyage_client = voyageai.AsyncClient(api_key=api_key)
def __del__(self):
del self.voyage_client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "rerank-2")
truncation = kwargs.pop("truncation", True)
return self._pure(queries, contents, ids, top_k, model, batch, truncation)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "rerank-2",
batch: int = 8,
truncation: bool = True,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with VoyageAI rerank models.
You can get the API key from https://docs.voyageai.com/docs/api-key-and-installation and set it in the environment variable VOYAGE_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for VoyageAI rerank.
You can choose between "rerank-2" and "rerank-2-lite".
Default is "rerank-2".
:param batch: The number of queries to be processed in a batch
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
voyageai_rerank_pure(
self.voyage_client, model, query, contents, ids, top_k, truncation
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def voyageai_rerank_pure(
voyage_client: voyageai.AsyncClient,
model: str,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
truncation: bool = True,
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with VoyageAI rerank models.
:param voyage_client: The Voyage Client to use for reranking
:param model: The model name for VoyageAI rerank
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
rerank_results = await voyage_client.rerank(
model=model,
query=query,
documents=documents,
top_k=top_k,
truncation=truncation,
)
reranked_scores: List[float] = list(
map(lambda x: x.relevance_score, rerank_results.results)
)
indices = list(map(lambda x: x.index, rerank_results.results))
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores
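
A hedged usage sketch; it assumes VOYAGE_API_KEY is set in the environment and that previous_result comes from a retrieval node:

reranker = VoyageAIReranker(project_dir="./my_project")
reranked_df = reranker.pure(
	previous_result, top_k=3, model="rerank-2", batch=8, truncation=True
)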