Fix Dockerfile build issue
18  autorag/nodes/passagereranker/__init__.py  Normal file
@@ -0,0 +1,18 @@
from .cohere import CohereReranker
from .colbert import ColbertReranker
from .flag_embedding import FlagEmbeddingReranker
from .flag_embedding_llm import FlagEmbeddingLLMReranker
from .jina import JinaReranker
from .koreranker import KoReranker
from .monot5 import MonoT5
from .pass_reranker import PassReranker
from .rankgpt import RankGPT
from .sentence_transformer import SentenceTransformerReranker
from .time_reranker import TimeReranker
from .upr import Upr
from .openvino import OpenVINOReranker
from .voyageai import VoyageAIReranker
from .mixedbreadai import MixedbreadAIReranker
from .flashrank import FlashRankReranker

from .dragonkue2 import DragonKue2  # 250313 added - Kim Yong-yeon
55  autorag/nodes/passagereranker/base.py  Normal file
@@ -0,0 +1,55 @@
import abc
import logging
from pathlib import Path
from typing import Union

import pandas as pd

from autorag.schema import BaseModule
from autorag.utils import validate_qa_dataset

logger = logging.getLogger("AutoRAG")


class BasePassageReranker(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
        logger.info(
            f"Initialize passage reranker node - {self.__class__.__name__} module..."
        )

    def __del__(self):
        logger.info(
            f"Deleting passage reranker node - {self.__class__.__name__} module..."
        )

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(
            f"Running passage reranker node - {self.__class__.__name__} module..."
        )
        validate_qa_dataset(previous_result)

        # find queries columns
        assert (
            "query" in previous_result.columns
        ), "previous_result must have query column."
        queries = previous_result["query"].tolist()

        # find contents_list columns
        assert (
            "retrieved_contents" in previous_result.columns
        ), "previous_result must have retrieved_contents column."
        contents = previous_result["retrieved_contents"].tolist()

        # find scores columns
        assert (
            "retrieve_scores" in previous_result.columns
        ), "previous_result must have retrieve_scores column."
        scores = previous_result["retrieve_scores"].tolist()

        # find ids columns
        assert (
            "retrieved_ids" in previous_result.columns
        ), "previous_result must have retrieved_ids column."
        ids = previous_result["retrieved_ids"].tolist()

        return queries, contents, scores, ids
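To make the contract above concrete, here is a minimal sketch (not part of this commit) of a custom reranker built on BasePassageReranker. The IdentityReranker name and its trivial scoring are hypothetical; the sketch assumes the result_to_dataframe decorator used by the other modules in this diff.

import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import result_to_dataframe


class IdentityReranker(BasePassageReranker):
    """Hypothetical example: keep the retrieval order, but truncate to top_k."""

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        # cast_to_run validates the columns and unpacks them into parallel lists
        queries, contents, scores, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        return (
            [c[:top_k] for c in contents],
            [i[:top_k] for i in ids],
            [s[:top_k] for s in scores],
        )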
119  autorag/nodes/passagereranker/cohere.py  Normal file
@@ -0,0 +1,119 @@
import os
from typing import List, Tuple

import cohere
import pandas as pd
from cohere import RerankResponseResultsItem

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe


class CohereReranker(BasePassageReranker):
    def __init__(self, project_dir: str, *args, **kwargs):
        """
        Initialize Cohere rerank node.

        :param project_dir: The project directory path.
        :param api_key: The API key for Cohere rerank.
            You can set it in the environment variable COHERE_API_KEY.
            Or, you can directly set it on the config YAML file using this parameter.
            Default is env variable "COHERE_API_KEY".
        :param kwargs: Extra arguments that are ignored
        """
        super().__init__(project_dir)
        api_key = kwargs.pop("api_key", None)
        api_key = os.getenv("COHERE_API_KEY", None) if api_key is None else api_key
        if api_key is None:
            api_key = os.getenv("CO_API_KEY", None)
            if api_key is None:
                raise KeyError(
                    "Please set the API key for Cohere rerank in the environment variable COHERE_API_KEY "
                    "or directly set it on the config YAML file."
                )

        self.cohere_client = cohere.AsyncClientV2(api_key=api_key)

    def __del__(self):
        del self.cohere_client
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, scores, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        model = kwargs.pop("model", "rerank-v3.5")
        return self._pure(queries, contents, scores, ids, top_k, batch, model)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        scores_list: List[List[float]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
        model: str = "rerank-v3.5",
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents with Cohere rerank models.
        You can get the API key from https://cohere.com/rerank and set it in the environment variable COHERE_API_KEY.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param scores_list: The list of lists of scores retrieved from the initial ranking
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
        :param model: The model name for Cohere rerank.
            You can choose between "rerank-v3.5", "rerank-english-v3.0", and "rerank-multilingual-v3.0".
            Default is "rerank-v3.5".
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        # Run the async cohere_rerank_pure function for every query
        tasks = [
            cohere_rerank_pure(self.cohere_client, model, query, document, ids, top_k)
            for query, document, ids in zip(queries, contents_list, ids_list)
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
        content_result = list(map(lambda x: x[0], results))
        id_result = list(map(lambda x: x[1], results))
        score_result = list(map(lambda x: x[2], results))

        return content_result, id_result, score_result


async def cohere_rerank_pure(
    cohere_client: cohere.AsyncClient,
    model: str,
    query: str,
    documents: List[str],
    ids: List[str],
    top_k: int,
) -> Tuple[List[str], List[str], List[float]]:
    """
    Rerank a list of contents with Cohere rerank models.

    :param cohere_client: The Cohere AsyncClient to use for reranking
    :param model: The model name for Cohere rerank
    :param query: The query to use for reranking
    :param documents: The list of contents to rerank
    :param ids: The list of ids corresponding to the documents
    :param top_k: The number of passages to be retrieved
    :return: Tuple of lists containing the reranked contents, ids, and scores
    """
    rerank_results = await cohere_client.rerank(
        model=model,
        query=query,
        documents=documents,
        top_n=top_k,
        return_documents=False,
    )
    results: List[RerankResponseResultsItem] = rerank_results.results
    reranked_scores: List[float] = list(map(lambda x: x.relevance_score, results))
    indices = list(map(lambda x: x.index, results))
    reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
    reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
    return reranked_contents, reranked_ids, reranked_scores
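As a rough illustration of what cohere_rerank_pure does outside the node pipeline, the snippet below (not part of this commit) reranks three passages for one query. It assumes a valid COHERE_API_KEY and the cohere SDK used above; the documents and ids are made up.

import asyncio
import os

import cohere


async def main():
    client = cohere.AsyncClientV2(api_key=os.environ["COHERE_API_KEY"])
    docs = ["Paris is the capital of France.", "Cats sleep a lot.", "France borders Spain."]
    ids = ["d1", "d2", "d3"]
    contents, doc_ids, scores = await cohere_rerank_pure(
        client, "rerank-v3.5", "capital of France", docs, ids, top_k=2
    )
    # Results come back ordered by relevance_score, highest first
    print(list(zip(doc_ids, scores)))


asyncio.run(main())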
213  autorag/nodes/passagereranker/colbert.py  Normal file
@@ -0,0 +1,213 @@
from typing import List, Tuple

import numpy as np
import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
    flatten_apply,
    sort_by_scores,
    select_top_k,
    pop_params,
    result_to_dataframe,
    empty_cuda_cache,
)


class ColbertReranker(BasePassageReranker):
    def __init__(
        self,
        project_dir: str,
        model_name: str = "colbert-ir/colbertv2.0",
        *args,
        **kwargs,
    ):
        """
        Initialize a ColBERT rerank model for reranking.

        :param project_dir: The project directory
        :param model_name: The model name for ColBERT rerank.
            You can choose a ColBERT model for reranking.
            The default is "colbert-ir/colbertv2.0".
        :param kwargs: Extra parameters for the model.
        """
        super().__init__(project_dir)
        try:
            import torch
            from transformers import AutoModel, AutoTokenizer
        except ImportError:
            raise ImportError(
                "Pytorch is not installed. Please install pytorch to use Colbert reranker."
            )
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        model_params = pop_params(AutoModel.from_pretrained, kwargs)
        self.model = AutoModel.from_pretrained(model_name, **model_params).to(
            self.device
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def __del__(self):
        del self.model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents with ColBERT rerank models.
        You can get more information about the ColBERT model at https://huggingface.co/colbert-ir/colbertv2.0.
        It uses a BERT-based model, so we recommend using a CUDA GPU for faster reranking.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
            Default is 64.

        :return: Tuple of lists containing the reranked contents, ids, and scores
        """

        # get query and content embeddings
        query_embedding_list = get_colbert_embedding_batch(
            queries, self.model, self.tokenizer, batch
        )
        content_embedding_list = flatten_apply(
            get_colbert_embedding_batch,
            contents_list,
            model=self.model,
            tokenizer=self.tokenizer,
            batch_size=batch,
        )
        df = pd.DataFrame(
            {
                "ids": ids_list,
                "query_embedding": query_embedding_list,
                "contents": contents_list,
                "content_embedding": content_embedding_list,
            }
        )
        temp_df = df.explode("content_embedding")
        temp_df["score"] = temp_df.apply(
            lambda x: get_colbert_score(x["query_embedding"], x["content_embedding"]),
            axis=1,
        )
        df["scores"] = (
            temp_df.groupby(level=0, sort=False)["score"].apply(list).tolist()
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )


def get_colbert_embedding_batch(
    input_strings: List[str], model, tokenizer, batch_size: int
) -> List[np.array]:
    try:
        import torch
    except ImportError:
        raise ImportError(
            "Pytorch is not installed. Please install pytorch to use Colbert reranker."
        )
    encoding = tokenizer(
        input_strings,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    input_batches = slice_tokenizer_result(encoding, batch_size)
    result_embedding = []
    with torch.no_grad():
        for encoding_batch in input_batches:
            result_embedding.append(model(**encoding_batch).last_hidden_state)
    total_tensor = torch.cat(
        result_embedding, dim=0
    )  # shape [batch_size, token_length, embedding_dim]
    tensor_results = list(total_tensor.chunk(total_tensor.size()[0]))

    if torch.cuda.is_available():
        return list(map(lambda x: x.detach().cpu().numpy(), tensor_results))
    else:
        return list(map(lambda x: x.detach().numpy(), tensor_results))


def slice_tokenizer_result(tokenizer_output, batch_size):
    input_ids_batches = slice_tensor(tokenizer_output["input_ids"], batch_size)
    attention_mask_batches = slice_tensor(
        tokenizer_output["attention_mask"], batch_size
    )
    token_type_ids_batches = slice_tensor(
        tokenizer_output.get("token_type_ids", None), batch_size
    )
    return [
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }
        for input_ids, attention_mask, token_type_ids in zip(
            input_ids_batches, attention_mask_batches, token_type_ids_batches
        )
    ]


def slice_tensor(input_tensor, batch_size):
    try:
        import torch
    except ImportError:
        raise ImportError(
            "Pytorch is not installed. Please install pytorch to use Colbert reranker."
        )
    # Calculate the number of full batches
    num_full_batches = input_tensor.size(0) // batch_size

    # Slice the tensor into batches
    tensor_list = [
        input_tensor[i * batch_size : (i + 1) * batch_size]
        for i in range(num_full_batches)
    ]

    # Handle the last batch if it's smaller than batch_size
    remainder = input_tensor.size(0) % batch_size
    if remainder:
        tensor_list.append(input_tensor[-remainder:])

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tensor_list = list(map(lambda x: x.to(device), tensor_list))

    return tensor_list


def get_colbert_score(query_embedding: np.array, content_embedding: np.array) -> float:
    if query_embedding.ndim == 3 and content_embedding.ndim == 3:
        query_embedding = query_embedding.reshape(-1, query_embedding.shape[-1])
        content_embedding = content_embedding.reshape(-1, content_embedding.shape[-1])

    sim_matrix = np.dot(query_embedding, content_embedding.T) / (
        np.linalg.norm(query_embedding, axis=1)[:, np.newaxis]
        * np.linalg.norm(content_embedding, axis=1)
    )
    max_sim_scores = np.max(sim_matrix, axis=1)
    return float(np.mean(max_sim_scores))
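For intuition, get_colbert_score is a cosine-similarity MaxSim: for every query token it takes the best-matching content token and averages those maxima. A tiny self-contained check with made-up 2-D token embeddings (illustrative only):

import numpy as np

# Two query-token embeddings and three content-token embeddings (already 2-D)
query_emb = np.array([[1.0, 0.0], [0.0, 1.0]])
content_emb = np.array([[1.0, 0.0], [0.7, 0.7], [0.0, -1.0]])

score = get_colbert_score(query_emb, content_emb)
# Query token 1 matches content token 1 exactly (cos = 1.0); query token 2's best
# match is [0.7, 0.7] (cos ~ 0.707), so the MaxSim average is ~ 0.854.
print(round(score, 3))  # 0.854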
138  autorag/nodes/passagereranker/dragonkue2.py  Normal file
@@ -0,0 +1,138 @@
# 250313 reranker module_type added - Kim Yong-yeon

from typing import List, Tuple

import numpy as np
import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
    make_batch,
    sort_by_scores,
    flatten_apply,
    select_top_k,
    result_to_dataframe,
    empty_cuda_cache,
)


class DragonKue2(BasePassageReranker):
    def __init__(self, project_dir: str, *args, **kwargs):
        super().__init__(project_dir)
        try:
            import torch
            from transformers import AutoModelForSequenceClassification, AutoTokenizer
        except ImportError:
            raise ImportError("For using DragonKue2, please install torch first.")

        model_path = "dragonkue/bge-reranker-v2-m3-ko"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.eval()
        # Determine the device to run the model on (GPU if available, otherwise CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def __del__(self):
        del self.model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents based on their relevance to a query using dragonkue/bge-reranker-v2-m3-ko.
        bge-reranker-v2-m3-ko is a Korean-language reranker (https://huggingface.co/dragonkue/bge-reranker-v2-m3-ko).

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
            Default is 64.
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        nested_list = [
            list(map(lambda x: [query, x], content_list))
            for query, content_list in zip(queries, contents_list)
        ]
        scores_nps = flatten_apply(
            dragonku2_run_model,
            nested_list,
            model=self.model,
            batch_size=batch,
            tokenizer=self.tokenizer,
            device=self.device,
        )

        rerank_scores = list(
            map(
                lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
            )
        )

        df = pd.DataFrame(
            {
                "contents": contents_list,
                "ids": ids_list,
                "scores": rerank_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )


def dragonku2_run_model(input_texts, model, tokenizer, device, batch_size: int):  # 250313 added - Kim Yong-yeon
    try:
        import torch
    except ImportError:
        raise ImportError("For using DragonKue2, please install torch first.")
    batch_input_texts = make_batch(input_texts, batch_size)
    results = []
    for batch_texts in batch_input_texts:
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        inputs = inputs.to(device)
        with torch.no_grad():
            scores = (
                model(**inputs, return_dict=True)
                .logits.view(
                    -1,
                )
                .float()
            )
        scores_np = scores.cpu().numpy()
        results.extend(scores_np)
    return results


def exp_normalize(x):
    b = x.max()
    y = np.exp(x - b)
    return y / y.sum()
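exp_normalize is a numerically stable softmax over one query's raw logits, so the per-passage rerank scores sum to 1. A quick illustrative check with made-up logits:

import numpy as np

logits = np.array([2.0, 0.5, -1.0])
probs = exp_normalize(logits)
print(probs.round(3))  # approximately [0.786, 0.175, 0.039]
print(probs.sum())     # 1.0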
112  autorag/nodes/passagereranker/flag_embedding.py  Normal file
@@ -0,0 +1,112 @@
from typing import List, Tuple, Iterable

import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
    make_batch,
    sort_by_scores,
    flatten_apply,
    select_top_k,
    pop_params,
    result_to_dataframe,
    empty_cuda_cache,
)


class FlagEmbeddingReranker(BasePassageReranker):
    def __init__(
        self, project_dir, model_name: str = "BAAI/bge-reranker-large", *args, **kwargs
    ):
        """
        Initialize the FlagEmbeddingReranker module.

        :param project_dir: The project directory.
        :param model_name: The name of the BAAI reranker normal model.
            Default is "BAAI/bge-reranker-large".
        :param kwargs: Extra parameters for FlagEmbedding.FlagReranker
        """
        super().__init__(project_dir)
        try:
            from FlagEmbedding import FlagReranker
        except ImportError:
            raise ImportError(
                "FlagEmbeddingReranker requires the 'FlagEmbedding' package to be installed."
            )
        model_params = pop_params(FlagReranker.__init__, kwargs)
        model_params.pop("model_name_or_path", None)
        self.model = FlagReranker(model_name_or_path=model_name, **model_params)

    def __del__(self):
        del self.model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents based on their relevance to a query using a BAAI normal reranker model.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
            Default is 64.
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        nested_list = [
            list(map(lambda x: [query, x], content_list))
            for query, content_list in zip(queries, contents_list)
        ]
        rerank_scores = flatten_apply(
            flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
        )

        df = pd.DataFrame(
            {
                "contents": contents_list,
                "ids": ids_list,
                "scores": rerank_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )


def flag_embedding_run_model(input_texts, model, batch_size: int):
    try:
        import torch
    except ImportError:
        raise ImportError("FlagEmbeddingReranker requires PyTorch to be installed.")
    batch_input_texts = make_batch(input_texts, batch_size)
    results = []
    for batch_texts in batch_input_texts:
        with torch.no_grad():
            pred_scores = model.compute_score(sentence_pairs=batch_texts)
        if batch_size == 1 or not isinstance(pred_scores, Iterable):
            results.append(pred_scores)
        else:
            results.extend(pred_scores)
    return results
101  autorag/nodes/passagereranker/flag_embedding_llm.py  Normal file
@@ -0,0 +1,101 @@
from typing import List, Tuple

import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model
from autorag.utils.util import (
    flatten_apply,
    sort_by_scores,
    select_top_k,
    pop_params,
    result_to_dataframe,
    empty_cuda_cache,
)


class FlagEmbeddingLLMReranker(BasePassageReranker):
    def __init__(
        self,
        project_dir,
        model_name: str = "BAAI/bge-reranker-v2-gemma",
        *args,
        **kwargs,
    ):
        """
        Initialize the FlagEmbeddingLLMReranker module.

        :param project_dir: The project directory.
        :param model_name: The name of the BAAI LLM-based reranker model.
            Default is "BAAI/bge-reranker-v2-gemma".
        :param kwargs: Extra parameters for FlagEmbedding.FlagLLMReranker
        """
        super().__init__(project_dir)
        try:
            from FlagEmbedding import FlagLLMReranker
        except ImportError:
            raise ImportError(
                "FlagEmbeddingLLMReranker requires the 'FlagEmbedding' package to be installed."
            )
        model_params = pop_params(FlagLLMReranker.__init__, kwargs)
        model_params.pop("model_name_or_path", None)
        self.model = FlagLLMReranker(model_name_or_path=model_name, **model_params)

    def __del__(self):
        del self.model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents based on their relevance to a query using a BAAI LLM-based reranker model.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
            Default is 64.

        :return: Tuple of lists containing the reranked contents, ids, and scores
        """

        nested_list = [
            list(map(lambda x: [query, x], content_list))
            for query, content_list in zip(queries, contents_list)
        ]
        rerank_scores = flatten_apply(
            flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
        )

        df = pd.DataFrame(
            {
                "contents": contents_list,
                "ids": ids_list,
                "scores": rerank_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )
245  autorag/nodes/passagereranker/flashrank.py  Normal file
@@ -0,0 +1,245 @@
import json
from pathlib import Path

import pandas as pd
import numpy as np
import os
import zipfile
import requests
from tqdm import tqdm
import collections
from typing import List, Dict, Tuple

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
from autorag.utils.util import (
    flatten_apply,
    sort_by_scores,
    select_top_k,
    make_batch,
    empty_cuda_cache,
)

model_url = "https://huggingface.co/prithivida/flashrank/resolve/main/{}.zip"

model_file_map = {
    "ms-marco-TinyBERT-L-2-v2": "flashrank-TinyBERT-L-2-v2.onnx",
    "ms-marco-MiniLM-L-12-v2": "flashrank-MiniLM-L-12-v2_Q.onnx",
    "ms-marco-MultiBERT-L-12": "flashrank-MultiBERT-L12_Q.onnx",
    "rank-T5-flan": "flashrank-rankt5_Q.onnx",
    "ce-esci-MiniLM-L12-v2": "flashrank-ce-esci-MiniLM-L12-v2_Q.onnx",
    "miniReranker_arabic_v1": "miniReranker_arabic_v1.onnx",
}


class FlashRankReranker(BasePassageReranker):
    def __init__(
        self, project_dir: str, model: str = "ms-marco-TinyBERT-L-2-v2", *args, **kwargs
    ):
        """
        Initialize FlashRank rerank node.

        :param project_dir: The project directory path.
        :param model: The model name for FlashRank rerank.
            You can get the list of available models from https://github.com/PrithivirajDamodaran/FlashRank.
            Default is "ms-marco-TinyBERT-L-2-v2".
            "rank_zephyr_7b_v1_full" is not supported due to a parallel inference issue.
        :param kwargs: Extra arguments that are ignored
        """
        super().__init__(project_dir)
        try:
            from tokenizers import Tokenizer
        except ImportError:
            raise ImportError(
                "The 'tokenizers' package is not installed. Please install tokenizers to use the FlashRank reranker."
            )

        cache_dir = kwargs.pop("cache_dir", "/tmp")
        max_length = kwargs.pop("max_length", 512)

        self.cache_dir: Path = Path(cache_dir)
        self.model_dir: Path = self.cache_dir / model
        self._prepare_model_dir(model)
        model_file = model_file_map[model]

        try:
            import onnxruntime as ort
        except ImportError:
            raise ImportError(
                "onnxruntime is not installed. Please install onnxruntime to use FlashRank reranker."
            )

        self.session = ort.InferenceSession(str(self.model_dir / model_file))
        self.tokenizer: Tokenizer = self._get_tokenizer(max_length)

    def __del__(self):
        del self.session
        del self.tokenizer
        empty_cuda_cache()
        super().__del__()

    def _prepare_model_dir(self, model_name: str):
        if not self.cache_dir.exists():
            self.cache_dir.mkdir(parents=True, exist_ok=True)

        if not self.model_dir.exists():
            self._download_model_files(model_name)

    def _download_model_files(self, model_name: str):
        local_zip_file = self.cache_dir / f"{model_name}.zip"
        formatted_model_url = model_url.format(model_name)

        with requests.get(formatted_model_url, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get("content-length", 0))
            with (
                open(local_zip_file, "wb") as f,
                tqdm(
                    desc=local_zip_file.name,
                    total=total_size,
                    unit="iB",
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar,
            ):
                for chunk in r.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    bar.update(size)

        with zipfile.ZipFile(local_zip_file, "r") as zip_ref:
            zip_ref.extractall(self.cache_dir)
        os.remove(local_zip_file)

    def _get_tokenizer(self, max_length: int = 512):
        try:
            from tokenizers import AddedToken, Tokenizer
        except ImportError:
            raise ImportError(
                "The 'tokenizers' package is not installed. Please install tokenizers to use the FlashRank reranker."
            )
        config = json.load(open(str(self.model_dir / "config.json")))
        tokenizer_config = json.load(
            open(str(self.model_dir / "tokenizer_config.json"))
        )
        tokens_map = json.load(open(str(self.model_dir / "special_tokens_map.json")))
        tokenizer = Tokenizer.from_file(str(self.model_dir / "tokenizer.json"))

        tokenizer.enable_truncation(
            max_length=min(tokenizer_config["model_max_length"], max_length)
        )
        tokenizer.enable_padding(
            pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"]
        )

        for token in tokens_map.values():
            if isinstance(token, str):
                tokenizer.add_special_tokens([token])
            elif isinstance(token, dict):
                tokenizer.add_special_tokens([AddedToken(**token)])

        vocab_file = self.model_dir / "vocab.txt"
        if vocab_file.exists():
            tokenizer.vocab = self._load_vocab(vocab_file)
            tokenizer.ids_to_tokens = collections.OrderedDict(
                [(ids, tok) for tok, ids in tokenizer.vocab.items()]
            )
        return tokenizer

    def _load_vocab(self, vocab_file: Path) -> Dict[str, int]:
        vocab = collections.OrderedDict()
        with open(vocab_file, "r", encoding="utf-8") as reader:
            tokens = reader.readlines()
        for index, token in enumerate(tokens):
            token = token.rstrip("\n")
            vocab[token] = index
        return vocab

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents with FlashRank rerank models.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        nested_list = [
            list(map(lambda x: [query, x], content_list))
            for query, content_list in zip(queries, contents_list)
        ]

        rerank_scores = flatten_apply(
            flashrank_run_model,
            nested_list,
            session=self.session,
            batch_size=batch,
            tokenizer=self.tokenizer,
        )

        df = pd.DataFrame(
            {
                "contents": contents_list,
                "ids": ids_list,
                "scores": rerank_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )


def flashrank_run_model(input_texts, tokenizer, session, batch_size: int):
    batch_input_texts = make_batch(input_texts, batch_size)
    results = []

    for batch_texts in tqdm(batch_input_texts):
        input_text = tokenizer.encode_batch(batch_texts)
        input_ids = np.array([e.ids for e in input_text])
        token_type_ids = np.array([e.type_ids for e in input_text])
        attention_mask = np.array([e.attention_mask for e in input_text])

        use_token_type_ids = token_type_ids is not None and not np.all(
            token_type_ids == 0
        )

        onnx_input = {
            "input_ids": input_ids.astype(np.int64),
            "attention_mask": attention_mask.astype(np.int64),
        }
        if use_token_type_ids:
            onnx_input["token_type_ids"] = token_type_ids.astype(np.int64)

        outputs = session.run(None, onnx_input)

        logits = outputs[0]

        if logits.shape[1] == 1:
            scores = 1 / (1 + np.exp(-logits.flatten()))
        else:
            exp_logits = np.exp(logits)
            scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)
        results.extend(scores)
    return results
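The final branch in flashrank_run_model handles the two ONNX head layouts: a single-logit head gets a sigmoid, a two-class head gets a softmax with column 1 taken as the relevance probability. A small numpy illustration of both branches with made-up logits:

import numpy as np

# Single-logit head (shape [batch, 1]): sigmoid turns each logit into a 0-1 score
single = np.array([[2.0], [-1.0]])
print(1 / (1 + np.exp(-single.flatten())))  # approximately [0.881, 0.269]

# Two-class head (shape [batch, 2]): softmax, keeping P(relevant) = column 1
two_class = np.array([[0.5, 2.0], [3.0, -1.0]])
exp_logits = np.exp(two_class)
print(exp_logits[:, 1] / np.sum(exp_logits, axis=1))  # approximately [0.818, 0.018]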
115  autorag/nodes/passagereranker/jina.py  Normal file
@@ -0,0 +1,115 @@
import os
from typing import List, Tuple

import aiohttp
import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe

JINA_API_URL = "https://api.jina.ai/v1/rerank"


class JinaReranker(BasePassageReranker):
    def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
        """
        Initialize Jina rerank node.

        :param project_dir: The project directory path.
        :param api_key: The API key for Jina rerank.
            You can set it in the environment variable JINAAI_API_KEY.
            Or, you can directly set it on the config YAML file using this parameter.
            Default is env variable "JINAAI_API_KEY".
        :param kwargs: Extra arguments that are ignored
        """
        super().__init__(project_dir)
        if api_key is None:
            api_key = os.getenv("JINAAI_API_KEY", None)
            if api_key is None:
                raise ValueError(
                    "API key is not provided. "
                    "You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
                )
        self.session = aiohttp.ClientSession(loop=get_event_loop())
        self.session.headers.update(
            {"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
        )

    def __del__(self):
        self.session.close()
        del self.session
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 8)
        model = kwargs.pop("model", "jina-reranker-v1-base-en")
        return self._pure(queries, contents, ids, top_k, model, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        model: str = "jina-reranker-v1-base-en",
        batch: int = 8,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents with Jina rerank models.
        You can get the API key from https://jina.ai/reranker and set it in the environment variable JINAAI_API_KEY.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param model: The model name for Jina rerank.
            You can choose between "jina-reranker-v1-base-en" and "jina-colbert-v1-en".
            Default is "jina-reranker-v1-base-en".
        :param batch: The number of queries to be processed in a batch
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        tasks = [
            jina_reranker_pure(
                self.session, query, contents, ids, top_k=top_k, model=model
            )
            for query, contents, ids in zip(queries, contents_list, ids_list)
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(process_batch(tasks, batch))

        content_result, id_result, score_result = zip(*results)

        return list(content_result), list(id_result), list(score_result)


async def jina_reranker_pure(
    session,
    query: str,
    contents: List[str],
    ids: List[str],
    top_k: int,
    model: str = "jina-reranker-v1-base-en",
) -> Tuple[List[str], List[str], List[float]]:
    async with session.post(
        JINA_API_URL,
        json={
            "query": query,
            "documents": contents,
            "model": model,
            "top_n": top_k,
        },
    ) as resp:
        resp_json = await resp.json()
        if "results" not in resp_json:
            raise RuntimeError(f"Invalid response from Jina API: {resp_json['detail']}")

        results = resp_json["results"]
        indices = list(map(lambda x: x["index"], results))
        score_result = list(map(lambda x: x["relevance_score"], results))
        id_result = list(map(lambda x: ids[x], indices))
        content_result = list(map(lambda x: contents[x], indices))

        return content_result, id_result, score_result
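The parsing logic in jina_reranker_pure assumes a response body shaped roughly like the dict below (illustrative values only): each entry's "index" points back into the submitted documents list, which is why the ids and contents are re-derived from it rather than returned by the API.

# Illustrative response payload that jina_reranker_pure expects to unpack
resp_json = {
    "results": [
        {"index": 2, "relevance_score": 0.91},
        {"index": 0, "relevance_score": 0.35},
    ]
}
indices = [r["index"] for r in resp_json["results"]]            # [2, 0]
scores = [r["relevance_score"] for r in resp_json["results"]]   # [0.91, 0.35]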
136  autorag/nodes/passagereranker/koreranker.py  Normal file
@@ -0,0 +1,136 @@
from typing import List, Tuple

import numpy as np
import pandas as pd

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
    make_batch,
    sort_by_scores,
    flatten_apply,
    select_top_k,
    result_to_dataframe,
    empty_cuda_cache,
)


class KoReranker(BasePassageReranker):
    def __init__(self, project_dir: str, *args, **kwargs):
        super().__init__(project_dir)
        try:
            import torch
            from transformers import AutoModelForSequenceClassification, AutoTokenizer
        except ImportError:
            raise ImportError("For using KoReranker, please install torch first.")

        model_path = "Dongjin-kr/ko-reranker"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.eval()
        # Determine the device to run the model on (GPU if available, otherwise CPU)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def __del__(self):
        del self.model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, _, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 64)
        return self._pure(queries, contents, ids, top_k, batch)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        batch: int = 64,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents based on their relevance to a query using ko-reranker.
        ko-reranker is a Korean-language reranker (https://huggingface.co/Dongjin-kr/ko-reranker).

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param batch: The number of queries to be processed in a batch
            Default is 64.
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        nested_list = [
            list(map(lambda x: [query, x], content_list))
            for query, content_list in zip(queries, contents_list)
        ]
        scores_nps = flatten_apply(
            koreranker_run_model,
            nested_list,
            model=self.model,
            batch_size=batch,
            tokenizer=self.tokenizer,
            device=self.device,
        )

        rerank_scores = list(
            map(
                lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
            )
        )

        df = pd.DataFrame(
            {
                "contents": contents_list,
                "ids": ids_list,
                "scores": rerank_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            sort_by_scores, axis=1, result_type="expand"
        )
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )


def koreranker_run_model(input_texts, model, tokenizer, device, batch_size: int):
    try:
        import torch
    except ImportError:
        raise ImportError("For using KoReranker, please install torch first.")
    batch_input_texts = make_batch(input_texts, batch_size)
    results = []
    for batch_texts in batch_input_texts:
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        inputs = inputs.to(device)
        with torch.no_grad():
            scores = (
                model(**inputs, return_dict=True)
                .logits.view(
                    -1,
                )
                .float()
            )
        scores_np = scores.cpu().numpy()
        results.extend(scores_np)
    return results


def exp_normalize(x):
    b = x.max()
    y = np.exp(x - b)
    return y / y.sum()
126  autorag/nodes/passagereranker/mixedbreadai.py  Normal file
@@ -0,0 +1,126 @@
import os
from typing import List, Tuple

import pandas as pd
from mixedbread_ai.client import AsyncMixedbreadAI

from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
    result_to_dataframe,
    get_event_loop,
    process_batch,
    pop_params,
)


class MixedbreadAIReranker(BasePassageReranker):
    def __init__(
        self,
        project_dir: str,
        *args,
        **kwargs,
    ):
        """
        Initialize mixedbread-ai rerank node.

        :param project_dir: The project directory path.
        :param api_key: The API key for MixedbreadAI rerank.
            You can set it in the environment variable MXBAI_API_KEY.
            Or, you can directly set it on the config YAML file using this parameter.
            Default is env variable "MXBAI_API_KEY".
        :param kwargs: Extra arguments that are ignored
        """
        super().__init__(project_dir)
        api_key = kwargs.pop("api_key", None)
        api_key = os.getenv("MXBAI_API_KEY", None) if api_key is None else api_key
        if api_key is None:
            raise KeyError(
                "Please set the API key for Mixedbread AI rerank in the environment variable MXBAI_API_KEY "
                "or directly set it on the config YAML file."
            )
        self.client = AsyncMixedbreadAI(api_key=api_key)

    def __del__(self):
        del self.client
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, contents, scores, ids = self.cast_to_run(previous_result)
        top_k = kwargs.pop("top_k")
        batch = kwargs.pop("batch", 8)
        model = kwargs.pop("model", "mixedbread-ai/mxbai-rerank-large-v1")
        rerank_params = pop_params(self.client.reranking, kwargs)
        return self._pure(queries, contents, ids, top_k, model, batch, **rerank_params)

    def _pure(
        self,
        queries: List[str],
        contents_list: List[List[str]],
        ids_list: List[List[str]],
        top_k: int,
        model: str = "mixedbread-ai/mxbai-rerank-large-v1",
        batch: int = 8,
    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
        """
        Rerank a list of contents with mixedbread-ai rerank models.
        You can get the API key from https://www.mixedbread.ai/api-reference#quick-start-guide and set it in the environment variable MXBAI_API_KEY.

        :param queries: The list of queries to use for reranking
        :param contents_list: The list of lists of contents to rerank
        :param ids_list: The list of lists of ids retrieved from the initial ranking
        :param top_k: The number of passages to be retrieved
        :param model: The model name for mixedbread-ai rerank.
            You can choose between "mixedbread-ai/mxbai-rerank-large-v1", "mixedbread-ai/mxbai-rerank-base-v1" and "mixedbread-ai/mxbai-rerank-xsmall-v1".
            Default is "mixedbread-ai/mxbai-rerank-large-v1".
        :param batch: The number of queries to be processed in a batch
        :return: Tuple of lists containing the reranked contents, ids, and scores
        """
        tasks = [
            mixedbreadai_rerank_pure(
                self.client, query, contents, ids, top_k=top_k, model=model
            )
            for query, contents, ids in zip(queries, contents_list, ids_list)
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(process_batch(tasks, batch))

        content_result, id_result, score_result = zip(*results)

        return list(content_result), list(id_result), list(score_result)


async def mixedbreadai_rerank_pure(
    client: AsyncMixedbreadAI,
    query: str,
    documents: List[str],
    ids: List[str],
    top_k: int,
    model: str = "mixedbread-ai/mxbai-rerank-large-v1",
) -> Tuple[List[str], List[str], List[float]]:
    """
    Rerank a list of contents with mixedbread-ai rerank models.

    :param client: The mixedbread-ai client to use for reranking
    :param query: The query to use for reranking
    :param documents: The list of contents to rerank
    :param ids: The list of ids corresponding to the documents
    :param top_k: The number of passages to be retrieved
    :param model: The model name for mixedbread-ai rerank.
        You can choose between "mixedbread-ai/mxbai-rerank-large-v1" and "mixedbread-ai/mxbai-rerank-base-v1".
        Default is "mixedbread-ai/mxbai-rerank-large-v1".
    :return: Tuple of lists containing the reranked contents, ids, and scores
    """

    results = await client.reranking(
        query=query,
        input=documents,
        top_k=top_k,
        model=model,
    )
    reranked_scores: List[float] = list(map(lambda x: x.score, results.data))
    reranked_scores_float = list(map(float, reranked_scores))
    indices = list(map(lambda x: x.index, results.data))
    reranked_contents = list(map(lambda x: documents[x], indices))
    reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
    return reranked_contents, reranked_ids, reranked_scores_float
190  autorag/nodes/passagereranker/monot5.py  Normal file
@@ -0,0 +1,190 @@
|
||||
from itertools import chain
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
prediction_tokens = {
|
||||
"castorini/monot5-base-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-base-msmarco-10k": ["▁false", "▁true"],
|
||||
"castorini/monot5-large-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-large-msmarco-10k": ["▁false", "▁true"],
|
||||
"castorini/monot5-base-med-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-3b-med-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-3b-msmarco-10k": ["▁false", "▁true"],
|
||||
"unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-10k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-100k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/mt5-base-en-pt-msmarco-v2": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-en-pt-msmarco-v1": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-10k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-100k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/mt5-3B-mmarco-en-pt": ["▁", "▁true"],
|
||||
"unicamp-dl/mt5-13b-mmarco-100k": ["▁", "▁true"],
|
||||
}
|
||||
|
||||
|
||||
class MonoT5(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model_name: str = "castorini/monot5-3b-msmarco-10k",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the MonoT5 reranker.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param model_name: The name of the MonoT5 model to use for reranking
|
||||
Note: default model name is 'castorini/monot5-3b-msmarco-10k'
|
||||
If there is a '/' in the model name parameter,
|
||||
when we create the file to store the results, the path will be twisted because of the '/'.
|
||||
Therefore, it will be received as '_' instead of '/'.
|
||||
:param kwargs: The extra arguments for the MonoT5 reranker
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
except ImportError:
|
||||
raise ImportError("For using MonoT5 Reranker, please install torch first.")
|
||||
# replace '_' to '/'
|
||||
if "_" in model_name:
|
||||
model_name = model_name.replace("_", "/")
|
||||
# Load the tokenizer and model from the pre-trained MonoT5 model
|
||||
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
|
||||
model_params = pop_params(T5ForConditionalGeneration.from_pretrained, kwargs)
|
||||
self.model = T5ForConditionalGeneration.from_pretrained(
|
||||
model_name, **model_params
|
||||
).eval()
|
||||
|
||||
# Determine the device to run the model on (GPU if available, otherwise CPU)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.model.to(self.device)
|
||||
|
||||
token_false, token_true = prediction_tokens[model_name]
|
||||
self.token_false_id = self.tokenizer.convert_tokens_to_ids(token_false)
|
||||
self.token_true_id = self.tokenizer.convert_tokens_to_ids(token_true)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 3)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using MonoT5.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
# Build query-document prompts in the MonoT5 input format
|
||||
|
||||
nested_list = [
|
||||
list(map(lambda x: [f"Query: {query} Document: {x}"], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
monot5_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
token_false_id=self.token_false_id,
|
||||
token_true_id=self.token_true_id,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def monot5_run_model(
|
||||
input_texts,
|
||||
model,
|
||||
batch_size: int,
|
||||
tokenizer,
|
||||
device,
|
||||
token_false_id,
|
||||
token_true_id,
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("For using MonoT5 Reranker, please install torch first.")
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
flattened_batch_texts = list(chain.from_iterable(batch_texts))
|
||||
input_encodings = tokenizer(
|
||||
flattened_batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=512,
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(
|
||||
input_ids=input_encodings["input_ids"],
|
||||
attention_mask=input_encodings["attention_mask"],
|
||||
output_scores=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
|
||||
# Extract logits for the 'false' and 'true' tokens from the model's output
|
||||
logits = outputs.scores[-1][:, [token_false_id, token_true_id]]
|
||||
# Calculate the softmax probability of the 'true' token
|
||||
probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
|
||||
results.extend(probs.tolist())
|
||||
return results
|
||||
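A minimal usage sketch of the MonoT5 module above. It assumes `previous_result` is the DataFrame produced by a prior retrieval node (with the 'query', 'retrieved_contents', 'retrieved_ids', and 'retrieve_scores' columns that cast_to_run() checks for); the project directory and batch size are illustrative.

from autorag.nodes.passagereranker import MonoT5

# '/' in the checkpoint name is written as '_' and converted back to '/' internally.
reranker = MonoT5(project_dir="./my_project", model_name="castorini_monot5-3b-msmarco-10k")
reranked_df = reranker.pure(previous_result, top_k=3, batch=32)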
191
autorag/nodes/passagereranker/openvino.py
Normal file
@@ -0,0 +1,191 @@
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
|
||||
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class OpenVINOReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model: str = "BAAI/bge-reranker-large",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(project_dir)
|
||||
|
||||
try:
|
||||
from huggingface_hub import HfApi
|
||||
from transformers import AutoTokenizer
|
||||
except ImportError as e:
|
||||
raise ValueError(
|
||||
"Could not import huggingface_hub python package. "
|
||||
"Please install it with: "
|
||||
"`pip install -U huggingface_hub`."
|
||||
) from e
|
||||
|
||||
def require_model_export(
|
||||
model_id: str, revision: Any = None, subfolder: Any = None
|
||||
) -> bool:
|
||||
model_dir = Path(model_id)
|
||||
if subfolder is not None:
|
||||
model_dir = model_dir / subfolder
|
||||
if model_dir.is_dir():
|
||||
return (
|
||||
not (model_dir / "openvino_model.xml").exists()
|
||||
or not (model_dir / "openvino_model.bin").exists()
|
||||
)
|
||||
hf_api = HfApi()
|
||||
try:
|
||||
model_info = hf_api.model_info(model_id, revision=revision or "main")
|
||||
normalized_subfolder = (
|
||||
None if subfolder is None else Path(subfolder).as_posix()
|
||||
)
|
||||
model_files = [
|
||||
file.rfilename
|
||||
for file in model_info.siblings
|
||||
if normalized_subfolder is None
|
||||
or file.rfilename.startswith(normalized_subfolder)
|
||||
]
|
||||
ov_model_path = (
|
||||
"openvino_model.xml"
|
||||
if subfolder is None
|
||||
else f"{normalized_subfolder}/openvino_model.xml"
|
||||
)
|
||||
return (
|
||||
ov_model_path not in model_files
|
||||
or ov_model_path.replace(".xml", ".bin") not in model_files
|
||||
)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
try:
|
||||
from optimum.intel.openvino import OVModelForSequenceClassification
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install optimum package to use OpenVINOReranker"
|
||||
"pip install 'optimum[openvino,nncf]'"
|
||||
)
|
||||
|
||||
model_kwargs = pop_params(
|
||||
OVModelForSequenceClassification.from_pretrained, kwargs
|
||||
)
|
||||
|
||||
if require_model_export(model):
|
||||
# no OpenVINO IR found; export the model to OpenVINO on load
|
||||
self.model = OVModelForSequenceClassification.from_pretrained(
|
||||
model, export=True, **model_kwargs
|
||||
)
|
||||
else:
|
||||
# OpenVINO IR already available; load it directly
|
||||
self.model = OVModelForSequenceClassification.from_pretrained(
|
||||
model, **model_kwargs
|
||||
)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 3)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using an OpenVINO-optimized cross-encoder model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
# Build (query, document) pairs for the cross-encoder
|
||||
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
openvino_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def openvino_run_model(
|
||||
input_texts,
|
||||
model,
|
||||
batch_size: int,
|
||||
tokenizer,
|
||||
):
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
input_tensors = tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
outputs = model(**input_tensors, return_dict=True)
|
||||
if outputs[0].shape[1] > 1:
|
||||
scores = outputs[0][:, 1]
|
||||
else:
|
||||
scores = outputs[0].flatten()
|
||||
|
||||
scores = list(map(float, (1 / (1 + np.exp(-np.array(scores))))))
|
||||
results.extend(scores)
|
||||
return results
|
||||
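A sketch of how the OpenVINO reranker above might be used; the model id and parameter values are illustrative, and `previous_result` is again assumed to be the output of a prior retrieval node.

from autorag.nodes.passagereranker import OpenVINOReranker

# Exports the Hugging Face checkpoint to OpenVINO IR on load if no IR is available for it.
reranker = OpenVINOReranker(project_dir="./my_project", model="BAAI/bge-reranker-large")
reranked_df = reranker.pure(previous_result, top_k=5, batch=32)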
31
autorag/nodes/passagereranker/pass_reranker.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassReranker(BasePassageReranker):
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
top_k = kwargs.pop("top_k")
|
||||
|
||||
_, contents_list, scores_list, ids_list = self.cast_to_run(previous_result)
|
||||
return self._pure(contents_list, scores_list, ids_list, top_k)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
):
|
||||
"""
|
||||
Do not perform reranking.
|
||||
Return the given top-k passages as is.
|
||||
"""
|
||||
contents_list = list(map(lambda x: x[:top_k], contents_list))
|
||||
scores_list = list(map(lambda x: x[:top_k], scores_list))
|
||||
ids_list = list(map(lambda x: x[:top_k], ids_list))
|
||||
return contents_list, ids_list, scores_list
|
||||
170
autorag/nodes/passagereranker/rankgpt.py
Normal file
@@ -0,0 +1,170 @@
|
||||
from typing import List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from llama_index.core.base.llms.types import ChatMessage, ChatResponse
|
||||
from llama_index.core.llms import LLM
|
||||
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
|
||||
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
|
||||
from llama_index.core.utils import print_text
|
||||
from llama_index.llms.openai import OpenAI
|
||||
|
||||
from autorag import generator_models
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class RankGPT(BasePassageReranker):
|
||||
def __init__(
|
||||
self, project_dir: str, llm: Optional[Union[str, LLM]] = None, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the RankGPT reranker.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param llm: The LLM model to use for RankGPT rerank.
|
||||
It can be a LlamaIndex LLM instance or the name of a model registered in AutoRAG's generator_models.
|
||||
Default is the OpenAI model with gpt-4o-mini.
|
||||
:param kwargs: The keyword arguments for the LLM model.
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
if llm is None:
|
||||
self.llm = OpenAI(model="gpt-4o-mini")
|
||||
else:
|
||||
if not isinstance(llm, LLM):
|
||||
llm_class = generator_models[llm]
|
||||
llm_param = pop_params(llm_class.__init__, kwargs)
|
||||
self.llm = llm_class(**llm_param)
|
||||
else:
|
||||
self.llm = llm
|
||||
|
||||
def __del__(self):
|
||||
del self.llm
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 1)
|
||||
verbose = kwargs.get("verbose", False)
|
||||
rankgpt_rerank_prompt = kwargs.get("rankgpt_rerank_prompt", None)
|
||||
batch = kwargs.get("batch", 16)
|
||||
return self._pure(
|
||||
queries=queries,
|
||||
contents_list=contents,
|
||||
scores_list=scores,
|
||||
ids_list=ids,
|
||||
top_k=top_k,
|
||||
verbose=verbose,
|
||||
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
|
||||
batch=batch,
|
||||
)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
verbose: bool = False,
|
||||
rankgpt_rerank_prompt: Optional[str] = None,
|
||||
batch: int = 16,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank given context paragraphs using RankGPT.
|
||||
Return pseudo scores, since the actual scores are not available on RankGPT.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param scores_list: The list of lists of scores retrieved from the initial ranking
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param verbose: Whether to print intermediate steps.
|
||||
:param rankgpt_rerank_prompt: The prompt template for RankGPT rerank.
|
||||
Default is RankGPT's default prompt.
|
||||
:param batch: The number of queries to be processed in a batch.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
query_bundles = list(map(lambda query: QueryBundle(query_str=query), queries))
|
||||
nodes_list = [
|
||||
list(
|
||||
map(
|
||||
lambda x: NodeWithScore(node=TextNode(text=x[0]), score=x[1]),
|
||||
zip(content_list, score_list),
|
||||
)
|
||||
)
|
||||
for content_list, score_list in zip(contents_list, scores_list)
|
||||
]
|
||||
|
||||
reranker = AsyncRankGPTRerank(
|
||||
top_n=top_k,
|
||||
llm=self.llm,
|
||||
verbose=verbose,
|
||||
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
|
||||
)
|
||||
|
||||
tasks = [
|
||||
reranker.async_postprocess_nodes(nodes, query, ids)
|
||||
for nodes, query, ids in zip(nodes_list, query_bundles, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
rerank_result = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
content_result = [
|
||||
list(map(lambda x: x.node.text, res[0])) for res in rerank_result
|
||||
]
|
||||
score_result = [
|
||||
np.linspace(1.0, 0.0, len(res[0])).tolist() for res in rerank_result
|
||||
]
|
||||
id_result = [res[1] for res in rerank_result]
|
||||
|
||||
del reranker
|
||||
|
||||
return content_result, id_result, score_result
|
||||
|
||||
|
||||
class AsyncRankGPTRerank(RankGPTRerank):
|
||||
async def async_run_llm(self, messages: Sequence[ChatMessage]) -> ChatResponse:
|
||||
return await self.llm.achat(messages)
|
||||
|
||||
async def async_postprocess_nodes(
|
||||
self,
|
||||
nodes: List[NodeWithScore],
|
||||
query_bundle: QueryBundle,
|
||||
ids: Optional[List[str]] = None,
|
||||
) -> Tuple[List[NodeWithScore], List[str]]:
|
||||
if ids is None:
|
||||
ids = [str(i) for i in range(len(nodes))]
|
||||
|
||||
items = {
|
||||
"query": query_bundle.query_str,
|
||||
"hits": [{"content": node.get_content()} for node in nodes],
|
||||
}
|
||||
|
||||
messages = self.create_permutation_instruction(item=items)
|
||||
permutation = await self.async_run_llm(messages=messages)
|
||||
if permutation.message is not None and permutation.message.content is not None:
|
||||
rerank_ranks = self._receive_permutation(
|
||||
items, str(permutation.message.content)
|
||||
)
|
||||
if self.verbose:
|
||||
print_text(f"After Reranking, new rank list for nodes: {rerank_ranks}")
|
||||
|
||||
initial_results: List[NodeWithScore] = []
|
||||
id_results = []
|
||||
|
||||
for idx in rerank_ranks:
|
||||
initial_results.append(
|
||||
NodeWithScore(node=nodes[idx].node, score=nodes[idx].score)
|
||||
)
|
||||
id_results.append(ids[idx])
|
||||
return initial_results[: self.top_n], id_results[: self.top_n]
|
||||
else:
|
||||
return nodes[: self.top_n], ids[: self.top_n]
|
||||
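Because RankGPT only returns an ordering, the scores produced by _pure above are synthetic. A small sketch of what those pseudo scores look like for a list of four reranked passages:

import numpy as np

# Evenly spaced pseudo scores from 1.0 down to 0.0, one per reranked passage.
pseudo_scores = np.linspace(1.0, 0.0, 4).tolist()
print(pseudo_scores)  # [1.0, 0.666..., 0.333..., 0.0]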
145
autorag/nodes/passagereranker/run.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.run import evaluate_retrieval_node
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.utils.util import apply_recursive, to_list
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def run_passage_reranker_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among passage reranker node results.
|
||||
|
||||
:param modules: Passage reranker modules to run.
|
||||
:param module_params: Passage reranker module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
It can be the result of a retrieval or another reranker module.
|
||||
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for passage reranker node.
|
||||
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
|
||||
You can skip evaluation when you use only one module and a module parameter.
|
||||
:return: The best result dataframe with previous result columns.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
qa_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)
|
||||
retrieval_gt = qa_df["retrieval_gt"].tolist()
|
||||
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
|
||||
|
||||
# make rows to metric_inputs
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
|
||||
for ret_gt, query, gen_gt in zip(
|
||||
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
|
||||
)
|
||||
]
|
||||
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
# run metrics before filtering
|
||||
if strategies.get("metrics") is None:
|
||||
raise ValueError(
|
||||
"You must at least one metrics for passage_reranker evaluation."
|
||||
)
|
||||
results = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
metric_inputs,
|
||||
strategies.get("metrics"),
|
||||
),
|
||||
results,
|
||||
)
|
||||
)
|
||||
|
||||
# save results to folder
|
||||
save_dir = os.path.join(node_line_dir, "passage_reranker") # node name
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
**{
|
||||
f"passage_reranker_{metric}": list(
|
||||
map(lambda result: result[metric].mean(), results)
|
||||
)
|
||||
for metric in strategies.get("metrics")
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# filter by strategies
|
||||
if strategies.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, strategies["speed_threshold"], filenames
|
||||
)
|
||||
selected_result, selected_filename = select_best(
|
||||
results,
|
||||
strategies.get("metrics"),
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
# change metric name columns to passage_reranker_metric_name
|
||||
selected_result = selected_result.rename(
|
||||
columns={
|
||||
metric_name: f"passage_reranker_{metric_name}"
|
||||
for metric_name in strategies["metrics"]
|
||||
}
|
||||
)
|
||||
# drop retrieval result columns in previous_result
|
||||
previous_result = previous_result.drop(
|
||||
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
best_result = pd.concat([previous_result, selected_result], axis=1)
|
||||
|
||||
# add 'is_best' column to summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == selected_filename
|
||||
|
||||
# save files
|
||||
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
best_result.to_parquet(
|
||||
os.path.join(
|
||||
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
|
||||
),
|
||||
index=False,
|
||||
)
|
||||
return best_result
|
||||
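A sketch of how run_passage_reranker_node above might be called; the module classes, parameters, paths, and strategy values are illustrative only, and `retrieval_result_df` stands in for the previous node's output.

from autorag.nodes.passagereranker import MonoT5, PassReranker

best_df = run_passage_reranker_node(
    modules=[PassReranker, MonoT5],
    module_params=[{"top_k": 3}, {"top_k": 3, "batch": 32}],
    previous_result=retrieval_result_df,  # output of the retrieval node
    node_line_dir="./my_project/0/retrieve_node_line",
    strategies={
        "metrics": ["retrieval_f1", "retrieval_recall", "retrieval_precision"],
        "speed_threshold": 10,  # modules with a higher average per-row latency are filtered out
        "strategy": "mean",
    },
)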
129
autorag/nodes/passagereranker/sentence_transformer.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
flatten_apply,
|
||||
make_batch,
|
||||
select_top_k,
|
||||
sort_by_scores,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class SentenceTransformerReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the Sentence Transformer reranker node.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param model_name: The name of the Sentence Transformer model to use for reranking
|
||||
Default is "cross-encoder/ms-marco-MiniLM-L-2-v2"
|
||||
:param kwargs: The CrossEncoder parameters
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
try:
|
||||
import torch
|
||||
from sentence_transformers import CrossEncoder
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
|
||||
)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model_params = pop_params(CrossEncoder.__init__, kwargs)
|
||||
self.model = CrossEncoder(model_name, device=self.device, **model_params)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
|
||||
|
||||
:param previous_result: The previous result
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: pd DataFrame containing the reranked contents, ids, and scores
|
||||
"""
|
||||
queries, contents_list, scores_list, ids_list = self.cast_to_run(
|
||||
previous_result
|
||||
)
|
||||
top_k = kwargs.get("top_k", 1)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents_list, ids_list, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
rerank_scores = flatten_apply(
|
||||
sentence_transformer_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def sentence_transformer_run_model(input_texts, model, batch_size: int):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
|
||||
)
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
with torch.no_grad():
|
||||
pred_scores = model.predict(sentences=batch_texts, apply_softmax=True)
|
||||
results.extend(pred_scores.tolist())
|
||||
return results
|
||||
1
autorag/nodes/passagereranker/tart/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .tart import Tart
|
||||
152
autorag/nodes/passagereranker/tart/modeling_enc_t5.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import copy
|
||||
|
||||
from transformers.modeling_outputs import SequenceClassifierOutput
|
||||
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
|
||||
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
|
||||
|
||||
from autorag.utils.util import empty_cuda_cache
|
||||
|
||||
|
||||
class EncT5ForSequenceClassification(T5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"encoder\.embed_tokens\.weight",
|
||||
]
|
||||
|
||||
def __init__(self, config: T5Config, dropout=0.1):
|
||||
super().__init__(config)
|
||||
try:
|
||||
from torch import nn
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
self.num_labels = config.num_labels
|
||||
self.config = config
|
||||
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
def parallelize(self, device_map=None):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
self.device_map = (
|
||||
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None
|
||||
else device_map
|
||||
)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.classifier = self.classifier.to(self.encoder.first_device)
|
||||
self.model_parallel = True
|
||||
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.encoder = self.encoder.to("cpu")
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
empty_cuda_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
||||
class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
return_dict = (
|
||||
return_dict if return_dict is not None else self.config.use_return_dict
|
||||
)
|
||||
|
||||
outputs = self.encoder(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
pooled_output = hidden_states[:, 0, :] # Take bos token (equiv. to <s>)
|
||||
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
if self.config.problem_type is None:
|
||||
if self.num_labels == 1:
|
||||
self.config.problem_type = "regression"
|
||||
elif self.num_labels > 1 and (
|
||||
labels.dtype == torch.long or labels.dtype == torch.int
|
||||
):
|
||||
self.config.problem_type = "single_label_classification"
|
||||
else:
|
||||
self.config.problem_type = "multi_label_classification"
|
||||
|
||||
if self.config.problem_type == "regression":
|
||||
loss_fct = MSELoss()
|
||||
if self.num_labels == 1:
|
||||
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
||||
else:
|
||||
loss = loss_fct(logits, labels)
|
||||
elif self.config.problem_type == "single_label_classification":
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
elif self.config.problem_type == "multi_label_classification":
|
||||
loss_fct = BCEWithLogitsLoss()
|
||||
loss = loss_fct(logits, labels)
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
139
autorag/nodes/passagereranker/tart/tart.py
Normal file
@@ -0,0 +1,139 @@
|
||||
from itertools import chain
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.nodes.passagereranker.tart.modeling_enc_t5 import (
|
||||
EncT5ForSequenceClassification,
|
||||
)
|
||||
from autorag.nodes.passagereranker.tart.tokenization_enc_t5 import EncT5Tokenizer
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class Tart(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch first to use TART reranker."
|
||||
)
|
||||
model_name = "facebook/tart-full-flan-t5-xl"
|
||||
self.model = EncT5ForSequenceClassification.from_pretrained(model_name)
|
||||
self.tokenizer = EncT5Tokenizer.from_pretrained(model_name)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.model = self.model.to(self.device)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
instruction = kwargs.pop("instruction", "Find passage to answer given question")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, instruction, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
instruction: str = "Find passage to answer given question",
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using Tart.
|
||||
TART is an instruction-aware passage reranker (https://github.com/facebookresearch/tart).
You can steer the reranking with a natural-language instruction.
|
||||
The default model is facebook/tart-full-flan-t5-xl.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param instruction: The instruction for reranking.
|
||||
Note: the default instruction is "Find passage to answer given question",
which is the default instruction from the TART paper.
Pass a different instruction through this parameter to change the reranking behavior.
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
[["{} [SEP] {}".format(instruction, query)] for _ in contents]
|
||||
for query, contents in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
tart_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
contents_list=contents_list,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def tart_run_model(
|
||||
input_texts, contents_list, model, batch_size: int, tokenizer, device
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch first to use TART reranker."
|
||||
)
|
||||
flattened_texts = list(chain.from_iterable(input_texts))
|
||||
flattened_contents = list(chain.from_iterable(contents_list))
|
||||
batch_input_texts = make_batch(flattened_texts, batch_size)
|
||||
batch_contents_list = make_batch(flattened_contents, batch_size)
|
||||
results = []
|
||||
for batch_texts, batch_contents in zip(batch_input_texts, batch_contents_list):
|
||||
feature = tokenizer(
|
||||
batch_texts,
|
||||
batch_contents,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
with torch.no_grad():
|
||||
pred_scores = model(**feature).logits
|
||||
normalized_scores = [
|
||||
float(score[1]) for score in F.softmax(pred_scores, dim=1)
|
||||
]
|
||||
results.extend(normalized_scores)
|
||||
return results
|
||||
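A sketch of instruction-guided reranking with the Tart module above; the instruction text, path, and top_k are illustrative, and `previous_result` is assumed to come from a prior retrieval node.

from autorag.nodes.passagereranker.tart import Tart

reranker = Tart(project_dir="./my_project")
reranked_df = reranker.pure(
    previous_result,
    top_k=3,
    instruction="Retrieve a passage that answers the given question about finance",
)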
112
autorag/nodes/passagereranker/tart/tokenization_enc_t5.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from transformers import T5Tokenizer
|
||||
|
||||
|
||||
class EncT5Tokenizer(T5Tokenizer):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
bos_token="<s>",
|
||||
eos_token="</s>",
|
||||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
extra_ids=100,
|
||||
additional_special_tokens=None,
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
extra_ids=extra_ids,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
sp_model_kwargs=sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self,
|
||||
token_ids_0: List[int],
|
||||
token_ids_1: Optional[List[int]] = None,
|
||||
already_has_special_tokens: bool = False,
|
||||
) -> List[int]:
|
||||
"""
|
||||
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer `prepare_for_model` method.
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
Returns:
|
||||
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
"""
|
||||
if already_has_special_tokens:
|
||||
return super().get_special_tokens_mask(
|
||||
token_ids_0=token_ids_0,
|
||||
token_ids_1=token_ids_1,
|
||||
already_has_special_tokens=True,
|
||||
)
|
||||
|
||||
# normal case: some special tokens
|
||||
if token_ids_1 is None:
|
||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
|
||||
use of token type ids, therefore a list of zeros is returned.
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`List[int]`: List of zeros.
|
||||
"""
|
||||
bos = [self.bos_token_id]
|
||||
eos = [self.eos_token_id]
|
||||
|
||||
if token_ids_1 is None:
|
||||
return len(bos + token_ids_0 + eos) * [0]
|
||||
return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0]
|
||||
|
||||
def build_inputs_with_special_tokens(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
||||
adding special tokens. A sequence has the following format:
|
||||
- single sequence: `<s> X </s>`
|
||||
- pair of sequences: `<s> A </s> B </s>`
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs to which the special tokens will be added.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
||||
"""
|
||||
if token_ids_1 is None:
|
||||
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
||||
else:
|
||||
return (
|
||||
[self.bos_token_id]
|
||||
+ token_ids_0
|
||||
+ [self.eos_token_id]
|
||||
+ token_ids_1
|
||||
+ [self.eos_token_id]
|
||||
)
|
||||
72
autorag/nodes/passagereranker/time_reranker.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe, fetch_contents
|
||||
|
||||
|
||||
class TimeReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
self.corpus_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
|
||||
times = [
|
||||
[time["last_modified_datetime"] for time in time_list]
|
||||
for time_list in metadatas
|
||||
]
|
||||
top_k = kwargs.pop("top_k")
|
||||
return self._pure(contents, scores, ids, top_k, times)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
time_list: List[List[datetime]],
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank the passages based solely on their datetime, newest first.
It uses the 'last_modified_datetime' key in the corpus metadata,
so each metadata entry in the corpus data file must contain {'last_modified_datetime': datetime.datetime}.
|
||||
|
||||
:param contents_list: The list of lists of contents
|
||||
:param scores_list: The list of lists of scores from the initial ranking
|
||||
:param ids_list: The list of lists of ids
|
||||
:param top_k: The number of passages to be retrieved after reranking
|
||||
:param time_list: The metadata list of lists of datetime.datetime
|
||||
It automatically extracts the 'last_modified_datetime' key from the metadata in the corpus data.
|
||||
:return: The reranked contents, ids, and scores
|
||||
"""
|
||||
|
||||
def sort_row(contents, scores, ids, time, top_k):
|
||||
combined = list(zip(contents, scores, ids, time))
|
||||
combined.sort(key=lambda x: x[3], reverse=True)
|
||||
sorted_contents, sorted_scores, sorted_ids, _ = zip(*combined)
|
||||
return (
|
||||
list(sorted_contents)[:top_k],
|
||||
list(sorted_scores)[:top_k],
|
||||
list(sorted_ids)[:top_k],
|
||||
)
|
||||
|
||||
reranked_contents, reranked_scores, reranked_ids = zip(
|
||||
*map(
|
||||
sort_row,
|
||||
contents_list,
|
||||
scores_list,
|
||||
ids_list,
|
||||
time_list,
|
||||
[top_k] * len(contents_list),
|
||||
)
|
||||
)
|
||||
|
||||
return list(reranked_contents), list(reranked_ids), list(reranked_scores)
|
||||
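A sketch of the corpus metadata layout that TimeReranker above depends on, with illustrative documents. Only the 'metadata' column is referenced directly by the code; the 'doc_id' and 'contents' column names are assumptions about the corpus parquet schema.

from datetime import datetime
import pandas as pd

corpus_df = pd.DataFrame({
    "doc_id": ["doc-1", "doc-2"],  # assumed id column name
    "contents": ["an older passage ...", "a newer passage ..."],
    "metadata": [
        {"last_modified_datetime": datetime(2023, 1, 1)},
        {"last_modified_datetime": datetime(2024, 6, 1)},
    ],
})
corpus_df.to_parquet("./my_project/data/corpus.parquet", index=False)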
160
autorag/nodes/passagereranker/upr.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe
|
||||
from autorag.utils.util import select_top_k, sort_by_scores, empty_cuda_cache
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class Upr(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
use_bf16: bool = False,
|
||||
prefix_prompt: str = "Passage: ",
|
||||
suffix_prompt: str = "Please write a question based on this passage.",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the UPR reranker node.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param use_bf16: Whether to use bfloat16 for the model. Default is False.
|
||||
:param prefix_prompt: The prefix prompt for the language model that generates question for reranking.
|
||||
Default is "Passage: ".
|
||||
The prefix prompt serves as the initial context or instruction for the language model.
|
||||
It sets the stage for what is expected in the output
|
||||
:param suffix_prompt: The suffix prompt for the language model that generates question for reranking.
|
||||
Default is "Please write a question based on this passage.".
|
||||
The suffix prompt provides a cue or a closing instruction to the language model,
|
||||
signaling how to conclude the generated text or what format to follow at the end.
|
||||
:param kwargs: Extra arguments
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
|
||||
self.scorer = UPRScorer(
|
||||
suffix_prompt=suffix_prompt, prefix_prompt=prefix_prompt, use_bf16=use_bf16
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
del self.scorer
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
return self._pure(queries, contents, ids, top_k)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using UPR.
|
||||
UPR is an unsupervised passage reranker (https://github.com/DevSinghSachan/unsupervised-passage-reranking).
The language model scores how likely the query is given each passage, and the passages are reranked by that likelihood.
|
||||
The default model is t5-large.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"query": queries,
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
}
|
||||
)
|
||||
|
||||
df["scores"] = df.apply(
|
||||
lambda row: self.scorer.compute(
|
||||
query=row["query"], contents=row["contents"]
|
||||
),
|
||||
axis=1,
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
lambda x: sort_by_scores(x, reverse=False), axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
class UPRScorer:
|
||||
def __init__(self, suffix_prompt: str, prefix_prompt: str, use_bf16: bool = False):
|
||||
try:
|
||||
import torch
|
||||
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch to use UPRReranker."
|
||||
)
|
||||
model_name = "t5-large"
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
|
||||
self.model = T5ForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
|
||||
).to(self.device)
|
||||
self.suffix_prompt = suffix_prompt
|
||||
self.prefix_prompt = prefix_prompt
|
||||
|
||||
def compute(self, query: str, contents: List[str]) -> List[float]:
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch to use UPRReranker."
|
||||
)
|
||||
query_token = self.tokenizer(
|
||||
query, max_length=128, truncation=True, return_tensors="pt"
|
||||
)
|
||||
prompts = list(
|
||||
map(
|
||||
lambda content: f"{self.prefix_prompt} {content} {self.suffix_prompt}",
|
||||
contents,
|
||||
)
|
||||
)
|
||||
prompt_token_outputs = self.tokenizer(
|
||||
prompts,
|
||||
padding="longest",
|
||||
max_length=512,
|
||||
pad_to_multiple_of=8,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
query_input_ids = torch.repeat_interleave(
|
||||
query_token["input_ids"], len(contents), dim=0
|
||||
).to(self.device)
|
||||
|
||||
with torch.no_grad():
|
||||
logits = self.model(
|
||||
input_ids=prompt_token_outputs["input_ids"].to(self.device),
|
||||
attention_mask=prompt_token_outputs["attention_mask"].to(self.device),
|
||||
labels=query_input_ids,
|
||||
).logits
|
||||
log_softmax = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
nll = -log_softmax.gather(2, query_input_ids.unsqueeze(2)).squeeze(2)
|
||||
avg_nll = torch.sum(nll, dim=1)
|
||||
return avg_nll.tolist()
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
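A sketch of the prompt each passage is scored with by UPRScorer.compute above, using the default prefix and suffix prompts; the passage text is illustrative.

prefix_prompt = "Passage: "
suffix_prompt = "Please write a question based on this passage."
content = "The Eiffel Tower was completed in 1889 and stands in Paris."

prompt = f"{prefix_prompt} {content} {suffix_prompt}"
# compute() sums the negative log-likelihood of the query tokens given this prompt,
# so a lower score means the passage is more relevant and is ranked higher.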
109
autorag/nodes/passagereranker/voyageai.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
import pandas as pd
|
||||
import voyageai
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import result_to_dataframe, get_event_loop, process_batch
|
||||
|
||||
|
||||
class VoyageAIReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
api_key = kwargs.pop("api_key", None)
|
||||
api_key = os.getenv("VOYAGE_API_KEY", None) if api_key is None else api_key
|
||||
if api_key is None:
|
||||
raise KeyError(
|
||||
"Please set the API key for VoyageAI rerank in the environment variable VOYAGE_API_KEY "
|
||||
"or directly set it on the config YAML file."
|
||||
)
|
||||
|
||||
self.voyage_client = voyageai.AsyncClient(api_key=api_key)
|
||||
|
||||
def __del__(self):
|
||||
del self.voyage_client
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 8)
|
||||
model = kwargs.pop("model", "rerank-2")
|
||||
truncation = kwargs.pop("truncation", True)
|
||||
return self._pure(queries, contents, ids, top_k, model, batch, truncation)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
model: str = "rerank-2",
|
||||
batch: int = 8,
|
||||
truncation: bool = True,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with VoyageAI rerank models.
|
||||
You can get the API key from https://docs.voyageai.com/docs/api-key-and-installation and set it in the environment variable VOYAGE_API_KEY.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param model: The model name for VoyageAI rerank.
|
||||
You can choose between "rerank-2" and "rerank-2-lite".
|
||||
Default is "rerank-2".
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
tasks = [
|
||||
voyageai_rerank_pure(
|
||||
self.voyage_client, model, query, contents, ids, top_k, truncation
|
||||
)
|
||||
for query, contents, ids in zip(queries, contents_list, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch))
|
||||
|
||||
content_result, id_result, score_result = zip(*results)
|
||||
|
||||
return list(content_result), list(id_result), list(score_result)
|
||||
|
||||
|
||||
async def voyageai_rerank_pure(
|
||||
voyage_client: voyageai.AsyncClient,
|
||||
model: str,
|
||||
query: str,
|
||||
documents: List[str],
|
||||
ids: List[str],
|
||||
top_k: int,
|
||||
truncation: bool = True,
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
"""
|
||||
Rerank a list of contents with VoyageAI rerank models.
|
||||
|
||||
:param voyage_client: The Voyage Client to use for reranking
|
||||
:param model: The model name for VoyageAI rerank
|
||||
:param query: The query to use for reranking
|
||||
:param documents: The list of contents to rerank
|
||||
:param ids: The list of ids corresponding to the documents
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
rerank_results = await voyage_client.rerank(
|
||||
model=model,
|
||||
query=query,
|
||||
documents=documents,
|
||||
top_k=top_k,
|
||||
truncation=truncation,
|
||||
)
|
||||
reranked_scores: List[float] = list(
|
||||
map(lambda x: x.relevance_score, rerank_results.results)
|
||||
)
|
||||
indices = list(map(lambda x: x.index, rerank_results.results))
|
||||
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
|
||||
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
|
||||
return reranked_contents, reranked_ids, reranked_scores
|
||||
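A sketch of the setup the VoyageAI reranker above expects; the API key value, project directory, and parameter choices are illustrative, and `previous_result` is assumed to come from a prior retrieval node.

import os
os.environ["VOYAGE_API_KEY"] = "<your-voyage-api-key>"

from autorag.nodes.passagereranker import VoyageAIReranker

reranker = VoyageAIReranker(project_dir="./my_project")
reranked_df = reranker.pure(previous_result, top_k=3, model="rerank-2", truncation=True)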