Fix Dockerfile build issue

kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

@@ -0,0 +1,4 @@
from .llama_index_llm import LlamaIndexLLM
from .openai_llm import OpenAILLM
from .vllm import Vllm
from .vllm_api import VllmAPI

@@ -0,0 +1,103 @@
import abc
import functools
import logging
from pathlib import Path
from typing import Union, Tuple, List
import pandas as pd
from llama_index.core.output_parsers import PydanticOutputParser
from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe
logger = logging.getLogger("AutoRAG")
class BaseGenerator(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, llm: str, *args, **kwargs):
logger.info(f"Initialize generator node - {self.__class__.__name__}")
self.llm = llm
def __del__(self):
logger.info(f"Deleting generator module - {self.__class__.__name__}")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running generator node - {self.__class__.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
return prompts
def structured_output(self, prompts: List[str], output_cls):
response, _, _ = self._pure(prompts)
parser = PydanticOutputParser(output_cls)
result = []
for res in response:
try:
result.append(parser.parse(res))
except Exception as e:
logger.warning(
f"Error parsing response: {e} \nSo returning None instead in this case."
)
result.append(None)
return result
@abc.abstractmethod
async def astream(self, prompt: str, **kwargs):
pass
@abc.abstractmethod
def stream(self, prompt: str, **kwargs):
pass
def generator_node(func):
@functools.wraps(func)
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def wrapper(
project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
This decorator makes a generator module to be a node.
It automatically extracts prompts from previous_result and runs the generator function.
Plus, it retrieves the llm instance from autorag.generator_models.
:param project_dir: The project directory.
:param previous_result: The previous result that contains prompts,
:param llm: The llm name that you want to use.
:param kwargs: The extra parameters for initializing the llm instance.
:return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs.
Each column is "generated_texts", "generated_tokens", and "generated_log_probs".
"""
logger.info(f"Running generator node - {func.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
if func.__name__ == "llama_index_llm":
if llm not in generator_models:
raise ValueError(
f"{llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names in autorag.generator_models."
)
batch = kwargs.pop("batch", 16)
if llm == "huggingfacellm":
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"`model` or `model_name` parameter must be provided for using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
llm_instance = generator_models[llm](**kwargs)
result = func(prompts=prompts, llm=llm_instance, batch=batch)
del llm_instance
return result
else:
return func(prompts=prompts, llm=llm, **kwargs)
return wrapper
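
For illustration, a minimal sketch of how the generator_node decorator above could wrap a custom module function. The echo_generator function, project path, and prompt are hypothetical and only show the calling convention; a real module would call the llm instead of echoing.

import pandas as pd
from autorag.nodes.generator.base import generator_node

@generator_node
def echo_generator(prompts, llm, **kwargs):
    # Toy body: echo the prompts back with dummy token ids and log probs.
    return prompts, [[0] for _ in prompts], [[0.0] for _ in prompts]

previous_result = pd.DataFrame({"prompts": ["Answer briefly: what is RAG?"]})
result_df = echo_generator(
    project_dir="./project",  # hypothetical project directory
    previous_result=previous_result,
    llm=None,  # passed through unchanged for non-llama_index_llm functions
)
print(result_df[["generated_texts", "generated_tokens", "generated_log_probs"]])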

@@ -0,0 +1,97 @@
from typing import List, Tuple
import pandas as pd
from llama_index.core.base.llms.base import BaseLLM
from transformers import AutoTokenizer
from autorag import generator_models
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
result_to_dataframe,
pop_params,
)
class LlamaIndexLLM(BaseGenerator):
def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs):
"""
Initialize the Llama Index LLM module.
:param project_dir: The project directory.
:param llm: The llama index LLM name, as registered in autorag.generator_models.
:param batch: The batch size for the llm.
Lower it if you run into errors.
Default is 16.
:param kwargs: The extra parameters for initializing the llm instance.
"""
super().__init__(project_dir=project_dir, llm=llm)
if self.llm not in generator_models.keys():
raise ValueError(
f"{self.llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names in autorag.generator_models."
)
self.batch = batch
llm_class = generator_models[self.llm]
if llm_class.class_name() in [
"HuggingFace_LLM",
"HuggingFaceInferenceAPI",
"TextGenerationInference",
]:
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"`model` or `model_name` parameter must be provided for using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs))
def __del__(self):
super().__del__()
del self.llm_instance
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result=previous_result)
return self._pure(prompts)
def _pure(
self,
prompts: List[str],
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Llama Index LLM module.
It gets the LLM instance from llama index and returns the text generated for each input prompt.
It does not produce real log probs; it returns pseudo log probs,
which are not meant to be used by other modules.
:param prompts: A list of prompts.
:return: A tuple of three elements.
The first element is a list of generated texts.
The second element is a list of the generated texts' token ids, tokenized with the GPT-2 tokenizer.
The third element is a list of generated text's pseudo log probs.
"""
tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))
generated_texts = list(map(lambda x: x.text, results))
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
tokenized_ids = tokenizer(generated_texts).data["input_ids"]
pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
return generated_texts, tokenized_ids, pseudo_log_probs
async def astream(self, prompt: str, **kwargs):
async for completion_response in await self.llm_instance.astream_complete(
prompt
):
yield completion_response.text
def stream(self, prompt: str, **kwargs):
for completion_response in self.llm_instance.stream_complete(prompt):
yield completion_response.text
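
A minimal usage sketch of LlamaIndexLLM, assuming "openai" is registered in autorag.generator_models (as the check above requires) and an OpenAI key is available; the project path, model name, and prompt are placeholders.

import pandas as pd
from autorag.nodes.generator import LlamaIndexLLM

previous_result = pd.DataFrame({"prompts": ["Summarize RAG in one sentence."]})
generator = LlamaIndexLLM(
    project_dir="./project",  # hypothetical project directory
    llm="openai",             # must be a key of autorag.generator_models
    model="gpt-4o-mini",      # extra kwargs are forwarded to the llama index LLM class
    batch=4,
)
result_df = generator.pure(previous_result)
print(result_df["generated_texts"].tolist())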

@@ -0,0 +1,296 @@
import logging
from typing import List, Tuple
import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from tiktoken import Encoding
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
pop_params,
result_to_dataframe,
)
logger = logging.getLogger("AutoRAG")
MAX_TOKEN_DICT = { # model name : token limit
"gpt-4.5-preview": 128_000,
"gpt-4.5-preview-2025-02-27": 128_000,
"o1": 200_000,
"o1-preview": 128_000,
"o1-preview-2024-09-12": 128_000,
"o1-mini": 128_000,
"o1-mini-2024-09-12": 128_000,
"o3-mini": 200_000,
"gpt-4o-mini": 128_000,
"gpt-4o-mini-2024-07-18": 128_000,
"gpt-4o": 128_000,
"gpt-4o-2024-08-06": 128_000,
"gpt-4o-2024-05-13": 128_000,
"chatgpt-4o-latest": 128_000,
"gpt-4-turbo": 128_000,
"gpt-4-turbo-2024-04-09": 128_000,
"gpt-4-turbo-preview": 128_000,
"gpt-4-0125-preview": 128_000,
"gpt-4-1106-preview": 128_000,
"gpt-4-vision-preview": 128_000,
"gpt-4-1106-vision-preview": 128_000,
"gpt-4": 8_192,
"gpt-4-0613": 8_192,
"gpt-4-32k": 32_768,
"gpt-4-32k-0613": 32_768,
"gpt-3.5-turbo-0125": 16_385,
"gpt-3.5-turbo": 16_385,
"gpt-3.5-turbo-1106": 16_385,
"gpt-3.5-turbo-instruct": 4_096,
"gpt-3.5-turbo-16k": 16_385,
"gpt-3.5-turbo-0613": 4_096,
"gpt-3.5-turbo-16k-0613": 16_385,
}
class OpenAILLM(BaseGenerator):
def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "batch size must be greater than 0."
self.batch = batch
client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
self.client = AsyncOpenAI(**client_init_params)
if self.llm.startswith("gpt-4.5"):
self.tokenizer = tiktoken.get_encoding("o200k_base")
else:
self.tokenizer = tiktoken.encoding_for_model(self.llm)
max_token_size = MAX_TOKEN_DICT.get(self.llm)
if max_token_size is None:
raise ValueError(
f"Model {self.llm} is not supported. "
f"Please select a model from {list(MAX_TOKEN_DICT.keys())}"
)
self.max_token_size = max_token_size - 7  # reserve room for the chat message wrapper tokens
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self,
prompts: List[str],
truncate: bool = True,
**kwargs,
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
OpenAI generator module.
Uses the official openai library to generate answers from the given prompts.
It returns real token ids and log probs, so use this module when you need them.
The model name, batch size, and API key (env variable `OPENAI_API_KEY`) are set in the constructor.
:param prompts: A list of prompts.
:param truncate: Whether to truncate the input prompts to the model's context size.
Default is True.
:param kwargs: Optional parameters for the openai api call `openai.chat.completions.create`.
See https://platform.openai.com/docs/api-reference/chat/create for more details.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect here; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect here; it is always set to 1.")
# TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
if truncate:
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
loop = get_event_loop()
if self.llm.startswith("o1") or self.llm.startswith("o3"):
tasks = [
self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts
]
else:
tasks = [
self.get_result(prompt, **openai_chat_params) for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], result))
token_result = list(map(lambda x: x[1], result))
logprob_result = list(map(lambda x: x[2], result))
return answer_result, token_result, logprob_result
def structured_output(self, prompts: List[str], output_cls, **kwargs):
supported_models = [
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
]
if self.llm not in supported_models:
raise ValueError(
f"{self.llm} is not a valid model name for structured output. "
f"Please select the model between {supported_models}"
)
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect here; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect here; it is always set to 1.")
# TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs)
loop = get_event_loop()
tasks = [
self.get_structured_result(prompt, output_cls, **openai_chat_params)
for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
return result
async def astream(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect here; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect here; it is always set to 1.")
prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
stream = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
stream=True,
**openai_chat_params,
)
result = ""
async for chunk in stream:
if chunk.choices[0].delta.content is not None:
result += chunk.choices[0].delta.content
yield result
def stream(self, prompt: str, **kwargs):
raise NotImplementedError("stream method is not implemented yet.")
async def get_structured_result(self, prompt: str, output_cls, **kwargs):
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.beta.chat.completions.parse(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
response_format=output_cls,
logprobs=logprobs,
n=1,
**kwargs,
)
return response.choices[0].message.parsed
async def get_result(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=logprobs,
n=1,
**kwargs,
)
choice = response.choices[0]
answer = choice.message.content
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if self.llm.startswith("gpt-4.5"):
tokens = self.tokenizer.encode(answer, allowed_special="all")
logprobs = [0.5] * len(tokens)
logger.warning("gpt-4.5-preview does not support logprobs yet.")
else:
logprobs = list(map(lambda x: x.logprob, choice.logprobs.content))
tokens = list(
map(
lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0],
choice.logprobs.content,
)
)
assert len(tokens) == len(
logprobs
), "tokens and logprobs have different lengths."
return answer, tokens, logprobs
async def get_result_o1(self, prompt: str, **kwargs):
assert self.llm.startswith("o1") or self.llm.startswith(
"o3"
), "This function only supports o1 and o3 models."
# o1 and o3 models only support temperature 1.
# See https://platform.openai.com/docs/guides/reasoning about the beta limitations of o1 models.
kwargs["temperature"] = 1
kwargs["top_p"] = 1
kwargs["presence_penalty"] = 0
kwargs["frequency_penalty"] = 0
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
**kwargs,
)
answer = response.choices[0].message.content
tokens = self.tokenizer.encode(answer, allowed_special="all")
pseudo_log_probs = [0.5] * len(tokens)
return answer, tokens, pseudo_log_probs
def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int):
tokens = tokenizer.encode(prompt, allowed_special="all")
return tokenizer.decode(tokens[:max_token_size])
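
A minimal usage sketch of OpenAILLM, assuming OPENAI_API_KEY is set in the environment; the project path and prompt are placeholders.

import pandas as pd
from autorag.nodes.generator import OpenAILLM

previous_result = pd.DataFrame({"prompts": ["What does RAG stand for?"]})
generator = OpenAILLM(project_dir="./project", llm="gpt-4o-mini", batch=8)
result_df = generator.pure(previous_result, temperature=0.2)
# Unlike LlamaIndexLLM, real token ids and log probs are returned here.
print(result_df[["generated_texts", "generated_log_probs"]])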

@@ -0,0 +1,144 @@
import os
import pathlib
from typing import List, Dict, Union
import pandas as pd
from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list
def run_generator_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among generator node results,
then save the results and summary to the generator node directory.
:param modules: Generator modules to run.
:param module_params: Generator module parameters.
Including node parameters, which are used for every module in this node.
:param previous_result: Previous result dataframe.
Could be prompt maker node's result.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for generator node.
:return: The best result dataframe.
It contains previous result columns and generator node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
node_dir = os.path.join(node_line_dir, "generator") # node name
if not os.path.exists(node_dir):
os.makedirs(node_dir)
qa_data = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
if "generation_gt" not in qa_data.columns:
raise ValueError("You must have 'generation_gt' column in qa.parquet.")
results, execution_times = zip(
*map(
lambda x: measure_speed(
x[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**x[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# get average token usage
token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results))
# make rows to metric_inputs
generation_gt = to_list(qa_data["generation_gt"].tolist())
metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]
metric_names, metric_params = cast_metrics(strategies.get("metrics"))
if metric_names is None or len(metric_names) <= 0:
raise ValueError("You must provide at least one metric for generator evaluation.")
results = list(
map(
lambda result: evaluate_generator_node(
result, metric_inputs, strategies.get("metrics")
),
results,
)
)
# save results to folder
filepaths = list(
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
"average_output_token": token_usages,
**{
metric: list(map(lambda x: x[metric].mean(), results))
for metric in metric_names
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
if strategies.get("token_threshold") is not None:
results, filenames = filter_by_threshold(
results, token_usages, strategies["token_threshold"], filenames
)
selected_result, selected_filename = select_best(
results, metric_names, filenames, strategies.get("strategy", "mean")
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column at summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result
def evaluate_generator_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
):
@evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
def evaluate_generation_module(df: pd.DataFrame):
return (
df["generated_texts"].tolist(),
df["generated_tokens"].tolist(),
df["generated_log_probs"].tolist(),
)
return evaluate_generation_module(result_df)
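
A hypothetical direct call to run_generator_node; in normal use it is driven by AutoRAG's evaluator from a YAML config. It assumes an existing project directory with data/qa.parquet (containing a generation_gt column), a node_line_dir sitting two levels below that project directory, and that this file lives at autorag/nodes/generator/run.py. The paths, prompt, metric names, and threshold are placeholders.

import pandas as pd
from autorag.nodes.generator import OpenAILLM
from autorag.nodes.generator.run import run_generator_node

previous_result = pd.DataFrame({"prompts": ["Answer the question: What does RAG stand for?"]})
best_df = run_generator_node(
    modules=[OpenAILLM],
    module_params=[{"llm": "gpt-4o-mini", "batch": 8}],
    previous_result=previous_result,
    node_line_dir="./project/benchmark/node_line_1",  # project_dir is resolved two levels up
    strategies={"metrics": ["bleu", "rouge"], "speed_threshold": 30},
)
print(best_df.columns.tolist())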

@@ -0,0 +1,121 @@
import gc
from copy import deepcopy
from typing import List, Tuple
import pandas as pd
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils import result_to_dataframe
from autorag.utils.util import pop_params, to_list
class Vllm(BaseGenerator):
def __init__(self, project_dir: str, llm: str, **kwargs):
super().__init__(project_dir, llm, **kwargs)
try:
from vllm import SamplingParams, LLM
except ImportError:
raise ImportError(
"Please install vllm library. You can install it by running `pip install vllm`."
)
model_from_kwargs = kwargs.pop("model", None)
model = llm if model_from_kwargs is None else model_from_kwargs
input_kwargs = deepcopy(kwargs)
sampling_params_init_params = pop_params(
SamplingParams.from_optional, input_kwargs
)
self.vllm_model = LLM(model, **input_kwargs)
# remove keys from kwargs that are not SamplingParams parameters
kwargs_keys = list(kwargs.keys())
for key in kwargs_keys:
if key not in sampling_params_init_params:
kwargs.pop(key)
def __del__(self):
try:
import torch
import contextlib
if torch.cuda.is_available():
from vllm.distributed.parallel_state import (
destroy_model_parallel,
destroy_distributed_environment,
)
destroy_model_parallel()
destroy_distributed_environment()
del self.vllm_model.llm_engine.model_executor
del self.vllm_model
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
except ImportError:
del self.vllm_model
super().__del__()
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Vllm module.
It uses the vLLM instance and returns the texts generated for the input prompts.
You can set logprobs to get the log probs of the generated text.
Default logprobs is 1.
:param prompts: A list of prompts.
:param kwargs: The extra parameters for generating the text.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
try:
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm import SamplingParams
except ImportError:
raise ImportError(
"Please install vllm library. You can install it by running `pip install vllm`."
)
if "logprobs" not in kwargs:
kwargs["logprobs"] = 1
sampling_params = pop_params(SamplingParams.from_optional, kwargs)
generate_params = SamplingParams(**sampling_params)
results: List[RequestOutput] = self.vllm_model.generate(
prompts, generate_params
)
generated_texts = list(map(lambda x: x.outputs[0].text, results))
generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results))
log_probs: List[SampleLogprobs] = list(
map(lambda x: x.outputs[0].logprobs, results)
)
generated_log_probs = list(
map(
lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))),
zip(log_probs, generated_token_ids),
)
)
return (
to_list(generated_texts),
to_list(generated_token_ids),
to_list(generated_log_probs),
)
async def astream(self, prompt: str, **kwargs):
raise NotImplementedError
def stream(self, prompt: str, **kwargs):
raise NotImplementedError
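
A minimal usage sketch of the Vllm module, assuming the vllm package is installed and a GPU is available; the model id, project path, and sampling values are placeholders.

import pandas as pd
from autorag.nodes.generator import Vllm

previous_result = pd.DataFrame({"prompts": ["Explain RAG in one sentence."]})
generator = Vllm(
    project_dir="./project",
    llm="mistralai/Mistral-7B-Instruct-v0.2",  # any model id that vllm can load
)
# Sampling parameters such as temperature and max_tokens are forwarded to SamplingParams.
result_df = generator.pure(previous_result, temperature=0.7, max_tokens=128)
print(result_df["generated_texts"].tolist())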

@@ -0,0 +1,176 @@
import logging
from typing import List, Tuple
import time
import pandas as pd
import requests
from asyncio import to_thread
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
logger = logging.getLogger("AutoRAG")
DEFAULT_MAX_TOKENS = 4096 # Default token limit
class VllmAPI(BaseGenerator):
def __init__(
self,
project_dir,
llm: str,
uri: str,
max_tokens: int = None,
batch: int = 16,
*args,
**kwargs,
):
"""
VLLM API Wrapper for OpenAI-compatible chat/completions format.
:param project_dir: Project directory.
:param llm: Model name (e.g., LLaMA model).
:param uri: VLLM API server URI.
:param max_tokens: Maximum token limit.
Default is 4096.
:param batch: Request batch size.
Default is 16.
"""
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "Batch size must be greater than 0."
self.uri = uri.rstrip("/") # Set API URI
self.batch = batch
# Use the provided max_tokens if available, otherwise use the default
self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
self.max_model_len = self.get_max_model_length()
logger.info(f"{llm} max model length: {self.max_model_len}")
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], truncate: bool = True, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Method to call the VLLM API to generate text.
:param prompts: List of input prompts.
:param truncate: Whether to truncate input prompts to fit within the token limit.
:param kwargs: Additional options (e.g., temperature, top_p).
:return: Generated text, token lists, and log probability lists.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect here; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect here; it is always set to 1.")
if truncate:
prompts = list(map(lambda p: self.truncate_by_token(p), prompts))
loop = get_event_loop()
tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts]
results = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], results))
token_result = list(map(lambda x: x[1], results))
logprob_result = list(map(lambda x: x[2], results))
return answer_result, token_result, logprob_result
def truncate_by_token(self, prompt: str) -> str:
"""
Function to truncate prompts to fit within the maximum token limit.
"""
tokens = self.encoding_for_model(prompt)["tokens"]  # tokenize via the vllm server's /tokenize endpoint
return self.decoding_for_model(tokens[: self.max_model_len])["prompt"]
def call_vllm_api(self, prompt: str, **kwargs) -> dict:
"""
Calls the VLLM API to get chat/completions responses.
:param prompt: Input prompt.
:param kwargs: Additional API options (e.g., temperature, max_tokens).
:return: API response.
"""
payload = {
"model": self.llm,
"messages": [{"role": "user", "content": prompt}],
"temperature": kwargs.get("temperature", 0.4),
"max_tokens": min(
kwargs.get("max_tokens", self.max_token_size), self.max_token_size
),
"logprobs": True,
"n": 1,
}
start_time = time.time() # Record request start time
response = requests.post(f"{self.uri}/v1/chat/completions", json=payload)
end_time = time.time() # Record request end time
response.raise_for_status()
elapsed_time = end_time - start_time # Calculate elapsed time
logger.info(
f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds"
)
return response.json()
# Additional method: abstract method implementation
async def astream(self, prompt: str, **kwargs):
"""
Asynchronous streaming method not implemented.
"""
raise NotImplementedError("astream method is not implemented for VLLM API yet.")
def stream(self, prompt: str, **kwargs):
"""
Synchronous streaming method not implemented.
"""
raise NotImplementedError("stream method is not implemented for VLLM API yet.")
def get_result(self, prompt: str, **kwargs):
response = self.call_vllm_api(prompt, **kwargs)
choice = response["choices"][0]
answer = choice["message"]["content"]
# Handle cases where logprobs is None
if choice.get("logprobs") and "content" in choice["logprobs"]:
logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"]))
tokens = list(
map(
lambda x: self.encoding_for_model(x["token"])["tokens"],
choice["logprobs"]["content"],
)
)
else:
logprobs = []
tokens = []
return answer, tokens, logprobs
def encoding_for_model(self, answer_piece: str):
payload = {
"model": self.llm,
"prompt": answer_piece,
"add_special_tokens": True,
}
response = requests.post(f"{self.uri}/tokenize", json=payload)
response.raise_for_status()
return response.json()
def decoding_for_model(self, tokens: list[int]):
payload = {
"model": self.llm,
"tokens": tokens,
}
response = requests.post(f"{self.uri}/detokenize", json=payload)
response.raise_for_status()
return response.json()
def get_max_model_length(self):
response = requests.get(f"{self.uri}/v1/models")
response.raise_for_status()
json_data = response.json()
return json_data["data"][0]["max_model_len"]
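
A minimal usage sketch of VllmAPI, assuming a vllm server with the OpenAI-compatible API is already running (for example via `vllm serve <model>`); the URI, model name, and project path are placeholders.

import pandas as pd
from autorag.nodes.generator import VllmAPI

previous_result = pd.DataFrame({"prompts": ["Explain RAG in one sentence."]})
generator = VllmAPI(
    project_dir="./project",
    llm="meta-llama/Llama-3.1-8B-Instruct",  # must match the model the server is running
    uri="http://localhost:8000",
    max_tokens=256,
    batch=4,
)
result_df = generator.pure(previous_result, temperature=0.4)
print(result_df["generated_texts"].tolist())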

@@ -0,0 +1,2 @@
from .pass_passage_augmenter import PassPassageAugmenter
from .prev_next_augmenter import PrevNextPassageAugmenter

@@ -0,0 +1,80 @@
import abc
import logging
import os
import pandas as pd
from autorag.schema import BaseModule
from autorag.utils import (
validate_qa_dataset,
sort_by_scores,
validate_corpus_dataset,
cast_corpus_dataset,
)
from autorag.utils.util import select_top_k
logger = logging.getLogger("AutoRAG")
class BasePassageAugmenter(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, *args, **kwargs):
logger.info(
f"Initialize passage augmenter node - {self.__class__.__name__} module..."
)
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
validate_corpus_dataset(corpus_df)
corpus_df = cast_corpus_dataset(corpus_df)
self.corpus_df = corpus_df
def __del__(self):
logger.info(
f"Deleting passage augmenter node - {self.__class__.__name__} module..."
)
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running passage augmenter node - {self.__class__.__name__} module..."
)
validate_qa_dataset(previous_result)
# find ids columns
assert (
"retrieved_ids" in previous_result.columns
), "previous_result must have retrieved_ids column."
ids = previous_result["retrieved_ids"].tolist()
return ids
@staticmethod
def sort_by_scores(
augmented_contents,
augmented_ids,
augmented_scores,
top_k: int,
reverse: bool = True,
):
# sort by scores
df = pd.DataFrame(
{
"contents": augmented_contents,
"ids": augmented_ids,
"scores": augmented_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
lambda row: sort_by_scores(row, reverse=reverse),
axis=1,
result_type="expand",
)
# select by top_k
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
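
A small illustration of the sort_by_scores helper above applied to made-up data; it sorts each row's passages by score and keeps the top_k.

from autorag.nodes.passageaugmenter.base import BasePassageAugmenter

contents, ids, scores = BasePassageAugmenter.sort_by_scores(
    augmented_contents=[["passage a", "passage b", "passage c"]],
    augmented_ids=[["id-a", "id-b", "id-c"]],
    augmented_scores=[[0.2, 0.9, 0.5]],
    top_k=2,
)
print(contents)  # expected: [["passage b", "passage c"]]
print(scores)    # expected: [[0.9, 0.5]]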

@@ -0,0 +1,43 @@
from typing import List
import pandas as pd
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils import result_to_dataframe
class PassPassageAugmenter(BasePassageAugmenter):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
"""
Run the passage augmenter node - PassPassageAugmenter module.
:param previous_result: The previous result Dataframe.
:param top_k: You must pass the top_k value (via kwargs) to get the top k results.
:param kwargs: Not used.
:return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
"""
top_k = kwargs.pop("top_k")
ids = self.cast_to_run(previous_result)
contents = previous_result["retrieved_contents"].tolist()
scores = previous_result["retrieve_scores"].tolist()
augmented_ids, augmented_contents, augmented_scores = self._pure(
ids, contents, scores
)
return self.sort_by_scores(
augmented_contents, augmented_ids, augmented_scores, top_k
)
def _pure(
self,
ids_list: List[List[str]],
contents_list: List[List[str]],
scores_list: List[List[float]],
):
"""
Do not perform augmentation.
Return given passages, scores, and ids as is.
"""
return ids_list, contents_list, scores_list

@@ -0,0 +1,155 @@
from typing import List, Union
import numpy as np
import pandas as pd
from autorag.embedding.base import EmbeddingModel
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils.util import (
filter_dict_keys,
fetch_contents,
embedding_query_content,
result_to_dataframe,
empty_cuda_cache,
)
class PrevNextPassageAugmenter(BasePassageAugmenter):
def __init__(
self,
project_dir: str,
embedding_model: Union[str, dict] = "openai",
*args,
**kwargs,
):
"""
Initialize the PrevNextPassageAugmenter module.
:param project_dir: The project directory.
:param embedding_model: The embedding model name to use for calculating cosine similarity.
Default is openai (text-embedding-ada-002).
:param kwargs: Extra parameters.
"""
super().__init__(project_dir, *args, **kwargs)
slim_corpus_df = self.corpus_df[["doc_id", "metadata"]]
slim_corpus_df.loc[:, "metadata"] = slim_corpus_df["metadata"].apply(
filter_dict_keys, keys=["prev_id", "next_id"]
)
self.slim_corpus_df = slim_corpus_df
# init embedding model
self.embedding_model = EmbeddingModel.load(embedding_model)()
def __del__(self):
del self.embedding_model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
"""
Run the passage augmenter node - PrevNextPassageAugmenter module.
:param previous_result: The previous result Dataframe.
:param top_k: You must pass the top_k value (via kwargs) to get the top k results.
:param kwargs: May contain 'mode' and 'num_passages' for the augmentation (see _pure).
:return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
"""
top_k = kwargs.pop("top_k")
ids = self.cast_to_run(previous_result)
# find queries columns
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
mode = kwargs.pop("mode", "both")
num_passages = kwargs.pop("num_passages", 1)
augmented_ids = self._pure(ids, num_passages, mode)
# fetch contents from corpus to use augmented ids
augmented_contents = fetch_contents(self.corpus_df, augmented_ids)
query_embeddings, contents_embeddings = embedding_query_content(
queries, augmented_contents, self.embedding_model, batch=128
)
# get scores from calculated cosine similarity
augmented_scores = [
np.array(
[
calculate_cosine_similarity(query_embedding, x)
for x in content_embeddings
]
).tolist()
for query_embedding, content_embeddings in zip(
query_embeddings, contents_embeddings
)
]
return self.sort_by_scores(
augmented_contents, augmented_ids, augmented_scores, top_k
)
def _pure(
self,
ids_list: List[List[str]],
num_passages: int = 1,
mode: str = "both",
) -> List[List[str]]:
"""
Add passages before and/or after the retrieved passage.
For more information, visit https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PrevNextPostprocessorDemo/.
:param ids_list: The list of lists of ids retrieved
:param num_passages: The number of passages to add before and after the retrieved passage
Default is 1.
:param mode: The mode of augmentation
'prev': add passages before the retrieved passage
'next': add passages after the retrieved passage
'both': add passages before and after the retrieved passage
Default is 'both'.
:return: The list of lists of augmented ids
"""
if mode not in ["prev", "next", "both"]:
raise ValueError(f"mode must be 'prev', 'next', or 'both', but got {mode}")
augmented_ids = [
prev_next_augmenter_pure(ids, self.slim_corpus_df, mode, num_passages)
for ids in ids_list
]
return augmented_ids
def prev_next_augmenter_pure(
ids: List[str], corpus_df: pd.DataFrame, mode: str, num_passages: int
):
def fetch_id_sequence(start_id, key):
sequence = []
current_id = start_id
for _ in range(num_passages):
current_id = (
corpus_df.loc[corpus_df["doc_id"] == current_id]["metadata"]
.values[0]
.get(key)
)
if current_id is None:
break
sequence.append(current_id)
return sequence
augmented_group = []
for id_ in ids:
current_ids = [id_]
if mode in ["prev", "both"]:
current_ids = fetch_id_sequence(id_, "prev_id")[::-1] + current_ids
if mode in ["next", "both"]:
current_ids += fetch_id_sequence(id_, "next_id")
augmented_group.extend(current_ids)
return augmented_group
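
A toy illustration of prev_next_augmenter_pure on a made-up three-document corpus whose prev_id/next_id links live in the metadata column, exactly as the function expects.

import pandas as pd
from autorag.nodes.passageaugmenter.prev_next_augmenter import prev_next_augmenter_pure

corpus_df = pd.DataFrame(
    {
        "doc_id": ["d1", "d2", "d3"],
        "metadata": [
            {"prev_id": None, "next_id": "d2"},
            {"prev_id": "d1", "next_id": "d3"},
            {"prev_id": "d2", "next_id": None},
        ],
    }
)
# Retrieving only "d2" and augmenting one passage on each side yields its neighbours as well.
print(prev_next_augmenter_pure(["d2"], corpus_df, mode="both", num_passages=1))
# expected: ['d1', 'd2', 'd3']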

@@ -0,0 +1,131 @@
import logging
import os
import pathlib
from typing import List, Dict
import pandas as pd
from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import apply_recursive, to_list
logger = logging.getLogger("AutoRAG")
def run_passage_augmenter_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt,
previous_result["query"].tolist(),
previous_result["generation_gt"].tolist(),
)
]
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError(
"You must at least one metrics for passage_augmenter evaluation."
)
results = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
results,
)
)
# save results to folder
save_dir = os.path.join(node_line_dir, "passage_augmenter") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
filepaths = list(
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
**{
f"passage_augmenter_{metric}": list(
map(lambda result: result[metric].mean(), results)
)
for metric in strategies.get("metrics")
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
# change metric name columns to passage_augmenter_metric_name
selected_result = selected_result.rename(
columns={
metric_name: f"passage_augmenter_{metric_name}"
for metric_name in strategies["metrics"]
}
)
# drop retrieval result columns in previous_result
previous_result = previous_result.drop(
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column to summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result

@@ -0,0 +1,4 @@
from .longllmlingua import LongLLMLingua
from .pass_compressor import PassCompressor
from .refine import Refine
from .tree_summarize import TreeSummarize

@@ -0,0 +1,83 @@
import abc
import logging
from typing import Dict
import pandas as pd
from llama_index.core.llms import LLM
from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe
logger = logging.getLogger("AutoRAG")
class BasePassageCompressor(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, *args, **kwargs):
logger.info(
f"Initialize passage compressor node - {self.__class__.__name__} module..."
)
def __del__(self):
logger.info(
f"Deleting passage compressor node - {self.__class__.__name__} module..."
)
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running passage compressor node - {self.__class__.__name__} module..."
)
assert all(
[
column in previous_result.columns
for column in [
"query",
"retrieved_contents",
]
]
), "previous_result must have retrieved_contents, retrieved_ids, and retrieve_scores columns."
assert len(previous_result) > 0, "previous_result must have at least one row."
queries = previous_result["query"].tolist()
retrieved_contents = previous_result["retrieved_contents"].tolist()
return queries, retrieved_contents
class LlamaIndexCompressor(BasePassageCompressor, metaclass=abc.ABCMeta):
param_list = ["prompt", "chat_prompt", "batch"]
def __init__(self, project_dir: str, **kwargs):
"""
Initialize passage compressor module.
:param project_dir: The project directory
:param llm: The llm name that will be used to summarize (passed via kwargs).
Any LlamaIndex LLM model can be used here.
:param kwargs: Extra parameters for initializing the llm.
"""
super().__init__(project_dir)
kwargs_dict = dict(
filter(lambda x: x[0] not in self.param_list, kwargs.items())
)
llm_name = kwargs_dict.pop("llm")
self.llm: LLM = make_llm(llm_name, kwargs_dict)
def __del__(self):
del self.llm
super().__del__()
@result_to_dataframe(["retrieved_contents"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, retrieved_contents = self.cast_to_run(previous_result)
param_dict = dict(filter(lambda x: x[0] in self.param_list, kwargs.items()))
result = self._pure(queries, retrieved_contents, **param_dict)
return list(map(lambda x: [x], result))
def make_llm(llm_name: str, kwargs: Dict) -> LLM:
if llm_name not in generator_models:
raise KeyError(
f"{llm_name} is not supported. "
"You can add it manually by calling autorag.generator_models."
)
return generator_models[llm_name](**kwargs)

@@ -0,0 +1,115 @@
from typing import List, Optional
import pandas as pd
from autorag.nodes.passagecompressor.base import BasePassageCompressor
from autorag.utils.util import pop_params, result_to_dataframe, empty_cuda_cache
# TODO: Parallel Processing Refactoring at #460
class LongLLMLingua(BasePassageCompressor):
def __init__(
self, project_dir: str, model_name: str = "NousResearch/Llama-2-7b-hf", **kwargs
):
try:
from llmlingua import PromptCompressor
except ImportError:
raise ImportError(
"LongLLMLingua is not installed. Please install it by running `pip install llmlingua`."
)
super().__init__(project_dir)
model_init_params = pop_params(PromptCompressor.__init__, kwargs)
self.llm_lingua = PromptCompressor(model_name=model_name, **model_init_params)
def __del__(self):
del self.llm_lingua
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, retrieved_contents = self.cast_to_run(previous_result)
results = self._pure(queries, retrieved_contents, **kwargs)
return list(map(lambda x: [x], results))
def _pure(
self,
queries: List[str],
contents: List[List[str]],
instructions: Optional[str] = None,
target_token: int = 300,
**kwargs,
) -> List[str]:
"""
Compresses the retrieved texts using LongLLMLingua.
For more information, visit https://github.com/microsoft/LLMLingua.
:param queries: The queries for retrieved passages.
:param contents: The contents of retrieved passages.
The compression model ("NousResearch/Llama-2-7b-hf" by default) is set in the constructor.
:param instructions: The instructions for compression.
Default is None. When it is None, it will use default instructions.
:param target_token: The target token for compression.
Default is 300.
:param kwargs: Additional keyword arguments.
:return: The list of compressed texts.
"""
if instructions is None:
instructions = "Given the context, please answer the final question"
results = [
llmlingua_pure(
query, contents_, self.llm_lingua, instructions, target_token, **kwargs
)
for query, contents_ in zip(queries, contents)
]
return results
def llmlingua_pure(
query: str,
contents: List[str],
llm_lingua,
instructions: str,
target_token: int = 300,
**kwargs,
) -> str:
"""
Return the compressed text.
:param query: The query for retrieved passages.
:param contents: The contents of retrieved passages.
:param llm_lingua: The llm instance, that will be used to compress.
:param instructions: The instructions for compression.
:param target_token: The target token for compression.
Default is 300.
:param kwargs: Additional keyword arguments.
:return: The compressed text.
"""
try:
from llmlingua import PromptCompressor
except ImportError:
raise ImportError(
"LongLLMLingua is not installed. Please install it by running `pip install llmlingua`."
)
# split by "\n\n" (recommended by LongLLMLingua authors)
new_context_texts = [c for context in contents for c in context.split("\n\n")]
compress_prompt_params = pop_params(PromptCompressor.compress_prompt, kwargs)
compressed_prompt = llm_lingua.compress_prompt(
new_context_texts,
question=query,
instruction=instructions,
rank_method="longllmlingua",
target_token=target_token,
**compress_prompt_params,
)
compressed_prompt_txt = compressed_prompt["compressed_prompt"]
# separate out the question and instruction
result = "\n\n".join(compressed_prompt_txt.split("\n\n")[1:-1])
return result
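
A minimal usage sketch of the LongLLMLingua compressor, assuming the llmlingua package is installed and there is enough GPU memory for the compression model; the project path, query, and passages are placeholders.

import pandas as pd
from autorag.nodes.passagecompressor import LongLLMLingua

previous_result = pd.DataFrame(
    {
        "query": ["What is RAG?"],
        "retrieved_contents": [[
            "RAG stands for Retrieval-Augmented Generation. It grounds an LLM on retrieved passages.",
            "A second retrieved passage with additional background.",
        ]],
    }
)
compressor = LongLLMLingua(project_dir="./project")  # default model: NousResearch/Llama-2-7b-hf
result_df = compressor.pure(previous_result, target_token=200)
print(result_df["retrieved_contents"].tolist())  # one compressed passage per query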

@@ -0,0 +1,16 @@
from typing import List
import pandas as pd
from autorag.nodes.passagecompressor.base import BasePassageCompressor
from autorag.utils import result_to_dataframe
class PassCompressor(BasePassageCompressor):
@result_to_dataframe(["retrieved_contents"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents = self.cast_to_run(previous_result)
return self._pure(contents)
def _pure(self, contents: List[List[str]]):
return contents

@@ -0,0 +1,54 @@
from typing import List, Optional
from llama_index.core import PromptTemplate
from llama_index.core.prompts import PromptType
from llama_index.core.prompts.utils import is_chat_model
from llama_index.core.response_synthesizers import Refine as rf
from autorag.nodes.passagecompressor.base import LlamaIndexCompressor
from autorag.utils.util import get_event_loop, process_batch
class Refine(LlamaIndexCompressor):
def _pure(
self,
queries: List[str],
contents: List[List[str]],
prompt: Optional[str] = None,
chat_prompt: Optional[str] = None,
batch: int = 16,
) -> List[str]:
"""
Refine a response to a query across text chunks.
This function is a wrapper for llama_index.response_synthesizers.Refine.
For more information, visit https://docs.llamaindex.ai/en/stable/examples/response_synthesizers/refine/.
:param queries: The queries for retrieved passages.
:param contents: The contents of retrieved passages.
:param prompt: The prompt template for refine.
If you want to use chat prompt, you should pass chat_prompt instead.
In the prompt, you must specify where to put 'context_msg' and 'query_str'.
Default is None. When it is None, it will use llama index default prompt.
:param chat_prompt: The chat prompt template for refine.
If you want to use normal prompt, you should pass prompt instead.
In the chat prompt, you must specify where to put 'context_msg' and 'query_str'.
Default is None. When it is None, it will use llama index default chat prompt.
:param batch: The batch size for llm.
Set low if you face some errors.
Default is 16.
:return: The list of compressed texts.
"""
if prompt is not None and not is_chat_model(self.llm):
refine_template = PromptTemplate(prompt, prompt_type=PromptType.REFINE)
elif chat_prompt is not None and is_chat_model(self.llm):
refine_template = PromptTemplate(chat_prompt, prompt_type=PromptType.REFINE)
else:
refine_template = None
summarizer = rf(llm=self.llm, refine_template=refine_template, verbose=True)
tasks = [
summarizer.aget_response(query, content)
for query, content in zip(queries, contents)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
return results
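
A minimal usage sketch of the Refine compressor, assuming "openai" is registered in autorag.generator_models and an OpenAI key is set; the project path, model, query, and passages are placeholders.

import pandas as pd
from autorag.nodes.passagecompressor import Refine

previous_result = pd.DataFrame(
    {
        "query": ["What is RAG?"],
        "retrieved_contents": [[
            "RAG combines a retriever with a generator.",
            "A second retrieved passage with additional background.",
        ]],
    }
)
compressor = Refine(project_dir="./project", llm="openai", model="gpt-4o-mini")
result_df = compressor.pure(previous_result, batch=4)
print(result_df["retrieved_contents"].tolist())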

@@ -0,0 +1,186 @@
import os.path
import pathlib
from typing import List, Dict
import pandas as pd
from autorag.evaluation.metric import (
retrieval_token_recall,
retrieval_token_precision,
retrieval_token_f1,
)
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import fetch_contents
def run_passage_compressor_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among passage compressor modules.
:param modules: Passage compressor modules to run.
:param module_params: Passage compressor module parameters.
:param previous_result: Previous result dataframe.
Could be retrieval, reranker modules result.
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for passage compressor node.
In this node, we use token-level retrieval metrics such as 'retrieval_token_f1', 'retrieval_token_precision', and 'retrieval_token_recall'.
You can skip evaluation when you use only one module and one module parameter.
:return: The best result dataframe with previous result columns.
This node will replace 'retrieved_contents' to compressed passages, so its length will be one.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
data_dir = os.path.join(project_dir, "data")
save_dir = os.path.join(node_line_dir, "passage_compressor")
if not os.path.exists(save_dir):
os.makedirs(save_dir)
# make retrieval contents gt
qa_data = pd.read_parquet(os.path.join(data_dir, "qa.parquet"), engine="pyarrow")
corpus_data = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
# check qa_data have retrieval_gt
assert all(
len(x[0]) > 0 for x in qa_data["retrieval_gt"].tolist()
), "Can't use passage compressor if you don't have retrieval gt values in QA dataset."
# run modules
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
results = list(results)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
retrieval_gt_contents = list(
map(lambda x: fetch_contents(corpus_data, x), qa_data["retrieval_gt"].tolist())
)
metric_inputs = [
MetricInput(retrieval_gt_contents=ret_cont_gt)
for ret_cont_gt in retrieval_gt_contents
]
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError(
"You must provide at least one metric for retrieval contents evaluation. "
"It can be 'retrieval_token_f1', 'retrieval_token_precision', or 'retrieval_token_recall'."
)
results = list(
map(
lambda x: evaluate_passage_compressor_node(
x, metric_inputs, strategies.get("metrics")
),
results,
)
)
# save results to folder
filepaths = list(
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
# make summary file
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
**{
f"passage_compressor_{metric}": list(
map(lambda result: result[metric].mean(), results)
)
for metric in strategies.get("metrics")
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
new_retrieved_contents = selected_result["retrieved_contents"]
previous_result["retrieved_contents"] = new_retrieved_contents
selected_result = selected_result.drop(columns=["retrieved_contents"])
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column to summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# add prefix 'passage_compressor' to best_result columns
best_result = best_result.rename(
columns={
metric_name: f"passage_compressor_{metric_name}"
for metric_name in strategies.get("metrics")
}
)
# save the result files
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return best_result
def evaluate_passage_compressor_node(
result_df: pd.DataFrame, metric_inputs: List[MetricInput], metrics: List[str]
):
metric_funcs = {
retrieval_token_recall.__name__: retrieval_token_recall,
retrieval_token_precision.__name__: retrieval_token_precision,
retrieval_token_f1.__name__: retrieval_token_f1,
}
for metric_input, generated_text in zip(
metric_inputs, result_df["retrieved_contents"].tolist()
):
metric_input.retrieved_contents = generated_text
metrics = list(filter(lambda x: x in metric_funcs.keys(), metrics))
if len(metrics) <= 0:
raise ValueError(f"metrics must be one of {metric_funcs.keys()}")
metrics_scores = dict(
map(
lambda metric: (
metric,
metric_funcs[metric](
metric_inputs=metric_inputs,
),
),
metrics,
)
)
result_df = pd.concat([result_df, pd.DataFrame(metrics_scores)], axis=1)
return result_df
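
A toy illustration of evaluate_passage_compressor_node, scoring one made-up compressed passage against a made-up ground-truth passage with the token-level metrics named above; the import path assumes this file is the passage compressor node's run module.

import pandas as pd
from autorag.nodes.passagecompressor.run import evaluate_passage_compressor_node
from autorag.schema.metricinput import MetricInput

result_df = pd.DataFrame(
    {"retrieved_contents": [["RAG augments generation with retrieved context."]]}
)
metric_inputs = [
    MetricInput(retrieval_gt_contents=[["RAG augments an LLM's generation with retrieved context."]])
]
scored_df = evaluate_passage_compressor_node(
    result_df, metric_inputs, metrics=["retrieval_token_f1", "retrieval_token_recall"]
)
print(scored_df[["retrieval_token_f1", "retrieval_token_recall"]])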

@@ -0,0 +1,56 @@
from typing import List, Optional
from llama_index.core import PromptTemplate
from llama_index.core.prompts import PromptType
from llama_index.core.prompts.utils import is_chat_model
from llama_index.core.response_synthesizers import TreeSummarize as ts
from autorag.nodes.passagecompressor.base import LlamaIndexCompressor
from autorag.utils.util import get_event_loop, process_batch
class TreeSummarize(LlamaIndexCompressor):
def _pure(
self,
queries: List[str],
contents: List[List[str]],
prompt: Optional[str] = None,
chat_prompt: Optional[str] = None,
batch: int = 16,
) -> List[str]:
"""
Recursively merge retrieved texts and summarizes them in a bottom-up fashion.
This function is a wrapper for llama_index.response_synthesizers.TreeSummarize.
For more information, visit https://docs.llamaindex.ai/en/latest/examples/response_synthesizers/tree_summarize.html.
:param queries: The queries for retrieved passages.
:param contents: The contents of retrieved passages.
:param prompt: The prompt template for summarization.
If you want to use chat prompt, you should pass chat_prompt instead.
In the prompt, you must specify where to put 'context_str' and 'query_str'.
Default is None. When it is None, it will use llama index default prompt.
:param chat_prompt: The chat prompt template for summarization.
If you want to use normal prompt, you should pass prompt instead.
In the chat prompt, you must specify where to put 'context_str' and 'query_str'.
Default is None. When it is None, it will use llama index default chat prompt.
:param batch: The batch size for llm.
Set low if you face some errors.
Default is 16.
:return: The list of compressed texts.
"""
if prompt is not None and not is_chat_model(self.llm):
summary_template = PromptTemplate(prompt, prompt_type=PromptType.SUMMARY)
elif chat_prompt is not None and is_chat_model(self.llm):
summary_template = PromptTemplate(
chat_prompt, prompt_type=PromptType.SUMMARY
)
else:
summary_template = None
summarizer = ts(llm=self.llm, summary_template=summary_template, use_async=True)
tasks = [
summarizer.aget_response(query, content)
for query, content in zip(queries, contents)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
return results

@@ -0,0 +1,6 @@
from .pass_passage_filter import PassPassageFilter
from .percentile_cutoff import PercentileCutoff
from .recency import RecencyFilter
from .similarity_percentile_cutoff import SimilarityPercentileCutoff
from .similarity_threshold_cutoff import SimilarityThresholdCutoff
from .threshold_cutoff import ThresholdCutoff

@@ -0,0 +1,50 @@
import abc
import logging
from pathlib import Path
from typing import Union
import pandas as pd
from autorag.schema.base import BaseModule
from autorag.utils import validate_qa_dataset
logger = logging.getLogger("AutoRAG")
class BasePassageFilter(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
logger.info(f"Initialize passage filter node - {self.__class__.__name__}")
def __del__(self):
logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running passage filter node - {self.__class__.__name__} module..."
)
validate_qa_dataset(previous_result)
# find queries columns
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
# find contents_list columns
assert (
"retrieved_contents" in previous_result.columns
), "previous_result must have retrieved_contents column."
contents = previous_result["retrieved_contents"].tolist()
# find scores columns
assert (
"retrieve_scores" in previous_result.columns
), "previous_result must have retrieve_scores column."
scores = previous_result["retrieve_scores"].tolist()
# find ids columns
assert (
"retrieved_ids" in previous_result.columns
), "previous_result must have retrieved_ids column."
ids = previous_result["retrieved_ids"].tolist()
return queries, contents, scores, ids

View File

@@ -0,0 +1,14 @@
import pandas as pd
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.utils import result_to_dataframe
class PassPassageFilter(BasePassageFilter):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents, scores, ids = self.cast_to_run(previous_result)
return contents, ids, scores
def _pure(self, *args, **kwargs):
pass

View File

@@ -0,0 +1,58 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.utils.util import sort_by_scores, select_top_k, result_to_dataframe
class PercentileCutoff(BasePassageFilter):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
return self._pure(queries, contents, scores, ids, *args, **kwargs)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
percentile: float,
reverse: bool = False,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Keep only the top passages, where the number kept is the number of contents times the percentile.
This is a filter and does not override scores.
If the number of contents times the percentile is less than 1, only the single highest-scoring content is kept.
:param queries: The list of queries to use for filtering
:param contents_list: The list of lists of contents to filter
:param scores_list: The list of lists of scores retrieved
:param ids_list: The list of lists of ids retrieved
:param percentile: The percentile to cut off
:param reverse: If True, the lower the score, the better
Default is False.
:return: Tuple of lists containing the filtered contents, ids, and scores
"""
num_top_k = max(1, int(len(scores_list[0]) * percentile))
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": scores_list,
}
)
reverse = not reverse
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand", reverse=reverse
)
results = select_top_k(df, ["contents", "ids", "scores"], num_top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
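# Quick self-contained sanity check of the cutoff arithmetic above;
# the scores and percentile values are made up.
scores = [0.91, 0.87, 0.42, 0.40, 0.33, 0.31, 0.25, 0.20, 0.11, 0.05]
num_top_k = max(1, int(len(scores) * 0.35))  # 10 passages, percentile 0.35
assert num_top_k == 3
assert max(1, int(2 * 0.2)) == 1  # the max(1, ...) guard always keeps one passage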

View File

@@ -0,0 +1,105 @@
import logging
import os
from datetime import datetime, date
from pathlib import Path
from typing import List, Tuple, Union
import pandas as pd
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.utils import fetch_contents, result_to_dataframe
logger = logging.getLogger("AutoRAG")
class RecencyFilter(BasePassageFilter):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
super().__init__(project_dir, *args, **kwargs)
self.corpus_df = pd.read_parquet(
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
)
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents, scores, ids = self.cast_to_run(previous_result, *args, **kwargs)
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
times = [
[time["last_modified_datetime"] for time in time_list]
for time_list in metadatas
]
return self._pure(contents, scores, ids, times, *args, **kwargs)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
time_list: List[List[datetime]],
threshold_datetime: Union[datetime, date],
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Filter out the contents whose datetime is older than the threshold datetime.
If all contents would be filtered out, only the most recent content is kept.
If the threshold date format is incorrect, return the original contents.
:param contents_list: The list of lists of contents to filter
:param scores_list: The list of lists of scores retrieved
:param ids_list: The list of lists of ids retrieved
:param time_list: The list of lists of datetime retrieved
:param threshold_datetime: The threshold to cut off.
In the recency filter, you have to use a datetime.datetime or datetime.date object.
All you need to do is set the date in your YAML file.
For example, you can write "2010-09-09 3:45:06" or "2010-09-09" in the YAML file.
:return: Tuple of lists containing the filtered contents, ids, and scores
"""
if not (
isinstance(threshold_datetime, datetime)
or isinstance(threshold_datetime, date)
):
raise ValueError(
f"Threshold should be a datetime object, but got {type(threshold_datetime)}"
)
if not isinstance(threshold_datetime, datetime):
threshold_datetime = datetime.combine(
threshold_datetime, datetime.min.time()
)
time_list = [
list(
map(
lambda t: datetime.combine(t, datetime.min.time())
if not isinstance(t, datetime)
else t,
time,
)
)
for time in time_list
]
def sort_row(contents, scores, ids, time, _datetime_threshold):
combined = list(zip(contents, scores, ids, time))
combined_filtered = [
item for item in combined if item[3] >= _datetime_threshold
]
if combined_filtered:
remain_contents, remain_scores, remain_ids, _ = zip(*combined_filtered)
else:
combined.sort(key=lambda x: x[3], reverse=True)
remain_contents, remain_scores, remain_ids, _ = zip(*combined[:1])
return list(remain_contents), list(remain_ids), list(remain_scores)
remain_contents_list, remain_ids_list, remain_scores_list = zip(
*map(
sort_row,
contents_list,
scores_list,
ids_list,
time_list,
[threshold_datetime] * len(contents_list),
)
)
return remain_contents_list, remain_ids_list, remain_scores_list
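# Small self-contained sketch of the date normalization and cutoff behaviour
# described above; the dates are invented.
from datetime import date, datetime

threshold = datetime.combine(date(2024, 1, 1), datetime.min.time())  # -> 2024-01-01 00:00:00
times = [datetime(2023, 12, 31), datetime(2024, 3, 2)]
kept = [t for t in times if t >= threshold]  # older passages are dropped
assert kept == [datetime(2024, 3, 2)]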

View File

@@ -0,0 +1,138 @@
import os
import pathlib
from typing import List, Dict
import pandas as pd
from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list, apply_recursive
def run_passage_filter_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among passage filter node results.
:param modules: Passage filter modules to run.
:param module_params: Passage filter module parameters.
:param previous_result: Previous result dataframe.
Could be retrieval, reranker, passage filter modules result.
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for passage filter node.
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
You can skip evaluation when you use only one module with a single set of module parameters.
:return: The best result dataframe with previous result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError("You must at least one metrics for passage_filter evaluation.")
results = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
results,
)
)
# save results to folder
save_dir = os.path.join(node_line_dir, "passage_filter") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
filepaths = list(
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
**{
f"passage_filter_{metric}": list(
map(lambda result: result[metric].mean(), results)
)
for metric in strategies.get("metrics")
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
selected_result = selected_result.rename(
columns={
metric_name: f"passage_filter_{metric_name}"
for metric_name in strategies["metrics"]
}
)
previous_result = previous_result.drop(
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column to summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result
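# Hedged example of a strategies dict this runner accepts, limited to the keys
# the function body actually reads ("metrics", "speed_threshold", "strategy");
# the concrete values are illustrative only.
strategies = {
    "metrics": ["retrieval_f1", "retrieval_recall", "retrieval_precision"],
    "speed_threshold": 10,  # optional; compared against the average per-row execution time
    "strategy": "mean",     # how select_best aggregates the metric columns
}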

View File

@@ -0,0 +1,134 @@
from pathlib import Path
from typing import List, Tuple, Union
import numpy as np
import pandas as pd
from autorag.embedding.base import EmbeddingModel
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.nodes.passagefilter.similarity_threshold_cutoff import (
embedding_query_content,
)
from autorag.utils import result_to_dataframe
from autorag.utils.util import empty_cuda_cache, pop_params
class SimilarityPercentileCutoff(BasePassageFilter):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
"""
Initialize the SimilarityPercentileCutoff module
:param project_dir: The project directory to use for initializing the module
:param embedding_model: The embedding model string to use for calculating similarity
Default is "openai" which is OpenAI text-embedding-ada-002 embedding model.
"""
super().__init__(project_dir, *args, **kwargs)
embedding_model = kwargs.pop("embedding_model", "openai")
self.embedding_model = EmbeddingModel.load(embedding_model)()
def __del__(self):
super().__del__()
del self.embedding_model
empty_cuda_cache()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
kwargs = pop_params(self._pure, kwargs)
return self._pure(queries, contents, scores, ids, **kwargs)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
percentile: float,
batch: int = 128,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Re-calculate each content's similarity with the query and keep only the top passages, where the number kept is
the number of contents times the percentile. This is a filter and does not override scores; the output scores
do not come from query-content similarity.
If the number of contents times the percentile is less than 1, only the single highest-similarity content is kept.
:param queries: The list of queries to use for filtering
:param contents_list: The list of lists of contents to filter
:param scores_list: The list of lists of scores retrieved
:param ids_list: The list of lists of ids retrieved
:param percentile: The percentile to cut off
:param batch: The number of queries to be processed in a batch
Default is 128.
:return: Tuple of lists containing the filtered contents, ids, and scores
"""
query_embeddings, content_embeddings = embedding_query_content(
queries, contents_list, self.embedding_model, batch
)
results = list(
map(
lambda x: self.__row_pure(x[0], x[1], x[2], x[3], x[4], percentile),
zip(
query_embeddings,
content_embeddings,
contents_list,
ids_list,
scores_list,
),
)
)
remain_content_list = list(map(lambda x: x[0], results))
remain_ids_list = list(map(lambda x: x[1], results))
remain_scores_list = list(map(lambda x: x[2], results))
return remain_content_list, remain_ids_list, remain_scores_list
@staticmethod
def __row_pure(
query_embedding: List[float],
content_embeddings: List[List[float]],
content_list: List[str],
ids_list: List[str],
scores_list: List[float],
percentile: float,
) -> Tuple[List[str], List[str], List[float]]:
"""
Return tuple of lists containing the filtered contents, ids, and scores
:param query_embedding: Query embedding
:param content_embeddings: Each content embedding
:param content_list: Each content
:param ids_list: Each id
:param scores_list: Each score
:param percentile: The percentile to cut off
:return: Tuple of lists containing the filtered contents, ids, and scores
"""
num_top_k = int(len(content_embeddings) * percentile)
if num_top_k == 0:
num_top_k = 1
similarities = np.array(
list(
map(
lambda x: calculate_cosine_similarity(query_embedding, x),
content_embeddings,
)
)
).tolist()
content_id_score_similarity = list(
zip(ids_list, content_list, scores_list, similarities)
)
sorted_content_id_score_similarity = sorted(
content_id_score_similarity, key=lambda x: x[3], reverse=True
)[:num_top_k]
content_result, id_result, score_result, _ = zip(
*sorted_content_id_score_similarity
)
return list(content_result), list(id_result), list(score_result)

View File

@@ -0,0 +1,112 @@
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.embedding.base import EmbeddingModel
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.utils.util import (
embedding_query_content,
empty_cuda_cache,
result_to_dataframe,
pop_params,
)
class SimilarityThresholdCutoff(BasePassageFilter):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize the SimilarityThresholdCutoff module
:param project_dir: The project directory to use for initializing the module
:param embedding_model: The embedding model string to use for calculating similarity
Default is "openai" which is OpenAI text-embedding-ada-002 embedding model.
"""
super().__init__(project_dir, *args, **kwargs)
embedding_model = kwargs.get("embedding_model", "openai")
self.embedding_model = EmbeddingModel.load(embedding_model)()
def __del__(self):
del self.embedding_model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
kwargs = pop_params(self._pure, kwargs)
queries, contents, scores, ids = self.cast_to_run(previous_result)
return self._pure(queries, contents, scores, ids, *args, **kwargs)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
threshold: float,
batch: int = 128,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Re-calculate each content's similarity with the query and filter out the contents that are below the threshold.
If all contents would be filtered out, only the single highest-similarity content is kept.
This is a filter and does not override scores.
The output of scores is not coming from query-content similarity.
:param queries: The list of queries to use for filtering
:param contents_list: The list of lists of contents to filter
:param scores_list: The list of lists of scores retrieved
:param ids_list: The list of lists of ids retrieved
:param threshold: The threshold to cut off
:param batch: The number of queries to be processed in a batch
Default is 128.
:return: Tuple of lists containing the filtered contents, ids, and scores
"""
query_embeddings, content_embeddings = embedding_query_content(
queries, contents_list, self.embedding_model, batch
)
remain_indices = list(
map(
lambda x: self.__row_pure(x[0], x[1], threshold),
zip(query_embeddings, content_embeddings),
)
)
remain_content_list = list(
map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices)
)
remain_scores_list = list(
map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices)
)
remain_ids_list = list(
map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices)
)
return remain_content_list, remain_ids_list, remain_scores_list
@staticmethod
def __row_pure(
query_embedding: List[float], content_embeddings: List[List[float]], threshold: float
) -> List[int]:
"""
Return indices that have to remain.
Return at least one index if there is nothing to remain.
:param query_embedding: Query embedding
:param content_embeddings: Each content embedding
:param threshold: The threshold to cut off
:return: Indices to remain at the contents
"""
similarities = np.array(
list(
map(
lambda x: calculate_cosine_similarity(query_embedding, x),
content_embeddings,
)
)
)
result = np.where(similarities >= threshold)[0].tolist()
if len(result) > 0:
return result
return [np.argmax(similarities)]
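# Compact self-contained sketch of the index-selection rule in __row_pure;
# the similarity values are invented.
import numpy as np

similarities = np.array([0.42, 0.81, 0.77, 0.30])
kept = np.where(similarities >= 0.75)[0].tolist()
assert kept == [1, 2]
# When nothing clears the threshold, fall back to the single best index.
similarities = np.array([0.10, 0.20])
kept = np.where(similarities >= 0.75)[0].tolist() or [int(np.argmax(similarities))]
assert kept == [1]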

View File

@@ -0,0 +1,78 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagefilter.base import BasePassageFilter
from autorag.utils.util import convert_inputs_to_list, result_to_dataframe
class ThresholdCutoff(BasePassageFilter):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents, scores, ids = self.cast_to_run(previous_result)
return self._pure(contents, scores, ids, *args, **kwargs)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
threshold: float,
reverse: bool = False,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Filters the contents, scores, and ids based on a previous result's score.
Keeps at least one item per query if all scores are below the threshold.
:param contents_list: List of content strings for each query.
:param scores_list: List of scores for each content.
:param ids_list: List of ids for each content.
:param threshold: The minimum score to keep an item.
:param reverse: If True, the lower the score, the better.
Default is False.
:return: Filtered lists of contents, ids, and scores.
"""
remain_indices = list(
map(lambda x: self.__row_pure(x, threshold, reverse), scores_list)
)
remain_content_list = list(
map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices)
)
remain_scores_list = list(
map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices)
)
remain_ids_list = list(
map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices)
)
return remain_content_list, remain_ids_list, remain_scores_list
@convert_inputs_to_list
def __row_pure(
self, scores_list: List[float], threshold: float, reverse: bool = False
) -> List[int]:
"""
Return indices that have to remain.
Return at least one index if there is nothing to remain.
:param scores_list: Each score
:param threshold: The threshold to cut off
:param reverse: If True, the lower the score, the better
Default is False.
:return: Indices to remain at the contents
"""
assert isinstance(scores_list, list), "scores_list must be a list."
if reverse:
remain_indices = [
i for i, score in enumerate(scores_list) if score <= threshold
]
default_index = scores_list.index(min(scores_list))
else:
remain_indices = [
i for i, score in enumerate(scores_list) if score >= threshold
]
default_index = scores_list.index(max(scores_list))
return remain_indices if remain_indices else [default_index]
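# Short check of the reverse-flag semantics described above, with made-up scores.
scores = [0.9, 0.2, 0.5]
threshold = 0.4
keep_normal = [i for i, s in enumerate(scores) if s >= threshold]   # reverse=False
keep_reverse = [i for i, s in enumerate(scores) if s <= threshold]  # reverse=True
assert keep_normal == [0, 2]
assert keep_reverse == [1]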

View File

@@ -0,0 +1,18 @@
from .cohere import CohereReranker
from .colbert import ColbertReranker
from .flag_embedding import FlagEmbeddingReranker
from .flag_embedding_llm import FlagEmbeddingLLMReranker
from .jina import JinaReranker
from .koreranker import KoReranker
from .monot5 import MonoT5
from .pass_reranker import PassReranker
from .rankgpt import RankGPT
from .sentence_transformer import SentenceTransformerReranker
from .time_reranker import TimeReranker
from .upr import Upr
from .openvino import OpenVINOReranker
from .voyageai import VoyageAIReranker
from .mixedbreadai import MixedbreadAIReranker
from .flashrank import FlashRankReranker
from .dragonkue2 import DragonKue2  # 250313 added - 김용연

View File

@@ -0,0 +1,55 @@
import abc
import logging
from pathlib import Path
from typing import Union
import pandas as pd
from autorag.schema import BaseModule
from autorag.utils import validate_qa_dataset
logger = logging.getLogger("AutoRAG")
class BasePassageReranker(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
logger.info(
f"Initialize passage reranker node - {self.__class__.__name__} module..."
)
def __del__(self):
logger.info(
f"Deleting passage reranker node - {self.__class__.__name__} module..."
)
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running passage reranker node - {self.__class__.__name__} module..."
)
validate_qa_dataset(previous_result)
# find queries columns
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
# find contents_list columns
assert (
"retrieved_contents" in previous_result.columns
), "previous_result must have retrieved_contents column."
contents = previous_result["retrieved_contents"].tolist()
# find scores columns
assert (
"retrieve_scores" in previous_result.columns
), "previous_result must have retrieve_scores column."
scores = previous_result["retrieve_scores"].tolist()
# find ids columns
assert (
"retrieved_ids" in previous_result.columns
), "previous_result must have retrieved_ids column."
ids = previous_result["retrieved_ids"].tolist()
return queries, contents, scores, ids

View File

@@ -0,0 +1,119 @@
import os
from typing import List, Tuple
import cohere
import pandas as pd
from cohere import RerankResponseResultsItem
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
class CohereReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize Cohere rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for Cohere rerank.
You can set it in the environment variable COHERE_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "COHERE_API_KEY".
:param kwargs: Extra arguments that are not used.
"""
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("COHERE_API_KEY", None) if api_key is None else api_key
if api_key is None:
api_key = os.getenv("CO_API_KEY", None)
if api_key is None:
raise KeyError(
"Please set the API key for Cohere rerank in the environment variable COHERE_API_KEY "
"or directly set it on the config YAML file."
)
self.cohere_client = cohere.AsyncClientV2(api_key=api_key)
def __del__(self):
del self.cohere_client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
model = kwargs.pop("model", "rerank-v3.5")
return self._pure(queries, contents, scores, ids, top_k, batch, model)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
model: str = "rerank-v3.5",
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Cohere rerank models.
You can get the API key from https://cohere.com/rerank and set it in the environment variable COHERE_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param scores_list: The list of lists of scores retrieved from the initial ranking
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:param model: The model name for Cohere rerank.
You can choose between "rerank-v3.5", "rerank-english-v3.0", and "rerank-multilingual-v3.0".
Default is "rerank-v3.5".
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
# Run async cohere_rerank_pure function
tasks = [
cohere_rerank_pure(self.cohere_client, model, query, document, ids, top_k)
for query, document, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
content_result = list(map(lambda x: x[0], results))
id_result = list(map(lambda x: x[1], results))
score_result = list(map(lambda x: x[2], results))
return content_result, id_result, score_result
async def cohere_rerank_pure(
cohere_client: cohere.AsyncClientV2,
model: str,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with Cohere rerank models.
:param cohere_client: The Cohere AsyncClient to use for reranking
:param model: The model name for Cohere rerank
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
rerank_results = await cohere_client.rerank(
model=model,
query=query,
documents=documents,
top_n=top_k,
return_documents=False,
)
results: List[RerankResponseResultsItem] = rerank_results.results
reranked_scores: List[float] = list(map(lambda x: x.relevance_score, results))
indices = list(map(lambda x: x.index, results))
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores
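# Minimal illustration of how the indices returned by the rerank call map back
# onto contents and ids, mirroring the tail of cohere_rerank_pure; the documents
# and indices here are invented.
documents = ["doc a", "doc b", "doc c"]
ids = ["id-a", "id-b", "id-c"]
indices = [2, 0]  # as if returned by the rerank API, best match first
reranked_contents = [documents[i] for i in indices]
reranked_ids = [ids[i] for i in indices]
assert reranked_contents == ["doc c", "doc a"]
assert reranked_ids == ["id-c", "id-a"]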

View File

@@ -0,0 +1,213 @@
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class ColbertReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "colbert-ir/colbertv2.0",
*args,
**kwargs,
):
"""
Initialize a colbert rerank model for reranking.
:param project_dir: The project directory
:param model_name: The model name for Colbert rerank.
You can choose a colbert model for reranking.
The default is "colbert-ir/colbertv2.0".
:param kwargs: Extra parameter for the model.
"""
super().__init__(project_dir)
try:
import torch
from transformers import AutoModel, AutoTokenizer
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
model_params = pop_params(AutoModel.from_pretrained, kwargs)
self.model = AutoModel.from_pretrained(model_name, **model_params).to(
self.device
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Colbert rerank models.
You can get more information about a Colbert model at https://huggingface.co/colbert-ir/colbertv2.0.
It uses a BERT-based model, so using a CUDA GPU is recommended for faster reranking.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
# get query and content embeddings
query_embedding_list = get_colbert_embedding_batch(
queries, self.model, self.tokenizer, batch
)
content_embedding_list = flatten_apply(
get_colbert_embedding_batch,
contents_list,
model=self.model,
tokenizer=self.tokenizer,
batch_size=batch,
)
df = pd.DataFrame(
{
"ids": ids_list,
"query_embedding": query_embedding_list,
"contents": contents_list,
"content_embedding": content_embedding_list,
}
)
temp_df = df.explode("content_embedding")
temp_df["score"] = temp_df.apply(
lambda x: get_colbert_score(x["query_embedding"], x["content_embedding"]),
axis=1,
)
df["scores"] = (
temp_df.groupby(level=0, sort=False)["score"].apply(list).tolist()
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def get_colbert_embedding_batch(
input_strings: List[str], model, tokenizer, batch_size: int
) -> List[np.array]:
try:
import torch
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
encoding = tokenizer(
input_strings,
return_tensors="pt",
padding=True,
truncation=True,
max_length=model.config.max_position_embeddings,
)
input_batches = slice_tokenizer_result(encoding, batch_size)
result_embedding = []
with torch.no_grad():
for encoding_batch in input_batches:
result_embedding.append(model(**encoding_batch).last_hidden_state)
total_tensor = torch.cat(
result_embedding, dim=0
) # shape [batch_size, token_length, embedding_dim]
tensor_results = list(total_tensor.chunk(total_tensor.size()[0]))
if torch.cuda.is_available():
return list(map(lambda x: x.detach().cpu().numpy(), tensor_results))
else:
return list(map(lambda x: x.detach().numpy(), tensor_results))
def slice_tokenizer_result(tokenizer_output, batch_size):
input_ids_batches = slice_tensor(tokenizer_output["input_ids"], batch_size)
attention_mask_batches = slice_tensor(
tokenizer_output["attention_mask"], batch_size
)
token_type_ids_batches = slice_tensor(
tokenizer_output.get("token_type_ids", None), batch_size
)
return [
{
"input_ids": input_ids,
"attention_mask": attention_mask,
"token_type_ids": token_type_ids,
}
for input_ids, attention_mask, token_type_ids in zip(
input_ids_batches, attention_mask_batches, token_type_ids_batches
)
]
def slice_tensor(input_tensor, batch_size):
try:
import torch
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
)
# Calculate the number of full batches
num_full_batches = input_tensor.size(0) // batch_size
# Slice the tensor into batches
tensor_list = [
input_tensor[i * batch_size : (i + 1) * batch_size]
for i in range(num_full_batches)
]
# Handle the last batch if it's smaller than batch_size
remainder = input_tensor.size(0) % batch_size
if remainder:
tensor_list.append(input_tensor[-remainder:])
device = "cuda" if torch.cuda.is_available() else "cpu"
tensor_list = list(map(lambda x: x.to(device), tensor_list))
return tensor_list
def get_colbert_score(query_embedding: np.array, content_embedding: np.array) -> float:
if query_embedding.ndim == 3 and content_embedding.ndim == 3:
query_embedding = query_embedding.reshape(-1, query_embedding.shape[-1])
content_embedding = content_embedding.reshape(-1, content_embedding.shape[-1])
sim_matrix = np.dot(query_embedding, content_embedding.T) / (
np.linalg.norm(query_embedding, axis=1)[:, np.newaxis]
* np.linalg.norm(content_embedding, axis=1)
)
max_sim_scores = np.max(sim_matrix, axis=1)
return float(np.mean(max_sim_scores))
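# Self-contained numeric sketch of the MaxSim scoring implemented in
# get_colbert_score, using toy 2-d token embeddings.
import numpy as np

query_embedding = np.array([[1.0, 0.0], [0.0, 1.0]])    # 2 query tokens
content_embedding = np.array([[1.0, 0.0], [0.7, 0.7]])  # 2 content tokens
sim_matrix = np.dot(query_embedding, content_embedding.T) / (
    np.linalg.norm(query_embedding, axis=1)[:, np.newaxis]
    * np.linalg.norm(content_embedding, axis=1)
)
# Max over content tokens per query token, then mean over query tokens.
score = float(np.mean(np.max(sim_matrix, axis=1)))
print(round(score, 4))  # roughly 0.8536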

View File

@@ -0,0 +1,138 @@
# 250313 reranker module_type added - 김용연
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class DragonKue2(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
except ImportError:
raise ImportError("For using dragonkue2, please install torch first.")
model_path = "dragonkue/bge-reranker-v2-m3-ko"
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
self.model.eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using dragonkue/bge-reranker-v2-m3-ko.
bge-reranker-v2-m3-ko is a reranker tuned for Korean (https://huggingface.co/dragonkue/bge-reranker-v2-m3-ko).
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
scores_nps = flatten_apply(
dragonku2_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
)
rerank_scores = list(
map(
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
)
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def dragonku2_run_model(input_texts, model, tokenizer, device, batch_size: int):  # 250313 added - 김용연
try:
import torch
except ImportError:
raise ImportError("For using drangonku2, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
inputs = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
max_length=512,
)
inputs = inputs.to(device)
with torch.no_grad():
scores = (
model(**inputs, return_dict=True)
.logits.view(
-1,
)
.float()
)
scores_np = scores.cpu().numpy()
results.extend(scores_np)
return results
def exp_normalize(x):
b = x.max()
y = np.exp(x - b)
return y / y.sum()
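# Quick numeric check that exp_normalize behaves as a numerically stable softmax;
# the logits are made up.
import numpy as np

logits = np.array([2.0, 0.5, -1.0])
shifted = np.exp(logits - logits.max())
probs = shifted / shifted.sum()
assert abs(probs.sum() - 1.0) < 1e-9
assert probs.argmax() == 0  # the largest logit keeps the largest probability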

View File

@@ -0,0 +1,112 @@
from typing import List, Tuple, Iterable
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class FlagEmbeddingReranker(BasePassageReranker):
def __init__(
self, project_dir, model_name: str = "BAAI/bge-reranker-large", *args, **kwargs
):
"""
Initialize the FlagEmbeddingReranker module.
:param project_dir: The project directory.
:param model_name: The name of the BAAI Reranker normal-model name.
Default is "BAAI/bge-reranker-large"
:param kwargs: Extra parameter for FlagEmbedding.FlagReranker
"""
super().__init__(project_dir)
try:
from FlagEmbedding import FlagReranker
except ImportError:
raise ImportError(
"FlagEmbeddingReranker requires the 'FlagEmbedding' package to be installed."
)
model_params = pop_params(FlagReranker.__init__, kwargs)
model_params.pop("model_name_or_path", None)
self.model = FlagReranker(model_name_or_path=model_name, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using BAAI normal-Reranker model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def flag_embedding_run_model(input_texts, model, batch_size: int):
try:
import torch
except ImportError:
raise ImportError("FlagEmbeddingReranker requires PyTorch to be installed.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
with torch.no_grad():
pred_scores = model.compute_score(sentence_pairs=batch_texts)
if batch_size == 1 or not isinstance(pred_scores, Iterable):
results.append(pred_scores)
else:
results.extend(pred_scores)
return results

View File

@@ -0,0 +1,101 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class FlagEmbeddingLLMReranker(BasePassageReranker):
def __init__(
self,
project_dir,
model_name: str = "BAAI/bge-reranker-v2-gemma",
*args,
**kwargs,
):
"""
Initialize the FlagEmbeddingReranker module.
:param project_dir: The project directory.
:param model_name: The name of the BAAI Reranker LLM-based-model name.
Default is "BAAI/bge-reranker-v2-gemma"
:param kwargs: Extra parameter for FlagEmbedding.FlagReranker
"""
super().__init__(project_dir)
try:
from FlagEmbedding import FlagLLMReranker
except ImportError:
raise ImportError(
"FlagEmbeddingLLMReranker requires the 'FlagEmbedding' package to be installed."
)
model_params = pop_params(FlagLLMReranker.__init__, kwargs)
model_params.pop("model_name_or_path", None)
self.model = FlagLLMReranker(model_name_or_path=model_name, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using BAAI LLM-based-Reranker model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)

View File

@@ -0,0 +1,245 @@
import json
from pathlib import Path
import pandas as pd
import numpy as np
import os
import zipfile
import requests
from tqdm import tqdm
import collections
from typing import List, Dict, Tuple
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
from autorag.utils.util import (
flatten_apply,
sort_by_scores,
select_top_k,
make_batch,
empty_cuda_cache,
)
model_url = "https://huggingface.co/prithivida/flashrank/resolve/main/{}.zip"
model_file_map = {
"ms-marco-TinyBERT-L-2-v2": "flashrank-TinyBERT-L-2-v2.onnx",
"ms-marco-MiniLM-L-12-v2": "flashrank-MiniLM-L-12-v2_Q.onnx",
"ms-marco-MultiBERT-L-12": "flashrank-MultiBERT-L12_Q.onnx",
"rank-T5-flan": "flashrank-rankt5_Q.onnx",
"ce-esci-MiniLM-L12-v2": "flashrank-ce-esci-MiniLM-L12-v2_Q.onnx",
"miniReranker_arabic_v1": "miniReranker_arabic_v1.onnx",
}
class FlashRankReranker(BasePassageReranker):
def __init__(
self, project_dir: str, model: str = "ms-marco-TinyBERT-L-2-v2", *args, **kwargs
):
"""
Initialize FlashRank rerank node.
:param project_dir: The project directory path.
:param model: The model name for FlashRank rerank.
You can get the list of available models from https://github.com/PrithivirajDamodaran/FlashRank.
Default is "ms-marco-TinyBERT-L-2-v2".
Does not support "rank_zephyr_7b_v1_full" due to a parallel inference issue.
:param kwargs: Extra arguments that are not used.
"""
super().__init__(project_dir)
try:
from tokenizers import Tokenizer
except ImportError:
raise ImportError(
"Tokenizer is not installed. Please install tokenizers to use FlashRank reranker."
)
cache_dir = kwargs.pop("cache_dir", "/tmp")
max_length = kwargs.pop("max_length", 512)
self.cache_dir: Path = Path(cache_dir)
self.model_dir: Path = self.cache_dir / model
self._prepare_model_dir(model)
model_file = model_file_map[model]
try:
import onnxruntime as ort
except ImportError:
raise ImportError(
"onnxruntime is not installed. Please install onnxruntime to use FlashRank reranker."
)
self.session = ort.InferenceSession(str(self.model_dir / model_file))
self.tokenizer: Tokenizer = self._get_tokenizer(max_length)
def __del__(self):
del self.session
del self.tokenizer
empty_cuda_cache()
super().__del__()
def _prepare_model_dir(self, model_name: str):
if not self.cache_dir.exists():
self.cache_dir.mkdir(parents=True, exist_ok=True)
if not self.model_dir.exists():
self._download_model_files(model_name)
def _download_model_files(self, model_name: str):
local_zip_file = self.cache_dir / f"{model_name}.zip"
formatted_model_url = model_url.format(model_name)
with requests.get(formatted_model_url, stream=True) as r:
r.raise_for_status()
total_size = int(r.headers.get("content-length", 0))
with (
open(local_zip_file, "wb") as f,
tqdm(
desc=local_zip_file.name,
total=total_size,
unit="iB",
unit_scale=True,
unit_divisor=1024,
) as bar,
):
for chunk in r.iter_content(chunk_size=8192):
size = f.write(chunk)
bar.update(size)
with zipfile.ZipFile(local_zip_file, "r") as zip_ref:
zip_ref.extractall(self.cache_dir)
os.remove(local_zip_file)
def _get_tokenizer(self, max_length: int = 512):
try:
from tokenizers import AddedToken, Tokenizer
except ImportError:
raise ImportError(
"Pytorch is not installed. Please install pytorch to use FlashRank reranker."
)
config = json.load(open(str(self.model_dir / "config.json")))
tokenizer_config = json.load(
open(str(self.model_dir / "tokenizer_config.json"))
)
tokens_map = json.load(open(str(self.model_dir / "special_tokens_map.json")))
tokenizer = Tokenizer.from_file(str(self.model_dir / "tokenizer.json"))
tokenizer.enable_truncation(
max_length=min(tokenizer_config["model_max_length"], max_length)
)
tokenizer.enable_padding(
pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"]
)
for token in tokens_map.values():
if isinstance(token, str):
tokenizer.add_special_tokens([token])
elif isinstance(token, dict):
tokenizer.add_special_tokens([AddedToken(**token)])
vocab_file = self.model_dir / "vocab.txt"
if vocab_file.exists():
tokenizer.vocab = self._load_vocab(vocab_file)
tokenizer.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in tokenizer.vocab.items()]
)
return tokenizer
def _load_vocab(self, vocab_file: Path) -> Dict[str, int]:
vocab = collections.OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with FlashRank rerank models.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
flashrank_run_model,
nested_list,
session=self.session,
batch_size=batch,
tokenizer=self.tokenizer,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def flashrank_run_model(input_texts, tokenizer, session, batch_size: int):
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in tqdm(batch_input_texts):
input_text = tokenizer.encode_batch(batch_texts)
input_ids = np.array([e.ids for e in input_text])
token_type_ids = np.array([e.type_ids for e in input_text])
attention_mask = np.array([e.attention_mask for e in input_text])
use_token_type_ids = token_type_ids is not None and not np.all(
token_type_ids == 0
)
onnx_input = {
"input_ids": input_ids.astype(np.int64),
"attention_mask": attention_mask.astype(np.int64),
}
if use_token_type_ids:
onnx_input["token_type_ids"] = token_type_ids.astype(np.int64)
outputs = session.run(None, onnx_input)
logits = outputs[0]
if logits.shape[1] == 1:
scores = 1 / (1 + np.exp(-logits.flatten()))
else:
exp_logits = np.exp(logits)
scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)
results.extend(scores)
return results
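# Isolated check of the score post-processing at the end of flashrank_run_model,
# with invented logits: single-logit heads go through a sigmoid, two-logit heads
# through a softmax over the positive class.
import numpy as np

single_logits = np.array([[2.0], [-1.0]])
sigmoid_scores = 1 / (1 + np.exp(-single_logits.flatten()))
pair_logits = np.array([[0.2, 2.2], [1.5, -0.5]])
exp_logits = np.exp(pair_logits)
softmax_scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)
assert sigmoid_scores.shape == (2,) and softmax_scores.shape == (2,)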

View File

@@ -0,0 +1,115 @@
import os
from typing import List, Tuple
import aiohttp
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
JINA_API_URL = "https://api.jina.ai/v1/rerank"
class JinaReranker(BasePassageReranker):
def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
"""
Initialize Jina rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for Jina rerank.
You can set it in the environment variable JINAAI_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "JINAAI_API_KEY".
:param kwargs: Extra arguments that are not used.
"""
super().__init__(project_dir)
if api_key is None:
api_key = os.getenv("JINAAI_API_KEY", None)
if api_key is None:
raise ValueError(
"API key is not provided."
"You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
)
self.session = aiohttp.ClientSession(loop=get_event_loop())
self.session.headers.update(
{"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
)
def __del__(self):
self.session.close()
del self.session
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "jina-reranker-v1-base-en")
return self._pure(queries, contents, ids, top_k, model, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "jina-reranker-v1-base-en",
batch: int = 8,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with Jina rerank models.
You can get the API key from https://jina.ai/reranker and set it in the environment variable JINAAI_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for Jina rerank.
You can choose between "jina-reranker-v1-base-en" and "jina-colbert-v1-en".
Default is "jina-reranker-v1-base-en".
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
jina_reranker_pure(
self.session, query, contents, ids, top_k=top_k, model=model
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def jina_reranker_pure(
session,
query: str,
contents: List[str],
ids: List[str],
top_k: int,
model: str = "jina-reranker-v1-base-en",
) -> Tuple[List[str], List[str], List[float]]:
async with session.post(
JINA_API_URL,
json={
"query": query,
"documents": contents,
"model": model,
"top_n": top_k,
},
) as resp:
resp_json = await resp.json()
if "results" not in resp_json:
raise RuntimeError(f"Invalid response from Jina API: {resp_json['detail']}")
results = resp_json["results"]
indices = list(map(lambda x: x["index"], results))
score_result = list(map(lambda x: x["relevance_score"], results))
id_result = list(map(lambda x: ids[x], indices))
content_result = list(map(lambda x: contents[x], indices))
return content_result, id_result, score_result

View File

@@ -0,0 +1,136 @@
from typing import List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class KoReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
except ImportError:
raise ImportError("For using KoReranker, please install torch first.")
model_path = "Dongjin-kr/ko-reranker"
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
self.model.eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using ko-reranker.
ko-reranker is a reranker tuned for Korean (https://huggingface.co/Dongjin-kr/ko-reranker).
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
Default is 64.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
scores_nps = flatten_apply(
koreranker_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
)
rerank_scores = list(
map(
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
)
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def koreranker_run_model(input_texts, model, tokenizer, device, batch_size: int):
try:
import torch
except ImportError:
raise ImportError("For using KoReranker, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
inputs = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
max_length=512,
)
inputs = inputs.to(device)
with torch.no_grad():
scores = (
model(**inputs, return_dict=True)
.logits.view(
-1,
)
.float()
)
scores_np = scores.cpu().numpy()
results.extend(scores_np)
return results
def exp_normalize(x):
b = x.max()
y = np.exp(x - b)
return y / y.sum()
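# Illustrative sketch (not part of this commit): exp_normalize is a numerically stable
# softmax over the raw ko-reranker logits, so each query's scores sum to 1.
# The logits below are invented for demonstration.
example_logits = np.array([2.1, -0.3, 0.8])
example_scores = exp_normalize(example_logits)
print(example_scores, example_scores.sum())  # probabilities summing to 1.0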

View File

@@ -0,0 +1,126 @@
import os
from typing import List, Tuple
import pandas as pd
from mixedbread_ai.client import AsyncMixedbreadAI
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
result_to_dataframe,
get_event_loop,
process_batch,
pop_params,
)
class MixedbreadAIReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
*args,
**kwargs,
):
"""
Initialize mixedbread-ai rerank node.
:param project_dir: The project directory path.
:param api_key: The API key for MixedbreadAI rerank.
You can set it in the environment variable MXBAI_API_KEY.
Or, you can directly set it on the config YAML file using this parameter.
Default is env variable "MXBAI_API_KEY".
:param kwargs: Extra keyword arguments; only api_key is used here and the rest are ignored.
"""
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("MXBAI_API_KEY", None) if api_key is None else api_key
if api_key is None:
raise KeyError(
"Please set the API key for Mixedbread AI rerank in the environment variable MXBAI_API_KEY "
"or directly set it on the config YAML file."
)
self.client = AsyncMixedbreadAI(api_key=api_key)
def __del__(self):
del self.client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "mixedbread-ai/mxbai-rerank-large-v1")
rerank_params = pop_params(self.client.reranking, kwargs)
return self._pure(queries, contents, ids, top_k, model, batch, **rerank_params)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
batch: int = 8,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with mixedbread-ai rerank models.
You can get the API key from https://www.mixedbread.ai/api-reference#quick-start-guide and set it in the environment variable MXBAI_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for mixedbread-ai rerank.
You can choose between "mixedbread-ai/mxbai-rerank-large-v1", "mixedbread-ai/mxbai-rerank-base-v1" and "mixedbread-ai/mxbai-rerank-xsmall-v1".
Default is "mixedbread-ai/mxbai-rerank-large-v1".
:param batch: The number of queries to be processed in a batch
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
mixedbreadai_rerank_pure(
self.client, query, contents, ids, top_k=top_k, model=model
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def mixedbreadai_rerank_pure(
client: AsyncMixedbreadAI,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with mixedbread-ai rerank models.
:param client: The mixedbread-ai client to use for reranking
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:param model: The model name for mixedbread-ai rerank.
You can choose between "mixedbread-ai/mxbai-rerank-large-v1" and "mixedbread-ai/mxbai-rerank-base-v1".
Default is "mixedbread-ai/mxbai-rerank-large-v1".
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
results = await client.reranking(
query=query,
input=documents,
top_k=top_k,
model=model,
)
reranked_scores: List[float] = list(map(lambda x: x.score, results.data))
reranked_scores_float = list(map(float, reranked_scores))
indices = list(map(lambda x: x.index, results.data))
reranked_contents = list(map(lambda x: documents[x], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores_float

View File

@@ -0,0 +1,190 @@
from itertools import chain
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
pop_params,
empty_cuda_cache,
)
prediction_tokens = {
"castorini/monot5-base-msmarco": ["▁false", "▁true"],
"castorini/monot5-base-msmarco-10k": ["▁false", "▁true"],
"castorini/monot5-large-msmarco": ["▁false", "▁true"],
"castorini/monot5-large-msmarco-10k": ["▁false", "▁true"],
"castorini/monot5-base-med-msmarco": ["▁false", "▁true"],
"castorini/monot5-3b-med-msmarco": ["▁false", "▁true"],
"castorini/monot5-3b-msmarco-10k": ["▁false", "▁true"],
"unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"],
"unicamp-dl/ptt5-base-pt-msmarco-10k-v2": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-pt-msmarco-100k-v2": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2": ["▁não", "▁sim"],
"unicamp-dl/mt5-base-en-pt-msmarco-v2": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-en-pt-msmarco-v1": ["▁no", "▁yes"],
"unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"],
"unicamp-dl/ptt5-base-pt-msmarco-10k-v1": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-pt-msmarco-100k-v1": ["▁não", "▁sim"],
"unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1": ["▁não", "▁sim"],
"unicamp-dl/mt5-3B-mmarco-en-pt": ["", "▁true"],
"unicamp-dl/mt5-13b-mmarco-100k": ["", "▁true"],
}
class MonoT5(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "castorini/monot5-3b-msmarco-10k",
*args,
**kwargs,
):
"""
Initialize the MonoT5 reranker.
:param project_dir: The project directory
:param model_name: The name of the MonoT5 model to use for reranking
Note: the default model name is 'castorini/monot5-3b-msmarco-10k'.
Because a '/' in the model name would break the result file path,
pass the name with '_' in place of '/'; it is converted back to '/' internally.
:param kwargs: The extra arguments for the MonoT5 reranker
"""
super().__init__(project_dir)
try:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
except ImportError:
raise ImportError("For using MonoT5 Reranker, please install torch first.")
# replace '_' to '/'
if "_" in model_name:
model_name = model_name.replace("_", "/")
# Load the tokenizer and model from the pre-trained MonoT5 model
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
model_params = pop_params(T5ForConditionalGeneration.from_pretrained, kwargs)
self.model = T5ForConditionalGeneration.from_pretrained(
model_name, **model_params
).eval()
# Determine the device to run the model on (GPU if available, otherwise CPU)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model.to(self.device)
token_false, token_true = prediction_tokens[model_name]
self.token_false_id = self.tokenizer.convert_tokens_to_ids(token_false)
self.token_true_id = self.tokenizer.convert_tokens_to_ids(token_true)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 3)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using MonoT5.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
# Format each (query, passage) pair as a MonoT5 input prompt
nested_list = [
list(map(lambda x: [f"Query: {query} Document: {x}"], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
monot5_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
token_false_id=self.token_false_id,
token_true_id=self.token_true_id,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def monot5_run_model(
input_texts,
model,
batch_size: int,
tokenizer,
device,
token_false_id,
token_true_id,
):
try:
import torch
except ImportError:
raise ImportError("For using MonoT5 Reranker, please install torch first.")
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
flattened_batch_texts = list(chain.from_iterable(batch_texts))
input_encodings = tokenizer(
flattened_batch_texts,
padding=True,
truncation=True,
max_length=512,
return_tensors="pt",
).to(device)
with torch.no_grad():
outputs = model.generate(
input_ids=input_encodings["input_ids"],
attention_mask=input_encodings["attention_mask"],
output_scores=True,
return_dict_in_generate=True,
)
# Extract logits for the 'false' and 'true' tokens from the model's output
logits = outputs.scores[-1][:, [token_false_id, token_true_id]]
# Calculate the softmax probability of the 'true' token
probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
results.extend(probs.tolist())
return results
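# Illustrative sketch (not part of this commit): MonoT5 scores a passage with the
# softmax over its "false"/"true" token logits at the first generated step. The tensor
# below stands in for outputs.scores[-1][:, [token_false_id, token_true_id]];
# the numbers are invented for demonstration.
import torch
example_logits = torch.tensor([[1.2, 3.4], [2.0, 0.5]])  # passage 1 looks relevant, passage 2 does not
example_probs = torch.nn.functional.softmax(example_logits, dim=-1)[:, 1]
print(example_probs.tolist())  # roughly [0.90, 0.18], used as rerank scores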

View File

@@ -0,0 +1,191 @@
from pathlib import Path
from typing import Any, List, Tuple
import numpy as np
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
pop_params,
empty_cuda_cache,
)
class OpenVINOReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model: str = "BAAI/bge-reranker-large",
*args,
**kwargs,
):
super().__init__(project_dir)
try:
from huggingface_hub import HfApi
from transformers import AutoTokenizer
except ImportError as e:
raise ValueError(
"Could not import huggingface_hub python package. "
"Please install it with: "
"`pip install -U huggingface_hub`."
) from e
def require_model_export(
model_id: str, revision: Any = None, subfolder: Any = None
) -> bool:
model_dir = Path(model_id)
if subfolder is not None:
model_dir = model_dir / subfolder
if model_dir.is_dir():
return (
not (model_dir / "openvino_model.xml").exists()
or not (model_dir / "openvino_model.bin").exists()
)
hf_api = HfApi()
try:
model_info = hf_api.model_info(model_id, revision=revision or "main")
normalized_subfolder = (
None if subfolder is None else Path(subfolder).as_posix()
)
model_files = [
file.rfilename
for file in model_info.siblings
if normalized_subfolder is None
or file.rfilename.startswith(normalized_subfolder)
]
ov_model_path = (
"openvino_model.xml"
if subfolder is None
else f"{normalized_subfolder}/openvino_model.xml"
)
return (
ov_model_path not in model_files
or ov_model_path.replace(".xml", ".bin") not in model_files
)
except Exception:
return True
try:
from optimum.intel.openvino import OVModelForSequenceClassification
except ImportError:
raise ImportError(
"Please install optimum package to use OpenVINOReranker"
"pip install 'optimum[openvino,nncf]'"
)
model_kwargs = pop_params(
OVModelForSequenceClassification.from_pretrained, kwargs
)
if require_model_export(model):
# OpenVINO IR files are missing; export the model on the fly
self.model = OVModelForSequenceClassification.from_pretrained(
model, export=True, **model_kwargs
)
else:
# OpenVINO IR files already exist; load them directly
self.model = OVModelForSequenceClassification.from_pretrained(
model, **model_kwargs
)
self.tokenizer = AutoTokenizer.from_pretrained(model)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 3)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents, ids, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using an OpenVINO-optimized cross-encoder.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
# Build (query, passage) pairs for the cross-encoder
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
openvino_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def openvino_run_model(
input_texts,
model,
batch_size: int,
tokenizer,
):
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
input_tensors = tokenizer(
batch_texts,
padding=True,
truncation=True,
return_tensors="pt",
)
outputs = model(**input_tensors, return_dict=True)
if outputs[0].shape[1] > 1:
scores = outputs[0][:, 1]
else:
scores = outputs[0].flatten()
scores = list(map(float, (1 / (1 + np.exp(-np.array(scores))))))
results.extend(scores)
return results
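# Illustrative sketch (not part of this commit): when the model emits a single logit
# per (query, passage) pair, openvino_run_model squashes it into (0, 1) with a sigmoid.
# The raw scores below are invented for demonstration.
example_raw = np.array([3.2, -1.1, 0.4])
example_sigmoid = 1 / (1 + np.exp(-example_raw))
print([float(s) for s in example_sigmoid])  # scores in (0, 1); higher means more relevant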

View File

@@ -0,0 +1,31 @@
from typing import List
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
class PassReranker(BasePassageReranker):
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
top_k = kwargs.pop("top_k")
_, contents_list, scores_list, ids_list = self.cast_to_run(previous_result)
return self._pure(contents_list, scores_list, ids_list, top_k)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
):
"""
Do not perform reranking.
Return the given top-k passages as is.
"""
contents_list = list(map(lambda x: x[:top_k], contents_list))
scores_list = list(map(lambda x: x[:top_k], scores_list))
ids_list = list(map(lambda x: x[:top_k], ids_list))
return contents_list, ids_list, scores_list

View File

@@ -0,0 +1,170 @@
from typing import List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
from llama_index.core.base.llms.types import ChatMessage, ChatResponse
from llama_index.core.llms import LLM
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
from llama_index.core.utils import print_text
from llama_index.llms.openai import OpenAI
from autorag import generator_models
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
get_event_loop,
process_batch,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class RankGPT(BasePassageReranker):
def __init__(
self, project_dir: str, llm: Optional[Union[str, LLM]] = None, **kwargs
):
"""
Initialize the RankGPT reranker.
:param project_dir: The project directory
:param llm: The LLM model to use for RankGPT rerank.
It is a llama index model.
Default is the OpenAI model with gpt-4o-mini.
:param kwargs: The keyword arguments for the LLM model.
"""
super().__init__(project_dir)
if llm is None:
self.llm = OpenAI(model="gpt-4o-mini")
else:
if not isinstance(llm, LLM):
llm_class = generator_models[llm]
llm_param = pop_params(llm_class.__init__, kwargs)
self.llm = llm_class(**llm_param)
else:
self.llm = llm
def __del__(self):
del self.llm
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.get("top_k", 1)
verbose = kwargs.get("verbose", False)
rankgpt_rerank_prompt = kwargs.get("rankgpt_rerank_prompt", None)
batch = kwargs.get("batch", 16)
return self._pure(
queries=queries,
contents_list=contents,
scores_list=scores,
ids_list=ids,
top_k=top_k,
verbose=verbose,
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
batch=batch,
)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
verbose: bool = False,
rankgpt_rerank_prompt: Optional[str] = None,
batch: int = 16,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank given context paragraphs using RankGPT.
Return pseudo scores, since the actual scores are not available on RankGPT.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param scores_list: The list of lists of scores retrieved from the initial ranking
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param verbose: Whether to print intermediate steps.
:param rankgpt_rerank_prompt: The prompt template for RankGPT rerank.
Default is RankGPT's default prompt.
:param batch: The number of queries to be processed in a batch.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
query_bundles = list(map(lambda query: QueryBundle(query_str=query), queries))
nodes_list = [
list(
map(
lambda x: NodeWithScore(node=TextNode(text=x[0]), score=x[1]),
zip(content_list, score_list),
)
)
for content_list, score_list in zip(contents_list, scores_list)
]
reranker = AsyncRankGPTRerank(
top_n=top_k,
llm=self.llm,
verbose=verbose,
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
)
tasks = [
reranker.async_postprocess_nodes(nodes, query, ids)
for nodes, query, ids in zip(nodes_list, query_bundles, ids_list)
]
loop = get_event_loop()
rerank_result = loop.run_until_complete(process_batch(tasks, batch_size=batch))
content_result = [
list(map(lambda x: x.node.text, res[0])) for res in rerank_result
]
score_result = [
np.linspace(1.0, 0.0, len(res[0])).tolist() for res in rerank_result
]
id_result = [res[1] for res in rerank_result]
del reranker
return content_result, id_result, score_result
class AsyncRankGPTRerank(RankGPTRerank):
async def async_run_llm(self, messages: Sequence[ChatMessage]) -> ChatResponse:
return await self.llm.achat(messages)
async def async_postprocess_nodes(
self,
nodes: List[NodeWithScore],
query_bundle: QueryBundle,
ids: Optional[List[str]] = None,
) -> Tuple[List[NodeWithScore], List[str]]:
if ids is None:
ids = [str(i) for i in range(len(nodes))]
items = {
"query": query_bundle.query_str,
"hits": [{"content": node.get_content()} for node in nodes],
}
messages = self.create_permutation_instruction(item=items)
permutation = await self.async_run_llm(messages=messages)
if permutation.message is not None and permutation.message.content is not None:
rerank_ranks = self._receive_permutation(
items, str(permutation.message.content)
)
if self.verbose:
print_text(f"After Reranking, new rank list for nodes: {rerank_ranks}")
initial_results: List[NodeWithScore] = []
id_results = []
for idx in rerank_ranks:
initial_results.append(
NodeWithScore(node=nodes[idx].node, score=nodes[idx].score)
)
id_results.append(ids[idx])
return initial_results[: self.top_n], id_results[: self.top_n]
else:
return nodes[: self.top_n], ids[: self.top_n]
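# Illustrative sketch (not part of this commit): RankGPT only returns an ordering, so
# _pure assigns pseudo scores with np.linspace, evenly spaced from 1.0 down to 0.0
# over the reranked passages.
example_num_passages = 4
print(np.linspace(1.0, 0.0, example_num_passages).tolist())  # [1.0, 0.666..., 0.333..., 0.0]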

View File

@@ -0,0 +1,145 @@
import logging
import os
import pathlib
from typing import List, Dict
import pandas as pd
from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import apply_recursive, to_list
logger = logging.getLogger("AutoRAG")
def run_passage_reranker_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among passage reranker node results.
:param modules: Passage reranker modules to run.
:param module_params: Passage reranker module parameters.
:param previous_result: Previous result dataframe.
It can be the result of a retrieval or reranker module,
so it must contain 'query', 'retrieved_contents', 'retrieved_ids', and 'retrieve_scores' columns.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for passage reranker node.
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
Evaluation can be skipped when only one module with a single parameter set is used.
:return: The best result dataframe with previous result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError(
"You must at least one metrics for passage_reranker evaluation."
)
results = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
results,
)
)
# save results to folder
save_dir = os.path.join(node_line_dir, "passage_reranker") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
filepaths = list(
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
**{
f"passage_reranker_{metric}": list(
map(lambda result: result[metric].mean(), results)
)
for metric in strategies.get("metrics")
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
# change metric name columns to passage_reranker_metric_name
selected_result = selected_result.rename(
columns={
metric_name: f"passage_reranker_{metric_name}"
for metric_name in strategies["metrics"]
}
)
# drop retrieval result columns in previous_result
previous_result = previous_result.drop(
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column to summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result

View File

@@ -0,0 +1,129 @@
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import (
flatten_apply,
make_batch,
select_top_k,
sort_by_scores,
pop_params,
result_to_dataframe,
empty_cuda_cache,
)
class SentenceTransformerReranker(BasePassageReranker):
def __init__(
self,
project_dir: str,
model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
*args,
**kwargs,
):
"""
Initialize the Sentence Transformer reranker node.
:param project_dir: The project directory
:param model_name: The name of the Sentence Transformer model to use for reranking
Default is "cross-encoder/ms-marco-MiniLM-L-2-v2"
:param kwargs: The CrossEncoder parameters
"""
super().__init__(project_dir, *args, **kwargs)
try:
import torch
from sentence_transformers import CrossEncoder
except ImportError:
raise ImportError(
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
model_params = pop_params(CrossEncoder.__init__, kwargs)
self.model = CrossEncoder(model_name, device=self.device, **model_params)
def __del__(self):
del self.model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
"""
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
:param previous_result: The previous result
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: pd DataFrame containing the reranked contents, ids, and scores
"""
queries, contents_list, scores_list, ids_list = self.cast_to_run(
previous_result
)
top_k = kwargs.get("top_k", 1)
batch = kwargs.get("batch", 64)
return self._pure(queries, contents_list, ids_list, top_k, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
list(map(lambda x: [query, x], content_list))
for query, content_list in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
sentence_transformer_run_model,
nested_list,
model=self.model,
batch_size=batch,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def sentence_transformer_run_model(input_texts, model, batch_size: int):
try:
import torch
except ImportError:
raise ImportError(
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
)
batch_input_texts = make_batch(input_texts, batch_size)
results = []
for batch_texts in batch_input_texts:
with torch.no_grad():
pred_scores = model.predict(sentences=batch_texts, apply_softmax=True)
results.extend(pred_scores.tolist())
return results
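# Illustrative sketch (assumes sentence-transformers is installed and downloading the
# model is acceptable; the query and passages are examples, not part of this commit):
# CrossEncoder.predict takes [query, passage] pairs and returns one relevance score
# per pair, which is exactly the shape sentence_transformer_run_model feeds it.
from sentence_transformers import CrossEncoder
example_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-2-v2")
example_pairs = [
["what is autorag", "AutoRAG automatically tunes RAG pipelines."],
["what is autorag", "Pasta is best cooked al dente."],
]
print(example_model.predict(sentences=example_pairs))  # the first pair should score higher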

View File

@@ -0,0 +1 @@
from .tart import Tart

View File

@@ -0,0 +1,152 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
from autorag.utils.util import empty_cuda_cache
class EncT5ForSequenceClassification(T5PreTrainedModel):
_keys_to_ignore_on_load_missing = [
r"encoder\.embed_tokens\.weight",
]
def __init__(self, config: T5Config, dropout=0.1):
super().__init__(config)
try:
from torch import nn
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
self.num_labels = config.num_labels
self.config = config
self.shared = nn.Embedding(config.vocab_size, config.d_model)
encoder_config = copy.deepcopy(config)
encoder_config.use_cache = False
encoder_config.is_encoder_decoder = False
self.encoder = T5Stack(encoder_config, self.shared)
self.dropout = nn.Dropout(dropout)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
# Model parallel
self.model_parallel = False
self.device_map = None
def parallelize(self, device_map=None):
try:
import torch
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
self.device_map = (
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
if device_map is None
else device_map
)
assert_device_map(self.device_map, len(self.encoder.block))
self.encoder.parallelize(self.device_map)
self.classifier = self.classifier.to(self.encoder.first_device)
self.model_parallel = True
def deparallelize(self):
self.encoder.deparallelize()
self.encoder = self.encoder.to("cpu")
self.model_parallel = False
self.device_map = None
empty_cuda_cache()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, new_embeddings):
self.shared = new_embeddings
self.encoder.set_input_embeddings(new_embeddings)
def get_encoder(self):
return self.encoder
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
try:
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
except ImportError:
raise ImportError("Please install PyTorch to use TART reranker.")
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
pooled_output = hidden_states[:, 0, :] # Take bos token (equiv. to <s>)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (
labels.dtype == torch.long or labels.dtype == torch.int
):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)

View File

@@ -0,0 +1,139 @@
from itertools import chain
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.nodes.passagereranker.tart.modeling_enc_t5 import (
EncT5ForSequenceClassification,
)
from autorag.nodes.passagereranker.tart.tokenization_enc_t5 import EncT5Tokenizer
from autorag.utils.util import (
make_batch,
sort_by_scores,
flatten_apply,
select_top_k,
result_to_dataframe,
empty_cuda_cache,
)
class Tart(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
try:
import torch
except ImportError:
raise ImportError(
"torch is not installed. Please install torch first to use TART reranker."
)
model_name = "facebook/tart-full-flan-t5-xl"
self.model = EncT5ForSequenceClassification.from_pretrained(model_name)
self.tokenizer = EncT5Tokenizer.from_pretrained(model_name)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.model = self.model.to(self.device)
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
instruction = kwargs.pop("instruction", "Find passage to answer given question")
batch = kwargs.pop("batch", 64)
return self._pure(queries, contents, ids, top_k, instruction, batch)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
instruction: str = "Find passage to answer given question",
batch: int = 64,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using Tart.
TART is an instruction-aware reranker (https://github.com/facebookresearch/tart).
You can steer the reranking with an instruction.
The default model is facebook/tart-full-flan-t5-xl.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param instruction: The instruction for reranking.
Note: the default instruction, taken from the TART paper, is "Find passage to answer given question".
Pass a different instruction through this parameter to change it.
:param batch: The number of queries to be processed in a batch
:return: tuple of lists containing the reranked contents, ids, and scores
"""
nested_list = [
[["{} [SEP] {}".format(instruction, query)] for _ in contents]
for query, contents in zip(queries, contents_list)
]
rerank_scores = flatten_apply(
tart_run_model,
nested_list,
model=self.model,
batch_size=batch,
tokenizer=self.tokenizer,
device=self.device,
contents_list=contents_list,
)
df = pd.DataFrame(
{
"contents": contents_list,
"ids": ids_list,
"scores": rerank_scores,
}
)
df[["contents", "ids", "scores"]] = df.apply(
sort_by_scores, axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
def tart_run_model(
input_texts, contents_list, model, batch_size: int, tokenizer, device
):
try:
import torch
import torch.nn.functional as F
except ImportError:
raise ImportError(
"torch is not installed. Please install torch first to use TART reranker."
)
flattened_texts = list(chain.from_iterable(input_texts))
flattened_contents = list(chain.from_iterable(contents_list))
batch_input_texts = make_batch(flattened_texts, batch_size)
batch_contents_list = make_batch(flattened_contents, batch_size)
results = []
for batch_texts, batch_contents in zip(batch_input_texts, batch_contents_list):
feature = tokenizer(
batch_texts,
batch_contents,
padding=True,
truncation=True,
return_tensors="pt",
).to(device)
with torch.no_grad():
pred_scores = model(**feature).logits
normalized_scores = [
float(score[1]) for score in F.softmax(pred_scores, dim=1)
]
results.extend(normalized_scores)
return results

View File

@@ -0,0 +1,112 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from typing import Any, Dict, List, Optional
from transformers import T5Tokenizer
class EncT5Tokenizer(T5Tokenizer):
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__(
vocab_file=vocab_file,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=sp_model_kwargs,
**kwargs,
)
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True,
)
# normal case: some special tokens
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
bos = [self.bos_token_id]
eos = [self.eos_token_id]
if token_ids_1 is None:
return len(bos + token_ids_0 + eos) * [0]
return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0]
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
else:
return (
[self.bos_token_id]
+ token_ids_0
+ [self.eos_token_id]
+ token_ids_1
+ [self.eos_token_id]
)
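# Illustrative sketch (not part of this commit): build_inputs_with_special_tokens
# produces <s> A </s> for a single sequence and <s> A </s> B </s> for a pair.
# The ids below are placeholders, not real vocabulary ids.
example_bos, example_eos = 0, 1
example_a = [11, 12, 13]
example_b = [21, 22]
print([example_bos] + example_a + [example_eos])  # [0, 11, 12, 13, 1]
print([example_bos] + example_a + [example_eos] + example_b + [example_eos])  # [0, 11, 12, 13, 1, 21, 22, 1]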

View File

@@ -0,0 +1,72 @@
import os
from datetime import datetime
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe, fetch_contents
class TimeReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir, *args, **kwargs)
self.corpus_df = pd.read_parquet(
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
)
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
_, contents, scores, ids = self.cast_to_run(previous_result)
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
times = [
[time["last_modified_datetime"] for time in time_list]
for time_list in metadatas
]
top_k = kwargs.pop("top_k")
return self._pure(contents, scores, ids, top_k, times)
def _pure(
self,
contents_list: List[List[str]],
scores_list: List[List[float]],
ids_list: List[List[str]],
top_k: int,
time_list: List[List[datetime]],
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank the passages based on merely the datetime of the passage.
It uses 'last_modified_datetime' key in the corpus metadata,
so the metadata in the corpus data file should be in the format of {'last_modified_datetime': datetime.datetime}.
:param contents_list: The list of lists of contents
:param scores_list: The list of lists of scores from the initial ranking
:param ids_list: The list of lists of ids
:param top_k: The number of passages to be retrieved after reranking
:param time_list: The metadata list of lists of datetime.datetime
It automatically extracts the 'last_modified_datetime' key from the metadata in the corpus data.
:return: The reranked contents, ids, and scores
"""
def sort_row(contents, scores, ids, time, top_k):
combined = list(zip(contents, scores, ids, time))
combined.sort(key=lambda x: x[3], reverse=True)
sorted_contents, sorted_scores, sorted_ids, _ = zip(*combined)
return (
list(sorted_contents)[:top_k],
list(sorted_scores)[:top_k],
list(sorted_ids)[:top_k],
)
reranked_contents, reranked_scores, reranked_ids = zip(
*map(
sort_row,
contents_list,
scores_list,
ids_list,
time_list,
[top_k] * len(contents_list),
)
)
return list(reranked_contents), list(reranked_ids), list(reranked_scores)
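# Illustrative sketch (not part of this commit): sort_row orders passages by their
# 'last_modified_datetime', newest first, then keeps the top_k entries. The data
# below is invented for demonstration.
example_contents = ["old passage", "new passage", "middle passage"]
example_scores = [0.9, 0.4, 0.7]
example_ids = ["a", "b", "c"]
example_times = [datetime(2021, 1, 1), datetime(2024, 6, 1), datetime(2023, 3, 1)]
example_sorted = sorted(zip(example_contents, example_scores, example_ids, example_times), key=lambda x: x[3], reverse=True)
print([c for c, _, _, _ in example_sorted[:2]])  # ['new passage', 'middle passage']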

View File

@@ -0,0 +1,160 @@
import logging
from typing import List, Tuple
import pandas as pd
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils import result_to_dataframe
from autorag.utils.util import select_top_k, sort_by_scores, empty_cuda_cache
logger = logging.getLogger("AutoRAG")
class Upr(BasePassageReranker):
def __init__(
self,
project_dir: str,
use_bf16: bool = False,
prefix_prompt: str = "Passage: ",
suffix_prompt: str = "Please write a question based on this passage.",
*args,
**kwargs,
):
"""
Initialize the UPR reranker node.
:param project_dir: The project directory
:param use_bf16: Whether to use bfloat16 for the model. Default is False.
:param prefix_prompt: The prefix prompt for the language model that generates question for reranking.
Default is "Passage: ".
The prefix prompt serves as the initial context or instruction for the language model.
It sets the stage for what is expected in the output
:param suffix_prompt: The suffix prompt for the language model that generates question for reranking.
Default is "Please write a question based on this passage.".
The suffix prompt provides a cue or a closing instruction to the language model,
signaling how to conclude the generated text or what format to follow at the end.
:param kwargs: Extra arguments
"""
super().__init__(project_dir, *args, **kwargs)
self.scorer = UPRScorer(
suffix_prompt=suffix_prompt, prefix_prompt=prefix_prompt, use_bf16=use_bf16
)
def __del__(self):
del self.scorer
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, _, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
return self._pure(queries, contents, ids, top_k)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents based on their relevance to a query using UPR.
UPR is an unsupervised passage reranker (https://github.com/DevSinghSachan/unsupervised-passage-reranking).
The language model will make a question based on the passage and rerank the passages by the likelihood of the question.
The default model is t5-large.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:return: tuple of lists containing the reranked contents, ids, and scores
"""
df = pd.DataFrame(
{
"query": queries,
"contents": contents_list,
"ids": ids_list,
}
)
df["scores"] = df.apply(
lambda row: self.scorer.compute(
query=row["query"], contents=row["contents"]
),
axis=1,
)
df[["contents", "ids", "scores"]] = df.apply(
lambda x: sort_by_scores(x, reverse=False), axis=1, result_type="expand"
)
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
return (
results["contents"].tolist(),
results["ids"].tolist(),
results["scores"].tolist(),
)
class UPRScorer:
def __init__(self, suffix_prompt: str, prefix_prompt: str, use_bf16: bool = False):
try:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
except ImportError:
raise ImportError(
"torch is not installed. Please install torch to use UPRReranker."
)
model_name = "t5-large"
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
self.model = T5ForConditionalGeneration.from_pretrained(
model_name, torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
).to(self.device)
self.suffix_prompt = suffix_prompt
self.prefix_prompt = prefix_prompt
def compute(self, query: str, contents: List[str]) -> List[float]:
try:
import torch
except ImportError:
raise ImportError(
"torch is not installed. Please install torch to use UPRReranker."
)
query_token = self.tokenizer(
query, max_length=128, truncation=True, return_tensors="pt"
)
prompts = list(
map(
lambda content: f"{self.prefix_prompt} {content} {self.suffix_prompt}",
contents,
)
)
prompt_token_outputs = self.tokenizer(
prompts,
padding="longest",
max_length=512,
pad_to_multiple_of=8,
truncation=True,
return_tensors="pt",
)
query_input_ids = torch.repeat_interleave(
query_token["input_ids"], len(contents), dim=0
).to(self.device)
with torch.no_grad():
logits = self.model(
input_ids=prompt_token_outputs["input_ids"].to(self.device),
attention_mask=prompt_token_outputs["attention_mask"].to(self.device),
labels=query_input_ids,
).logits
log_softmax = torch.nn.functional.log_softmax(logits, dim=-1)
nll = -log_softmax.gather(2, query_input_ids.unsqueeze(2)).squeeze(2)
avg_nll = torch.sum(nll, dim=1)
return avg_nll.tolist()
def __del__(self):
del self.model
del self.tokenizer
empty_cuda_cache()
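# Illustrative sketch (not part of this commit): UPRScorer.compute returns the total
# negative log-likelihood of the query tokens given each passage, so lower scores mean
# better passages (hence sort_by_scores(..., reverse=False) above). The tensors below
# are invented stand-ins for the model logits and the repeated query token ids.
import torch
example_logits = torch.randn(2, 4, 10)  # (num_passages, query_len, vocab_size)
example_query_ids = torch.randint(0, 10, (2, 4))
example_log_softmax = torch.nn.functional.log_softmax(example_logits, dim=-1)
example_nll = -example_log_softmax.gather(2, example_query_ids.unsqueeze(2)).squeeze(2)
print(torch.sum(example_nll, dim=1).tolist())  # one score per passage; lower is better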

View File

@@ -0,0 +1,109 @@
import os
from typing import List, Tuple
import pandas as pd
import voyageai
from autorag.nodes.passagereranker.base import BasePassageReranker
from autorag.utils.util import result_to_dataframe, get_event_loop, process_batch
class VoyageAIReranker(BasePassageReranker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir)
api_key = kwargs.pop("api_key", None)
api_key = os.getenv("VOYAGE_API_KEY", None) if api_key is None else api_key
if api_key is None:
raise KeyError(
"Please set the API key for VoyageAI rerank in the environment variable VOYAGE_API_KEY "
"or directly set it on the config YAML file."
)
self.voyage_client = voyageai.AsyncClient(api_key=api_key)
def __del__(self):
del self.voyage_client
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries, contents, scores, ids = self.cast_to_run(previous_result)
top_k = kwargs.pop("top_k")
batch = kwargs.pop("batch", 8)
model = kwargs.pop("model", "rerank-2")
truncation = kwargs.pop("truncation", True)
return self._pure(queries, contents, ids, top_k, model, batch, truncation)
def _pure(
self,
queries: List[str],
contents_list: List[List[str]],
ids_list: List[List[str]],
top_k: int,
model: str = "rerank-2",
batch: int = 8,
truncation: bool = True,
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
"""
Rerank a list of contents with VoyageAI rerank models.
You can get the API key from https://docs.voyageai.com/docs/api-key-and-installation and set it in the environment variable VOYAGE_API_KEY.
:param queries: The list of queries to use for reranking
:param contents_list: The list of lists of contents to rerank
:param ids_list: The list of lists of ids retrieved from the initial ranking
:param top_k: The number of passages to be retrieved
:param model: The model name for VoyageAI rerank.
You can choose between "rerank-2" and "rerank-2-lite".
Default is "rerank-2".
:param batch: The number of queries to be processed in a batch
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
tasks = [
voyageai_rerank_pure(
self.voyage_client, model, query, contents, ids, top_k, truncation
)
for query, contents, ids in zip(queries, contents_list, ids_list)
]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch))
content_result, id_result, score_result = zip(*results)
return list(content_result), list(id_result), list(score_result)
async def voyageai_rerank_pure(
voyage_client: voyageai.AsyncClient,
model: str,
query: str,
documents: List[str],
ids: List[str],
top_k: int,
truncation: bool = True,
) -> Tuple[List[str], List[str], List[float]]:
"""
Rerank a list of contents with VoyageAI rerank models.
:param voyage_client: The Voyage Client to use for reranking
:param model: The model name for VoyageAI rerank
:param query: The query to use for reranking
:param documents: The list of contents to rerank
:param ids: The list of ids corresponding to the documents
:param top_k: The number of passages to be retrieved
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
:return: Tuple of lists containing the reranked contents, ids, and scores
"""
rerank_results = await voyage_client.rerank(
model=model,
query=query,
documents=documents,
top_k=top_k,
truncation=truncation,
)
reranked_scores: List[float] = list(
map(lambda x: x.relevance_score, rerank_results.results)
)
indices = list(map(lambda x: x.index, rerank_results.results))
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
return reranked_contents, reranked_ids, reranked_scores

View File

@@ -0,0 +1,3 @@
from .long_context_reorder import LongContextReorder
from .window_replacement import WindowReplacement
from .fstring import Fstring

View File

@@ -0,0 +1,34 @@
import logging
from abc import ABCMeta
from pathlib import Path
from typing import Union
import pandas as pd
from autorag.schema.base import BaseModule
logger = logging.getLogger("AutoRAG")
class BasePromptMaker(BaseModule, metaclass=ABCMeta):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
logger.info(
f"Initialize prompt maker node - {self.__class__.__name__} module..."
)
def __del__(self):
logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running prompt maker node - {self.__class__.__name__} module...")
# get query and retrieved contents from previous_result
assert (
"query" in previous_result.columns
), "previous_result must have query column."
assert (
"retrieved_contents" in previous_result.columns
), "previous_result must have retrieved_contents column."
query = previous_result["query"].tolist()
retrieved_contents = previous_result["retrieved_contents"].tolist()
prompt = kwargs.pop("prompt")
return query, retrieved_contents, prompt

View File

@@ -0,0 +1,49 @@
from typing import List
import pandas as pd
from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe
class Fstring(BasePromptMaker):
@result_to_dataframe(["prompts"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
query, retrieved_contents, prompt = self.cast_to_run(
previous_result, *args, **kwargs
)
return self._pure(prompt, query, retrieved_contents)
def _pure(
self, prompt: str, queries: List[str], retrieved_contents: List[List[str]]
) -> List[str]:
"""
Make a prompt using f-string from a query and retrieved_contents.
You must specify a prompt or a list of prompts in the config YAML file like this:
.. Code:: yaml
nodes:
- node_type: prompt_maker
modules:
- module_type: fstring
prompt: [Answer this question: {query} \n\n {retrieved_contents},
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
:param prompt: A prompt string.
:param queries: List of query strings.
:param retrieved_contents: List of retrieved contents.
:return: Prompts that are made by f-string.
"""
def fstring_row(
_prompt: str, _query: str, _retrieved_contents: List[str]
) -> str:
contents_str = "\n\n".join(_retrieved_contents)
return _prompt.format(query=_query, retrieved_contents=contents_str)
return list(
map(
lambda x: fstring_row(prompt, x[0], x[1]),
zip(queries, retrieved_contents),
)
)
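# Illustrative sketch (not part of this commit): the f-string prompt maker fills the
# {query} and {retrieved_contents} placeholders, joining passages with blank lines.
# The prompt and passages below are examples only.
example_prompt = "Answer this question: {query} \n\n {retrieved_contents}"
example_query = "What is AutoRAG?"
example_contents = ["AutoRAG tunes RAG pipelines.", "It evaluates many modules automatically."]
print(example_prompt.format(query=example_query, retrieved_contents="\n\n".join(example_contents)))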

View File

@@ -0,0 +1,83 @@
import logging
from typing import List
import numpy as np
import pandas as pd
from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe
logger = logging.getLogger("AutoRAG")
class LongContextReorder(BasePromptMaker):
@result_to_dataframe(["prompts"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
query, retrieved_contents, prompt = self.cast_to_run(
previous_result, *args, **kwargs
)
assert (
"retrieve_scores" in previous_result.columns
), "previous_result must have retrieve_scores column."
retrieve_scores = previous_result["retrieve_scores"].tolist()
return self._pure(prompt, query, retrieved_contents, retrieve_scores)
def _pure(
self,
prompt: str,
queries: List[str],
retrieved_contents: List[List[str]],
retrieve_scores: List[List[float]],
) -> List[str]:
"""
Models struggle to access significant details found
in the center of extended contexts. A study
(https://arxiv.org/abs/2307.03172) observed that the best
performance typically arises when crucial data is positioned
at the start or conclusion of the input context. Additionally,
as the input context lengthens, performance drops notably, even
in models designed for long contexts.
.. Code:: yaml
nodes:
- node_type: prompt_maker
modules:
- module_type: long_context_reorder
prompt: [Answer this question: {query} \n\n {retrieved_contents},
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
:param prompt: A prompt string.
:param queries: List of query strings.
:param retrieved_contents: List of retrieved contents.
:param retrieve_scores: List of `retrieve scores`.
:return: Prompts that are made by long context reorder.
"""
def long_context_reorder_row(
_prompt: str,
_query: str,
_retrieved_contents: List[str],
_retrieve_scores: List[float],
) -> str:
if isinstance(_retrieved_contents, np.ndarray):
_retrieved_contents = _retrieved_contents.tolist()
if not len(_retrieved_contents) == len(_retrieve_scores):
logger.info("If you use a summarizer, the reorder will not proceed.")
return _prompt.format(
query=_query, retrieved_contents="\n\n".join(_retrieved_contents)
)
content_scores = list(zip(_retrieved_contents, _retrieve_scores))
sorted_content_scores = sorted(
content_scores, key=lambda x: x[1], reverse=True
)
content_result, score_result = zip(*sorted_content_scores)
_retrieved_contents.append(content_result[0])
contents_str = "\n\n".join(_retrieved_contents)
return _prompt.format(query=_query, retrieved_contents=contents_str)
return list(
map(
lambda x: long_context_reorder_row(prompt, x[0], x[1], x[2]),
zip(queries, retrieved_contents, retrieve_scores),
)
)
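A small illustration of the per-row behavior above: the passages keep their original order and the highest-scoring passage is appended once more at the end of the context (values are made up):

# Illustrative only: mirrors long_context_reorder_row for one row.
contents = ["passage A", "passage B", "passage C"]
scores = [0.2, 0.9, 0.5]

best_passage = max(zip(contents, scores), key=lambda pair: pair[1])[0]  # "passage B"
reordered = contents + [best_passage]  # the top-scored passage is repeated at the end
print("\n\n".join(reordered))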

View File

@@ -0,0 +1,280 @@
import os
import pathlib
from copy import deepcopy
from typing import List, Dict, Optional, Union
import pandas as pd
import tokenlog
from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils import validate_qa_dataset
from autorag.utils.util import make_combinations, explode, split_dataframe
def run_prompt_maker_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run prompt maker node.
With this function, you can select the best prompt maker module.
By default, when only one module is given, the evaluation is skipped.
If you want to select the best prompt among multiple modules, you can use strategies.
When you use them, you must pass 'generator_modules' and their parameters in strategies,
because this node is evaluated with generator modules and generation metrics.
It is recommended to use a single generator module and parameter set for evaluation,
but multiple modules and parameter sets are also supported.
When you don't set a generator module in strategies, the default generator module is used.
The default generator module is llama_index_llm with openai gpt-3.5-turbo model.
:param modules: Prompt maker module classes to run.
:param module_params: Prompt maker module parameters.
:param previous_result: Previous result dataframe.
Could be query expansion's best result or qa data.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for prompt maker node.
:return: The best result dataframe.
It contains previous result columns and prompt maker's result columns which is 'prompts'.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
node_dir = os.path.join(node_line_dir, "prompt_maker")
if not os.path.exists(node_dir):
os.makedirs(node_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
# run modules
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# get average token usage
token_usages = []
for i, result in enumerate(results):
token_logger = tokenlog.getLogger(
f"prompt_maker_{i}", strategies.get("tokenizer", "gpt2")
)
token_logger.query_batch(result["prompts"].tolist())
token_usages.append(token_logger.get_token_usage() / len(result))
# save results to folder
filepaths = list(
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
# make summary file
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
"average_prompt_token": token_usages,
}
)
metric_names, metric_params = cast_metrics(strategies.get("metrics"))
# Run evaluation when there is more than one module.
if len(modules) > 1:
# pop general keys from strategies (e.g. metrics, speed_threshold)
general_key = ["metrics", "speed_threshold", "token_threshold", "tokenizer"]
general_strategy = dict(
filter(lambda x: x[0] in general_key, strategies.items())
)
extra_strategy = dict(
filter(lambda x: x[0] not in general_key, strategies.items())
)
# first, filter by threshold if it is enabled.
if general_strategy.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, general_strategy["speed_threshold"], filenames
)
# Calculate tokens and save to summary
if general_strategy.get("token_threshold") is not None:
results, filenames = filter_by_threshold(
results, token_usages, general_strategy["token_threshold"], filenames
)
# run metrics before filtering
if metric_names is None or len(metric_names) <= 0:
raise ValueError(
"You must at least one metrics for prompt maker evaluation."
)
# get generator modules from strategy
generator_callables, generator_params = make_generator_callable_params(
extra_strategy
)
# get generation_gt
qa_data = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
validate_qa_dataset(qa_data)
generation_gt = qa_data["generation_gt"].tolist()
generation_gt = list(map(lambda x: x.tolist(), generation_gt))
metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]
all_prompts = []
for result in results:
all_prompts.extend(result["prompts"].tolist())
evaluation_result_all = evaluate_one_prompt_maker_node(
all_prompts,
generator_callables,
generator_params,
metric_inputs * len(results),
general_strategy["metrics"],
project_dir,
strategy_name=strategies.get("strategy", "mean"),
)
evaluation_results = split_dataframe(
evaluation_result_all, chunk_size=len(results[0])
)
evaluation_df = pd.DataFrame(
{
"filename": filenames,
**{
f"prompt_maker_{metric_name}": list(
map(lambda x: x[metric_name].mean(), evaluation_results)
)
for metric_name in metric_names
},
}
)
summary_df = pd.merge(
on="filename", left=summary_df, right=evaluation_df, how="left"
)
best_result, best_filename = select_best(
evaluation_results,
metric_names,
filenames,
strategies.get("strategy", "mean"),
)
# change metric name columns to prompt_maker_metric_name
best_result = best_result.rename(
columns={
metric_name: f"prompt_maker_{metric_name}"
for metric_name in metric_names
}
)
best_result = best_result.drop(columns=["generated_texts"])
else:
best_result, best_filename = results[0], filenames[0]
# add 'is_best' column at summary file
summary_df["is_best"] = summary_df["filename"] == best_filename
best_result = pd.concat([previous_result, best_result], axis=1)
# save files
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"),
index=False,
)
return best_result
def make_generator_callable_params(strategy_dict: Dict):
node_dict = deepcopy(strategy_dict)
generator_module_list: Optional[List[Dict]] = node_dict.pop(
"generator_modules", None
)
if generator_module_list is None:
generator_module_list = [
{
"module_type": "llama_index_llm",
"llm": "openai",
"model": "gpt-3.5-turbo",
}
]
node_params = node_dict
modules = list(
map(
lambda module_dict: get_support_modules(module_dict.pop("module_type")),
generator_module_list,
)
)
param_combinations = list(
map(
lambda module_dict: make_combinations({**module_dict, **node_params}),
generator_module_list,
)
)
return explode(modules, param_combinations)
def evaluate_one_prompt_maker_node(
prompts: List[str],
generator_classes: List,
generator_params: List[Dict],
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
project_dir,
strategy_name: str,
) -> pd.DataFrame:
input_df = pd.DataFrame({"prompts": prompts})
generator_results = list(
map(
lambda x: x[0].run_evaluator(
project_dir=project_dir, previous_result=input_df, **x[1]
),
zip(generator_classes, generator_params),
)
)
evaluation_results = list(
map(
lambda x: evaluate_generator_result(x[0], metric_inputs, metrics),
zip(generator_results, generator_classes),
)
)
metric_names = (
list(map(lambda x: x["metric_name"], metrics))
if isinstance(metrics[0], dict)
else metrics
)
best_result, _ = select_best(
evaluation_results, metric_names, strategy_name=strategy_name
)
best_result = pd.concat([input_df, best_result], axis=1)
return best_result # it has 'generated_texts' column
def evaluate_generator_result(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
@evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
def evaluate(df):
return df["generated_texts"].tolist()
return evaluate(result_df)
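A hedged sketch of the strategies dict this runner consumes, based on the keys read in the code above; the metric names and threshold values are illustrative assumptions, while the generator_modules entry mirrors the default in make_generator_callable_params above:

# Illustrative strategies dict for run_prompt_maker_node (metric names and thresholds are assumptions).
strategies = {
    "metrics": ["bleu", "meteor", "rouge"],  # generation metrics used to score prompt candidates
    "speed_threshold": 10,                   # optional: drop modules slower than this per row
    "token_threshold": 4000,                 # optional: drop modules whose average prompt is longer
    "tokenizer": "gpt2",                     # tokenizer used by tokenlog for prompt token counting
    "generator_modules": [                   # generator that evaluates each prompt candidate
        {"module_type": "llama_index_llm", "llm": "openai", "model": "gpt-3.5-turbo"},
    ],
}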

View File

@@ -0,0 +1,85 @@
import logging
import os
from typing import List, Dict
import pandas as pd
from autorag.nodes.promptmaker.base import BasePromptMaker
from autorag.utils import result_to_dataframe, fetch_contents
logger = logging.getLogger("AutoRAG")
class WindowReplacement(BasePromptMaker):
def __init__(self, project_dir: str, *args, **kwargs):
super().__init__(project_dir, *args, **kwargs)
# load corpus
data_dir = os.path.join(project_dir, "data")
self.corpus_data = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
@result_to_dataframe(["prompts"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
query, retrieved_contents, prompt = self.cast_to_run(
previous_result, *args, **kwargs
)
retrieved_ids = previous_result["retrieved_ids"].tolist()
# get metadata from corpus
retrieved_metadata = fetch_contents(
self.corpus_data, retrieved_ids, column_name="metadata"
)
return self._pure(prompt, query, retrieved_contents, retrieved_metadata)
def _pure(
self,
prompt: str,
queries: List[str],
retrieved_contents: List[List[str]],
retrieved_metadata: List[List[Dict]],
) -> List[str]:
"""
Replace retrieved_contents with a window to create a Prompt
(only available for corpus chunked with Sentence window method)
You must write a prompt or a prompt list in the config YAML file like this:
.. Code:: yaml
nodes:
- node_type: prompt_maker
modules:
- module_type: window_replacement
prompt: [Answer this question: {query} \n\n {retrieved_contents},
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
:param prompt: A prompt string.
:param queries: List of query strings.
:param retrieved_contents: List of retrieved contents.
:param retrieved_metadata: List of retrieved metadata.
:return: Prompts that are made by window_replacement.
"""
def window_replacement_row(
_prompt: str,
_query: str,
_retrieved_contents,
_retrieved_metadata: List[Dict],
) -> str:
window_list = []
for content, metadata in zip(_retrieved_contents, _retrieved_metadata):
if "window" in metadata:
window_list.append(metadata["window"])
else:
window_list.append(content)
logger.info(
"Only available for corpus chunked with Sentence window method."
"window_replacement will not proceed."
)
contents_str = "\n\n".join(window_list)
return _prompt.format(query=_query, retrieved_contents=contents_str)
return list(
map(
lambda x: window_replacement_row(prompt, x[0], x[1], x[2]),
zip(queries, retrieved_contents, retrieved_metadata),
)
)
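A minimal sketch of the per-row replacement above, assuming the corpus was chunked with the sentence-window method so each passage's metadata may carry a "window" key (values are made up):

# Illustrative only: mirrors window_replacement_row for one row.
retrieved_contents = ["short sentence A", "short sentence B"]
retrieved_metadata = [
    {"window": "short sentence A together with its surrounding sentences"},
    {},  # no window metadata -> the original content is kept
]

window_list = [
    metadata.get("window", content)
    for content, metadata in zip(retrieved_contents, retrieved_metadata)
]
print("\n\n".join(window_list))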

View File

@@ -0,0 +1,4 @@
from .hyde import HyDE
from .multi_query_expansion import MultiQueryExpansion
from .pass_query_expansion import PassQueryExpansion
from .query_decompose import QueryDecompose

View File

@@ -0,0 +1,62 @@
import abc
import logging
from pathlib import Path
from typing import List, Union
import pandas as pd
from autorag.nodes.util import make_generator_callable_param
from autorag.schema import BaseModule
from autorag.utils import validate_qa_dataset
logger = logging.getLogger("AutoRAG")
class BaseQueryExpansion(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
logger.info(
f"Initialize query expansion node - {self.__class__.__name__} module..."
)
# set generator module for query expansion
generator_class, generator_param = make_generator_callable_param(kwargs)
self.generator = generator_class(project_dir, **generator_param)
def __del__(self):
del self.generator
logger.info(
f"Delete query expansion node - {self.__class__.__name__} module..."
)
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(
f"Running query expansion node - {self.__class__.__name__} module..."
)
validate_qa_dataset(previous_result)
# find queries columns
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
return queries
@staticmethod
def _check_expanded_query(queries: List[str], expanded_queries: List[List[str]]):
return list(
map(
lambda query, expanded_query_list: check_expanded_query(
query, expanded_query_list
),
queries,
expanded_queries,
)
)
def check_expanded_query(query: str, expanded_query_list: List[str]):
# replace empty expanded queries with the original query
expanded_query_list = list(map(lambda x: x.strip(), expanded_query_list))
return [
expanded_query if expanded_query else query
for expanded_query in expanded_query_list
]
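A quick illustration of check_expanded_query defined above, run alongside this module: an expanded query that is empty after stripping falls back to the original query.

# Illustrative only: an empty expansion falls back to the original query.
print(check_expanded_query("what is rag?", ["  ", "what is retrieval augmented generation?"]))
# ['what is rag?', 'what is retrieval augmented generation?']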

View File

@@ -0,0 +1,43 @@
from typing import List
import pandas as pd
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
from autorag.utils import result_to_dataframe
hyde_prompt = "Please write a passage to answer the question"
class HyDE(BaseQueryExpansion):
@result_to_dataframe(["queries"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result, *args, **kwargs)
# pop prompt from kwargs
prompt = kwargs.pop("prompt", hyde_prompt)
kwargs.pop("generator_module_type", None)
expanded_queries = self._pure(queries, prompt, **kwargs)
return self._check_expanded_query(queries, expanded_queries)
def _pure(self, queries: List[str], prompt: str = hyde_prompt, **generator_params):
"""
HyDE is inspired by "Precise Zero-shot Dense Retrieval without Relevance Labels" (https://arxiv.org/pdf/2212.10496.pdf).
The LLM creates a hypothetical passage for each query,
and then passages are retrieved using the hypothetical passage as the query.
:param queries: List[str], queries to retrieve.
:param prompt: Prompt to use when generating hypothetical passage
:return: List[List[str]], List of hyde results.
"""
full_prompts = list(
map(
lambda x: (prompt if bool(prompt) else hyde_prompt)
+ f"\nQuestion: {x}\nPassage:",
queries,
)
)
input_df = pd.DataFrame({"prompts": full_prompts})
result_df = self.generator.pure(previous_result=input_df, **generator_params)
answers = result_df["generated_texts"].tolist()
results = list(map(lambda x: [x], answers))
return results
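For reference, this is the full prompt HyDE sends to the generator for one query when the default hyde_prompt is used (the query is made up):

# Illustrative only: the prompt HyDE builds for a single query.
query = "What is dense retrieval?"
full_prompt = hyde_prompt + f"\nQuestion: {query}\nPassage:"
print(full_prompt)
# Please write a passage to answer the question
# Question: What is dense retrieval?
# Passage: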

View File

@@ -0,0 +1,57 @@
from typing import List
import pandas as pd
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
from autorag.utils import result_to_dataframe
multi_query_expansion_prompt = """You are an AI language model assistant.
Your task is to generate 3 different versions of the given user
question to retrieve relevant documents from a vector database.
By generating multiple perspectives on the user question,
your goal is to help the user overcome some of the limitations
of distance-based similarity search. Provide these alternative
questions separated by newlines. Original question: {query}"""
class MultiQueryExpansion(BaseQueryExpansion):
@result_to_dataframe(["queries"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result, *args, **kwargs)
# pop prompt from kwargs
prompt = kwargs.pop("prompt", multi_query_expansion_prompt)
kwargs.pop("generator_module_type", None)
expanded_queries = self._pure(queries, prompt, **kwargs)
return self._check_expanded_query(queries, expanded_queries)
def _pure(
self, queries, prompt: str = multi_query_expansion_prompt, **kwargs
) -> List[List[str]]:
"""
Expand a list of queries using a multi-query expansion approach.
The LLM generates 3 different versions of each input query.
:param queries: List[str], queries to decompose.
:param prompt: str, prompt to use for multi-query expansion.
default prompt comes from langchain MultiQueryRetriever default query prompt.
:return: List[List[str]], list of expansion query.
"""
full_prompts = list(map(lambda x: prompt.format(query=x), queries))
input_df = pd.DataFrame({"prompts": full_prompts})
result_df = self.generator.pure(previous_result=input_df, **kwargs)
answers = result_df["generated_texts"].tolist()
results = list(
map(lambda x: get_multi_query_expansion(x[0], x[1]), zip(queries, answers))
)
return results
def get_multi_query_expansion(query: str, answer: str) -> List[str]:
try:
queries = answer.split("\n")
queries.insert(0, query)
return queries
except Exception:
return [query]
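A quick illustration of get_multi_query_expansion defined above: the LLM answer is split on newlines and the original query is kept as the first element (strings are made up):

# Illustrative only: parsing the generator answer into expanded queries.
answer = "What does RAG stand for?\nHow does retrieval augmented generation work?"
print(get_multi_query_expansion("what is rag?", answer))
# ['what is rag?', 'What does RAG stand for?', 'How does retrieval augmented generation work?']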

View File

@@ -0,0 +1,22 @@
import pandas as pd
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
from autorag.utils import result_to_dataframe
class PassQueryExpansion(BaseQueryExpansion):
@result_to_dataframe(["queries"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
"""
Do not perform query expansion.
Return with the same queries.
The dimension will be 2-d list, and the column name will be 'queries'.
"""
assert (
"query" in previous_result.columns
), "previous_result must have query column."
queries = previous_result["query"].tolist()
return list(map(lambda x: [x], queries))
def _pure(self, *args, **kwargs):
pass

View File

@@ -0,0 +1,111 @@
from typing import List
import pandas as pd
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
from autorag.utils import result_to_dataframe
decompose_prompt = """Decompose a question in self-contained sub-questions. Use \"The question needs no decomposition\" when no decomposition is needed.
Example 1:
Question: Is Hamlet more common on IMDB than Comedy of Errors?
Decompositions:
1: How many listings of Hamlet are there on IMDB?
2: How many listing of Comedy of Errors is there on IMDB?
Example 2:
Question: Are birds important to badminton?
Decompositions:
The question needs no decomposition
Example 3:
Question: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?
Decompositions:
1: What is the minimum driving age in the US?
2: What is the minimum age for someone to be employed in the US?
Example 4:
Question: Are all cucumbers the same texture?
Decompositions:
The question needs no decomposition
Example 5:
Question: Hydrogen's atomic number squared exceeds number of Spice Girls?
Decompositions:
1: What is the atomic number of hydrogen?
2: How many Spice Girls are there?
Example 6:
Question: {question}
Decompositions:
"""
class QueryDecompose(BaseQueryExpansion):
@result_to_dataframe(["queries"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result, *args, **kwargs)
# pop prompt from kwargs
prompt = kwargs.pop("prompt", decompose_prompt)
kwargs.pop("generator_module_type", None)
expanded_queries = self._pure(queries, prompt, **kwargs)
return self._check_expanded_query(queries, expanded_queries)
def _pure(
self, queries: List[str], prompt: str = decompose_prompt, *args, **kwargs
) -> List[List[str]]:
"""
Decompose a query into smaller self-contained sub-questions.
:param queries: List[str], queries to decompose.
:param prompt: str, prompt to use for query decomposition.
default prompt comes from Visconde's StrategyQA few-shot prompt.
:return: List[List[str]], list of decomposed query. Return input query if query is not decomposable.
"""
full_prompts = []
for query in queries:
if bool(prompt):
full_prompt = f"prompt: {prompt}\n\n question: {query}"
else:
full_prompt = decompose_prompt.format(question=query)
full_prompts.append(full_prompt)
input_df = pd.DataFrame({"prompts": full_prompts})
result_df = self.generator.pure(previous_result=input_df, *args, **kwargs)
answers = result_df["generated_texts"].tolist()
results = list(
map(lambda x: get_query_decompose(x[0], x[1]), zip(queries, answers))
)
return results
def get_query_decompose(query: str, answer: str) -> List[str]:
"""
Decompose a query into smaller self-contained sub-questions.
:param query: str, query to decompose.
:param answer: str, answer from query_decompose function.
:return: List[str], list of a decomposed query. Return input query if query is not decomposable.
"""
if answer.lower() == "the question needs no decomposition":
return [query]
try:
lines = [line.strip() for line in answer.splitlines() if line.strip()]
if lines[0].startswith("Decompositions:"):
lines.pop(0)
questions = [line.split(":", 1)[1].strip() for line in lines if ":" in line]
if not questions:
return [query]
return questions
except Exception:
return [query]
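A quick illustration of get_query_decompose defined above with made-up answers:

# Illustrative only: parsing decomposition answers.
answer = "1: What is the minimum driving age in the US?\n2: What is the minimum age for employment in the US?"
print(get_query_decompose("Is it legal for a licensed child driving Mercedes-Benz to be employed in US?", answer))
# ['What is the minimum driving age in the US?', 'What is the minimum age for employment in the US?']

print(get_query_decompose("Are birds important to badminton?", "The question needs no decomposition"))
# ['Are birds important to badminton?']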

View File

@@ -0,0 +1,276 @@
import logging
import os
import pathlib
from copy import deepcopy
from typing import List, Dict, Optional
import pandas as pd
from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils.util import make_combinations, explode
logger = logging.getLogger("AutoRAG")
def run_query_expansion_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among query expansion node results.
Initially, retrieval is run using expanded_queries, the result of the query_expansion module.
The retrieval module is run as a combination of the retrieval_modules in strategies.
If there are multiple retrieval_modules, run them all and choose the best result.
If no retrieval_modules are given, bm25 is used as the default.
In this way, the best retrieval result is selected for each query expansion module, and then the best module is selected among them.
:param modules: Query expansion modules to run.
:param module_params: Query expansion module parameters.
:param previous_result: Previous result dataframe.
In this case, it would be qa data.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for query expansion node.
:return: The best result dataframe.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
node_dir = os.path.join(node_line_dir, "query_expansion")
if not os.path.exists(node_dir):
os.makedirs(node_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
# run query expansion
results, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# save results to folder
pseudo_module_params = deepcopy(module_params)
for i, module_param in enumerate(pseudo_module_params):
if "prompt" in module_params:
module_param["prompt"] = str(i)
filepaths = list(
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
# make summary file
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
}
)
# Run evaluation when there is more than one module.
if len(modules) > 1:
# pop general keys from strategies (e.g. metrics, speed_threshold)
general_key = ["metrics", "speed_threshold", "strategy"]
general_strategy = dict(
filter(lambda x: x[0] in general_key, strategies.items())
)
extra_strategy = dict(
filter(lambda x: x[0] not in general_key, strategies.items())
)
# first, filter by threshold if it is enabled.
if general_strategy.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, general_strategy["speed_threshold"], filenames
)
# check metrics in strategy
if general_strategy.get("metrics") is None:
raise ValueError(
"You must at least one metrics for query expansion evaluation."
)
if extra_strategy.get("top_k") is None:
extra_strategy["top_k"] = 10 # default value
# get retrieval modules from strategy
retrieval_callables, retrieval_params = make_retrieval_callable_params(
extra_strategy
)
# get retrieval_gt
retrieval_gt = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)["retrieval_gt"].tolist()
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt,
previous_result["query"].tolist(),
previous_result["generation_gt"].tolist(),
)
]
# run evaluation
evaluation_results = list(
map(
lambda result: evaluate_one_query_expansion_node(
retrieval_callables,
retrieval_params,
[
setattr(metric_input, "queries", queries) or metric_input
for metric_input, queries in zip(
metric_inputs, result["queries"].to_list()
)
],
general_strategy["metrics"],
project_dir,
previous_result,
general_strategy.get("strategy", "mean"),
),
results,
)
)
evaluation_df = pd.DataFrame(
{
"filename": filenames,
**{
f"query_expansion_{metric_name}": list(
map(lambda x: x[metric_name].mean(), evaluation_results)
)
for metric_name in general_strategy["metrics"]
},
}
)
summary_df = pd.merge(
on="filename", left=summary_df, right=evaluation_df, how="left"
)
best_result, best_filename = select_best(
evaluation_results,
general_strategy["metrics"],
filenames,
strategies.get("strategy", "mean"),
)
# change metric name columns to query_expansion_metric_name
best_result = best_result.rename(
columns={
metric_name: f"query_expansion_{metric_name}"
for metric_name in strategies["metrics"]
}
)
best_result = best_result.drop(
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
else:
best_result, best_filename = results[0], filenames[0]
best_result = pd.concat([previous_result, best_result], axis=1)
# add 'is_best' column at summary file
summary_df["is_best"] = summary_df["filename"] == best_filename
# save files
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"),
index=False,
)
return best_result
def evaluate_one_query_expansion_node(
retrieval_funcs: List,
retrieval_params: List[Dict],
metric_inputs: List[MetricInput],
metrics: List[str],
project_dir,
previous_result: pd.DataFrame,
strategy_name: str,
) -> pd.DataFrame:
previous_result["queries"] = [
metric_input.queries for metric_input in metric_inputs
]
retrieval_results = list(
map(
lambda x: x[0].run_evaluator(
project_dir=project_dir, previous_result=previous_result, **x[1]
),
zip(retrieval_funcs, retrieval_params),
)
)
evaluation_results = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
metrics,
),
retrieval_results,
)
)
best_result, _ = select_best(
evaluation_results, metrics, strategy_name=strategy_name
)
best_result = pd.concat([previous_result, best_result], axis=1)
return best_result
def make_retrieval_callable_params(strategy_dict: Dict):
"""
strategy_dict looks like this:
.. Code:: json
{
"metrics": ["retrieval_f1", "retrieval_recall"],
"top_k": 50,
"retrieval_modules": [
{"module_type": "bm25"},
{"module_type": "vectordb", "embedding_model": ["openai", "huggingface"]}
]
}
"""
node_dict = deepcopy(strategy_dict)
retrieval_module_list: Optional[List[Dict]] = node_dict.pop(
"retrieval_modules", None
)
if retrieval_module_list is None:
retrieval_module_list = [
{
"module_type": "bm25",
}
]
node_params = node_dict
modules = list(
map(
lambda module_dict: get_support_modules(module_dict.pop("module_type")),
retrieval_module_list,
)
)
param_combinations = list(
map(
lambda module_dict: make_combinations({**module_dict, **node_params}),
retrieval_module_list,
)
)
return explode(modules, param_combinations)
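A hedged usage sketch of make_retrieval_callable_params defined above, using the strategy dict from its docstring; the expansion of list-valued parameters into separate parameter sets is the assumed behavior of make_combinations, which is not shown here:

# Illustrative only: assumed expansion behavior of make_combinations.
strategy = {
    "metrics": ["retrieval_f1", "retrieval_recall"],
    "top_k": 50,
    "retrieval_modules": [
        {"module_type": "bm25"},
        {"module_type": "vectordb", "embedding_model": ["openai", "huggingface"]},
    ],
}
modules, param_combinations = make_retrieval_callable_params(strategy)
# modules: one callable per retrieval module (bm25, vectordb)
# param_combinations[1] is expected to contain one parameter set per embedding_model,
# each inheriting the shared "top_k": 50.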

View File

@@ -0,0 +1,4 @@
from .bm25 import BM25
from .hybrid_cc import HybridCC
from .hybrid_rrf import HybridRRF
from .vectordb import VectorDB

View File

@@ -0,0 +1,127 @@
import abc
import logging
import os
from typing import List, Union, Tuple
import pandas as pd
from autorag.schema import BaseModule
from autorag.support import get_support_modules
from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset
from autorag.utils.util import pop_params
logger = logging.getLogger("AutoRAG")
class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, *args, **kwargs):
logger.info(f"Initialize retrieval node - {self.__class__.__name__}")
self.resources_dir = os.path.join(project_dir, "resources")
data_dir = os.path.join(project_dir, "data")
# fetch data from corpus_data
self.corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
def __del__(self):
logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running retrieval node - {self.__class__.__name__} module...")
validate_qa_dataset(previous_result)
# find queries columns & type cast queries
assert (
"query" in previous_result.columns
), "previous_result must have query column."
if "queries" not in previous_result.columns:
previous_result["queries"] = previous_result["query"]
previous_result.loc[:, "queries"] = previous_result["queries"].apply(
cast_queries
)
queries = previous_result["queries"].tolist()
return queries
class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
def __init__(
self, project_dir: str, target_modules, target_module_params, *args, **kwargs
):
super().__init__(project_dir)
self.target_modules = list(
map(
lambda x, y: get_support_modules(x)(
**y,
project_dir=project_dir,
),
target_modules,
target_module_params,
)
)
self.target_module_params = target_module_params
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
result_dfs: List[pd.DataFrame] = list(
map(
lambda x, y: x.pure(
**y,
previous_result=previous_result,
),
self.target_modules,
self.target_module_params,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs)
)
scores = tuple(
map(
lambda df: df["retrieve_scores"].apply(list).tolist(),
result_dfs,
)
)
_pure_params = pop_params(self._pure, kwargs)
if "ids" in _pure_params or "scores" in _pure_params:
raise ValueError(
"With specifying ids or scores, you must use HybridRRF.run_evaluator instead."
)
ids, scores = self._pure(ids=ids, scores=scores, **_pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def cast_queries(queries: Union[str, List[str]]) -> List[str]:
if isinstance(queries, str):
return [queries]
elif isinstance(queries, List):
return queries
else:
raise ValueError(f"queries must be str or list, but got {type(queries)}")
def evenly_distribute_passages(
ids: List[List[str]], scores: List[List[float]], top_k: int
) -> Tuple[List[str], List[float]]:
assert len(ids) == len(scores), "ids and scores must have same length."
query_cnt = len(ids)
avg_len = top_k // query_cnt
remainder = top_k % query_cnt
new_ids = []
new_scores = []
for i in range(query_cnt):
if i < remainder:
new_ids.extend(ids[i][: avg_len + 1])
new_scores.extend(scores[i][: avg_len + 1])
else:
new_ids.extend(ids[i][:avg_len])
new_scores.extend(scores[i][:avg_len])
return new_ids, new_scores
def get_bm25_pkl_name(bm25_tokenizer: str):
bm25_tokenizer = bm25_tokenizer.replace("/", "")
return f"bm25_{bm25_tokenizer}.pkl"

View File

@@ -0,0 +1,365 @@
import asyncio
import os
import pickle
import re
from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional
import numpy as np
import pandas as pd
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords
from nltk import PorterStemmer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from autorag.nodes.retrieval.base import (
evenly_distribute_passages,
BaseRetrieval,
get_bm25_pkl_name,
)
from autorag.utils import validate_corpus_dataset, fetch_contents
from autorag.utils.util import (
get_event_loop,
normalize_string,
result_to_dataframe,
pop_params,
)
def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]:
try:
from kiwipiepy import Kiwi, Token
except ImportError:
raise ImportError(
"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
"Please install kiwipiepy by running 'pip install kiwipiepy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
texts = list(map(lambda x: x.strip().lower(), texts))
kiwi = Kiwi()
tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts)
return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list]
def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Kkma
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Kkma()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_ko_okt(texts: List[str]) -> List[List[str]]:
try:
from konlpy.tag import Okt
except ImportError:
raise ImportError(
"You need to install konlpy to use 'ko_kkma' tokenizer. "
"Please install konlpy by running 'pip install konlpy'. "
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
)
tokenizer = Okt()
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
return tokenized_list
def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]:
def tokenize_remove_stopword(text: str, stemmer) -> List[str]:
text = text.lower()
words = list(simple_extract_keywords(text))
return [stemmer.stem(word) for word in words]
stemmer = PorterStemmer()
tokenized_list: List[List[str]] = list(
map(lambda x: tokenize_remove_stopword(x, stemmer), texts)
)
return tokenized_list
def tokenize_space(texts: List[str]) -> List[List[str]]:
def tokenize_space_text(text: str) -> List[str]:
text = normalize_string(text)
return re.split(r"\s+", text.strip())
return list(map(tokenize_space_text, texts))
def load_bm25_corpus(bm25_path: str) -> Dict:
if bm25_path is None:
return {}
with open(bm25_path, "rb") as f:
bm25_corpus = pickle.load(f)
return bm25_corpus
def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]:
try:
from sudachipy import dictionary, tokenizer
except ImportError:
raise ImportError(
"You need to install SudachiPy to use 'sudachipy' tokenizer. "
"Please install SudachiPy by running 'pip install sudachipy'."
)
# Initialize SudachiPy with the default tokenizer
tokenizer_obj = dictionary.Dictionary(dict="core").create()
# Choose the tokenizer mode: NORMAL, SEARCH, A
mode = tokenizer.Tokenizer.SplitMode.A
# Tokenize the input texts
tokenized_list = []
for text in texts:
tokens = tokenizer_obj.tokenize(text, mode)
tokenized_list.append([token.surface() for token in tokens])
return tokenized_list
BM25_TOKENIZER = {
"porter_stemmer": tokenize_porter_stemmer,
"ko_kiwi": tokenize_ko_kiwi,
"space": tokenize_space,
"ko_kkma": tokenize_ko_kkma,
"ko_okt": tokenize_ko_okt,
"sudachipy": tokenize_ja_sudachipy,
}
class BM25(BaseRetrieval):
def __init__(self, project_dir: str, *args, **kwargs):
"""
Initialize BM25 module.
(Retrieval)
:param project_dir: The project directory path.
:param bm25_tokenizer: The tokenizer name that is used for BM25.
It supports 'porter_stemmer', 'ko_kiwi', and huggingface `AutoTokenizer`.
You can pass a huggingface tokenizer name.
Default is porter_stemmer.
:param kwargs: The optional arguments.
"""
super().__init__(project_dir)
# check if bm25_path and file exist
bm25_tokenizer = kwargs.get("bm25_tokenizer", None)
if bm25_tokenizer is None:
bm25_tokenizer = "porter_stemmer"
bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer))
assert (
bm25_path is not None
), "bm25_path must be specified for using bm25 retrieval."
assert os.path.exists(
bm25_path
), f"bm25_path {bm25_path} does not exist. Please ingest first."
self.bm25_corpus = load_bm25_corpus(bm25_path)
assert (
"tokens" and "passage_id" in list(self.bm25_corpus.keys())
), "bm25_corpus must contain tokens and passage_id. Please check you ingested bm25 corpus correctly."
self.tokenizer = select_bm25_tokenizer(bm25_tokenizer)
assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, (
f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. "
f"You need to ingest again. Delete bm25 pkl file and re-ingest it."
)
self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"])
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, *args, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
BM25 retrieval function.
You have to load a pickle file that is already ingested.
:param queries: 2-d list of query strings.
Each element of the list is a query strings of each row.
:param top_k: The number of passages to be retrieved.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: The 2-d list contains a list of passage ids that retrieved from bm25 and 2-d list of its scores.
It will be a length of queries. And each element has a length of top_k.
"""
if ids is not None:
score_result = list(
map(
lambda query_list, id_list: get_bm25_scores(
query_list,
id_list,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
),
queries,
ids,
)
)
return ids, score_result
# run async bm25_pure function
tasks = [
bm25_pure(
input_queries,
top_k,
self.tokenizer,
self.bm25_instance,
self.bm25_corpus,
)
for input_queries in queries
]
loop = get_event_loop()
results = loop.run_until_complete(asyncio.gather(*tasks))
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
async def bm25_pure(
queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict
) -> Tuple[List[str], List[float]]:
"""
Async BM25 retrieval function.
Its usage is for async retrieval of bm25 row by row.
:param queries: A list of query strings.
:param top_k: The number of passages to be retrieved.
:param tokenizer: A tokenizer that will be used to tokenize queries.
:param bm25_api: A bm25 api instance that will be used to retrieve passages.
:param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus.
Its data structure looks like this:
.. Code:: python
{
"tokens": [], # 2d list of tokens
"passage_id": [], # 2d list of passage_id. Type must be str.
}
:return: The tuple contains a list of passage ids that retrieved from bm25 and its scores.
"""
# Query tokenization is kept synchronous because the number of queries per row is usually small, so making it async would only add overhead.
tokenized_queries = tokenize(queries, tokenizer)
id_result = []
score_result = []
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
sorted_scores = sorted(scores, reverse=True)
top_n_index = np.argsort(scores)[::-1][:top_k]
ids = [bm25_corpus["passage_id"][i] for i in top_n_index]
id_result.append(ids)
score_result.append(sorted_scores[:top_k])
# evenly distribute passages across the expanded queries so the total equals top_k
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
def get_bm25_scores(
queries: List[str],
ids: List[str],
tokenizer,
bm25_api: BM25Okapi,
bm25_corpus: Dict,
) -> List[float]:
if len(ids) == 0 or not bool(ids):
return []
tokenized_queries = tokenize(queries, tokenizer)
result_dict = {id_: [] for id_ in ids}
for query in tokenized_queries:
scores = bm25_api.get_scores(query)
for i, id_ in enumerate(ids):
result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)])
result_df = pd.DataFrame(result_dict)
return result_df.max(axis=0).tolist()
def tokenize(queries: List[str], tokenizer) -> List[List[Union[int, str]]]:
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_queries = tokenizer(queries).input_ids
else:
tokenized_queries = tokenizer(queries)
return tokenized_queries
def bm25_ingest(
corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer"
):
if not corpus_path.endswith(".pkl"):
raise ValueError(f"Corpus path {corpus_path} is not a pickle file.")
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Initialize bm25_corpus
bm25_corpus = pd.DataFrame()
# Load the BM25 corpus if it exists and get the passage ids
if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
with open(corpus_path, "rb") as r:
corpus = pickle.load(r)
bm25_corpus = pd.DataFrame.from_dict(corpus)
duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)]
new_passage = corpus_data[
~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"])
]
else:
new_passage = corpus_data
if not new_passage.empty:
tokenizer = select_bm25_tokenizer(bm25_tokenizer)
if isinstance(tokenizer, PreTrainedTokenizerBase):
tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids
else:
tokenized_corpus = tokenizer(new_passage["contents"].tolist())
new_bm25_corpus = pd.DataFrame(
{
"tokens": tokenized_corpus,
"passage_id": new_passage["doc_id"].tolist(),
}
)
if not bm25_corpus.empty:
bm25_corpus_updated = pd.concat(
[bm25_corpus, new_bm25_corpus], ignore_index=True
)
bm25_dict = bm25_corpus_updated.to_dict("list")
else:
bm25_dict = new_bm25_corpus.to_dict("list")
# add tokenizer name to bm25_dict
bm25_dict["tokenizer_name"] = bm25_tokenizer
with open(corpus_path, "wb") as w:
pickle.dump(bm25_dict, w)
def select_bm25_tokenizer(
bm25_tokenizer: str,
) -> Callable[[str], List[Union[int, str]]]:
if bm25_tokenizer in list(BM25_TOKENIZER.keys()):
return BM25_TOKENIZER[bm25_tokenizer]
return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False)
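A minimal sketch of picking a tokenizer with select_bm25_tokenizer defined above; the HuggingFace model name in the comment is a hypothetical example:

# Illustrative only: resolving a tokenizer by name.
tokenizer = select_bm25_tokenizer("porter_stemmer")  # built-in name -> callable from BM25_TOKENIZER
tokenized = tokenizer(["BM25 ranks passages by term overlap with the query."])
# tokenized is a list containing one list of stemmed keyword tokens.

# Any other string is treated as a HuggingFace tokenizer name, e.g. (hypothetical):
# hf_tokenizer = select_bm25_tokenizer("bert-base-uncased")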

View File

@@ -0,0 +1,214 @@
import os
from pathlib import Path
from typing import Tuple, List, Union
import numpy as np
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
def normalize_mm(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
max_value = np.max(arr)
min_value = np.min(arr)
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
def normalize_tmm(scores: List[float], fixed_min_value: float):
arr = np.array(scores)
max_value = np.max(arr)
norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value)
return norm_score
def normalize_z(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
norm_score = (arr - mean_value) / std_value
return norm_score
def normalize_dbsf(scores: List[float], fixed_min_value: float = 0):
arr = np.array(scores)
mean_value = np.mean(arr)
std_value = np.std(arr)
min_value = mean_value - 3 * std_value
max_value = mean_value + 3 * std_value
norm_score = (arr - min_value) / (max_value - min_value)
return norm_score
normalize_method_dict = {
"mm": normalize_mm,
"tmm": normalize_tmm,
"z": normalize_z,
"dbsf": normalize_dbsf,
}
class HybridCC(HybridRetrieval):
def _pure(
self,
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
):
return hybrid_cc(
ids,
scores,
top_k,
weight,
normalize_method,
semantic_theoretical_min_value,
lexical_theoretical_min_value,
)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_cc, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __cc(**cc_params):
ids, scores = hybrid_cc(**cc_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __cc(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_cc(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: float,
normalize_method: str = "mm",
semantic_theoretical_min_value: float = -1.0,
lexical_theoretical_min_value: float = 0.0,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid CC function.
CC (convex combination) is a method to fuse lexical and semantic retrieval results.
It is a method that first normalizes the scores of each retrieval result,
and then combines them with the given weights.
It is unlike other retrieval modules, because it does not really execute retrieval,
but just fuses the results of other retrieval functions.
So you have to run more than two retrieval modules before running this function.
And collect ids and scores result from each retrieval module.
Make it as tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
The semantic retrieval ids must be the first index.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
The semantic retrieval scores must be the first index.
:param top_k: The number of passages to be retrieved.
:param normalize_method: The normalization method to use.
There are some normalization method that you can use at the hybrid cc method.
AutoRAG support following.
- `mm`: Min-max scaling
- `tmm`: Theoretical min-max scaling
- `z`: z-score normalization
- `dbsf`: 3-sigma normalization
:param weight: The weight value. If the weight is 1.0, it means the
weight to the semantic module will be 1.0 and weight to the lexical module will be 0.0.
:param semantic_theoretical_min_value: This value used by `tmm` normalization method. You can set the
theoretical minimum value by yourself. Default is -1.
:param lexical_theoretical_min_value: This value used by `tmm` normalization method. You can set the
theoretical minimum value by yourself. Default is 0.
:return: The tuple of ids and fused scores that are fused by CC.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight >= 0, "The weight must be at least 0."
assert weight <= 1, "The weight must be at most 1."
df = pd.DataFrame(
{
"semantic_ids": ids[0],
"lexical_ids": ids[1],
"semantic_score": scores[0],
"lexical_score": scores[1],
}
)
def cc_pure_apply(row):
return fuse_per_query(
row["semantic_ids"],
row["lexical_ids"],
row["semantic_score"],
row["lexical_score"],
normalize_method=normalize_method,
weight=weight,
top_k=top_k,
semantic_theoretical_min_value=semantic_theoretical_min_value,
lexical_theoretical_min_value=lexical_theoretical_min_value,
)
# fixed weight
df[["cc_id", "cc_score"]] = df.apply(
lambda row: cc_pure_apply(row), axis=1, result_type="expand"
)
return df["cc_id"].tolist(), df["cc_score"].tolist()
def fuse_per_query(
semantic_ids: List[str],
lexical_ids: List[str],
semantic_scores: List[float],
lexical_scores: List[float],
normalize_method: str,
weight: float,
top_k: int,
semantic_theoretical_min_value: float,
lexical_theoretical_min_value: float,
):
normalize_func = normalize_method_dict[normalize_method]
norm_semantic_scores = normalize_func(
semantic_scores, semantic_theoretical_min_value
)
norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value)
ids = [semantic_ids, lexical_ids]
scores = [norm_semantic_scores, norm_lexical_scores]
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
df.columns = ["semantic", "lexical"]
df = df.fillna(0)
df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1)
df = df.sort_values(by="weighted_sum", ascending=False)
return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()
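A worked example of hybrid_cc defined above with made-up ids and scores for a single query, min-max normalization, and a weight of 0.6 toward the semantic result:

# Illustrative only: one query, semantic result first, lexical (BM25) result second.
ids = (
    [["d1", "d2", "d3"]],  # semantic retrieval ids
    [["d2", "d4", "d1"]],  # lexical retrieval ids
)
scores = (
    [[0.8, 0.6, 0.4]],     # semantic scores
    [[12.0, 9.0, 3.0]],    # lexical scores
)
fused_ids, fused_scores = hybrid_cc(ids, scores, top_k=3, weight=0.6)
# fused_ids    -> [['d2', 'd1', 'd4']]
# fused_scores -> approximately [[0.7, 0.6, 0.27]]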

View File

@@ -0,0 +1,128 @@
import os
from pathlib import Path
from typing import List, Tuple, Union
import pandas as pd
from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
class HybridRRF(HybridRetrieval):
def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
return hybrid_rrf(ids, scores, top_k, weight, rrf_k)
@classmethod
def run_evaluator(
cls,
project_dir: Union[str, Path],
previous_result: pd.DataFrame,
*args,
**kwargs,
):
if "ids" in kwargs and "scores" in kwargs:
data_dir = os.path.join(project_dir, "data")
corpus_df = pd.read_parquet(
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
)
params = pop_params(hybrid_rrf, kwargs)
assert (
"ids" in params and "scores" in params and "top_k" in params
), "ids, scores, and top_k must be specified."
@result_to_dataframe(
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
)
def __rrf(**rrf_params):
ids, scores = hybrid_rrf(**rrf_params)
contents = fetch_contents(corpus_df, ids)
return contents, ids, scores
return __rrf(**params)
else:
assert (
"target_modules" in kwargs and "target_module_params" in kwargs
), "target_modules and target_module_params must be specified if there is not ids and scores."
instance = cls(project_dir, *args, **kwargs)
result = instance.pure(previous_result, *args, **kwargs)
del instance
return result
def hybrid_rrf(
ids: Tuple,
scores: Tuple,
top_k: int,
weight: int = 60,
rrf_k: int = -1,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
Hybrid RRF function.
RRF (Rank Reciprocal Fusion) is a method to fuse multiple retrieval results.
It is common to fuse dense retrieval and sparse retrieval results using RRF.
To use this function, you must input ids and scores as tuple.
It is unlike other retrieval modules because it does not really execute retrieval but just fuses
the results of other retrieval functions.
So you have to run more than two retrieval modules before running this function.
And collect ids and scores result from each retrieval module.
Make it as a tuple and input it to this function.
:param ids: The tuple of ids that you want to fuse.
The length of this must be the same as the length of scores.
:param scores: The retrieve scores that you want to fuse.
The length of this must be the same as the length of ids.
:param top_k: The number of passages to be retrieved.
:param weight: Hyperparameter for RRF.
It was originally rrf_k value.
Default is 60.
For more information, please visit our documentation.
:param rrf_k: (Deprecated) Hyperparameter for RRF.
It was originally rrf_k value. Will remove at a further version.
:return: The tuple of ids and fused scores that are fused by RRF.
"""
assert len(ids) == len(scores), "The length of ids and scores must be the same."
assert len(ids) > 1, "You must input more than one retrieval results."
assert top_k > 0, "top_k must be greater than 0."
assert weight > 0, "weight (rrf_k) must be greater than 0."
if rrf_k != -1:
weight = int(rrf_k)
else:
weight = int(weight)
id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
score_df = pd.DataFrame(
{f"score_{i}": score_list for i, score_list in enumerate(scores)}
)
df = pd.concat([id_df, score_df], axis=1)
def rrf_pure_apply(row):
ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))
df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
return df["rrf_id"].tolist(), df["rrf_score"].tolist()
def rrf_pure(
ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
) -> Tuple[List[str], List[float]]:
df = pd.concat(
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
)
rank_df = df.rank(ascending=False, method="min")
rank_df = rank_df.fillna(0)
rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
rank_df = rank_df.sort_values(by="rrf", ascending=False)
return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]
def rrf_calculate(row, rrf_k):
result = 0
for r in row:
if r == 0:
continue
result += 1 / (r + rrf_k)
return result
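A worked example of the RRF formula and of hybrid_rrf defined above, using the default weight (originally rrf_k) of 60 and made-up ids and scores for a single query:

# Illustrative only: a passage ranked 1st by one retriever and 3rd by the other.
print(1 / (1 + 60) + 1 / (3 + 60))  # ~0.0323

# Fusing two made-up single-query results with hybrid_rrf:
ids = ([["d1", "d2", "d3"]], [["d2", "d4", "d1"]])
scores = ([[0.8, 0.6, 0.4]], [[12.0, 9.0, 3.0]])
fused_ids, fused_scores = hybrid_rrf(ids, scores, top_k=3)
# fused_ids -> [['d2', 'd1', 'd4']]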

View File

@@ -0,0 +1,544 @@
import logging
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Tuple, Union
import numpy as np
import pandas as pd
from autorag.evaluation import evaluate_retrieval
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils.util import get_best_row, to_list, apply_recursive
logger = logging.getLogger("AutoRAG")
semantic_module_names = ["vectordb", "VectorDB"]
lexical_module_names = ["bm25", "BM25"]
hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"]
def run_retrieval_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among retrieval node results.
:param modules: Retrieval modules to run.
:param module_params: Retrieval module parameters.
:param previous_result: Previous result dataframe.
Could be query expansion's best result or qa data.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for retrieval node.
:return: The best result dataframe.
It contains previous result columns and retrieval node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
qa_df = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
retrieval_gt = qa_df["retrieval_gt"].tolist()
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
# make rows to metric_inputs
metric_inputs = [
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
for ret_gt, query, gen_gt in zip(
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
)
]
save_dir = os.path.join(node_line_dir, "retrieval") # node name
if not os.path.exists(save_dir):
os.makedirs(save_dir)
def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]:
"""
Run input modules and parameters.
:param input_modules: Input modules
:param input_module_params: Input module parameters
:return: First, it returns list of result dataframe.
Second, it returns list of execution times.
"""
result, execution_times = zip(
*map(
lambda task: measure_speed(
task[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**task[1],
),
zip(input_modules, input_module_params),
)
)
average_times = list(map(lambda x: x / len(result[0]), execution_times))
# run metrics before filtering
if strategies.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result = list(
map(
lambda x: evaluate_retrieval_node(
x,
metric_inputs,
strategies.get("metrics"),
),
result,
)
)
return result, average_times
def save_and_summary(
input_modules,
input_module_params,
result_list,
execution_time_list,
filename_start: int,
):
"""
Save the results and make a summary file.
:param input_modules: Input modules
:param input_module_params: Input module parameters
:param result_list: Result list
:param execution_time_list: Execution times
:param filename_start: The first filename to use
:return: The summary dataframe for the saved results.
"""
# save results to folder
filepaths = list(
map(
lambda x: os.path.join(save_dir, f"{x}.parquet"),
range(filename_start, filename_start + len(input_modules)),
)
)
list(
map(
lambda x: x[0].to_parquet(x[1], index=False),
zip(result_list, filepaths),
)
) # execute save to parquet
filename_list = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filename_list,
"module_name": list(map(lambda module: module.__name__, input_modules)),
"module_params": input_module_params,
"execution_time": execution_time_list,
**{
metric: list(map(lambda result: result[metric].mean(), result_list))
for metric in strategies.get("metrics")
},
}
)
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return summary_df
def find_best(results, average_times, filenames):
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
selected_result, selected_filename = select_best(
results,
strategies.get("metrics"),
filenames,
strategies.get("strategy", "mean"),
)
return selected_result, selected_filename
filename_first = 0
# run semantic modules
logger.info("Running retrieval node - semantic retrieval module...")
if any([module.__name__ in semantic_module_names for module in modules]):
semantic_modules, semantic_module_params = zip(
*filter(
lambda x: x[0].__name__ in semantic_module_names,
zip(modules, module_params),
)
)
semantic_results, semantic_times = run(semantic_modules, semantic_module_params)
semantic_summary_df = save_and_summary(
semantic_modules,
semantic_module_params,
semantic_results,
semantic_times,
filename_first,
)
semantic_selected_result, semantic_selected_filename = find_best(
semantic_results, semantic_times, semantic_summary_df["filename"].tolist()
)
semantic_summary_df["is_best"] = (
semantic_summary_df["filename"] == semantic_selected_filename
)
filename_first += len(semantic_modules)
else:
(
semantic_selected_filename,
semantic_summary_df,
semantic_results,
semantic_times,
) = None, pd.DataFrame(), [], []
# run lexical modules
logger.info("Running retrieval node - lexical retrieval module...")
if any([module.__name__ in lexical_module_names for module in modules]):
lexical_modules, lexical_module_params = zip(
*filter(
lambda x: x[0].__name__ in lexical_module_names,
zip(modules, module_params),
)
)
lexical_results, lexical_times = run(lexical_modules, lexical_module_params)
lexical_summary_df = save_and_summary(
lexical_modules,
lexical_module_params,
lexical_results,
lexical_times,
filename_first,
)
lexical_selected_result, lexical_selected_filename = find_best(
lexical_results, lexical_times, lexical_summary_df["filename"].tolist()
)
lexical_summary_df["is_best"] = (
lexical_summary_df["filename"] == lexical_selected_filename
)
filename_first += len(lexical_modules)
else:
(
lexical_selected_filename,
lexical_summary_df,
lexical_results,
lexical_times,
) = None, pd.DataFrame(), [], []
logger.info("Running retrieval node - hybrid retrieval module...")
# Next, run hybrid retrieval
if any([module.__name__ in hybrid_module_names for module in modules]):
hybrid_modules, hybrid_module_params = zip(
*filter(
lambda x: x[0].__name__ in hybrid_module_names,
zip(modules, module_params),
)
)
if all(
["target_module_params" in x for x in hybrid_module_params]
): # for Runner.run
# If target_module_params are already given, run hybrid retrieval directly
hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
else: # for Evaluator
# get id and score
ids_scores = get_ids_and_scores(
save_dir,
[semantic_selected_filename, lexical_selected_filename],
semantic_summary_df,
lexical_summary_df,
previous_result,
)
hybrid_module_params = list(
map(lambda x: {**x, **ids_scores}, hybrid_module_params)
)
# optimize each module
real_hybrid_times = [
get_hybrid_execution_times(semantic_summary_df, lexical_summary_df)
] * len(hybrid_module_params)
hybrid_times = real_hybrid_times.copy()
hybrid_results = []
for module, module_param in zip(hybrid_modules, hybrid_module_params):
module_result_df, module_best_weight = optimize_hybrid(
module,
module_param,
strategies,
metric_inputs,
project_dir,
previous_result,
)
module_param["weight"] = module_best_weight
hybrid_results.append(module_result_df)
hybrid_summary_df = save_and_summary(
hybrid_modules,
hybrid_module_params,
hybrid_results,
hybrid_times,
filename_first,
)
filename_first += len(hybrid_modules)
hybrid_summary_df["execution_time"] = hybrid_times
best_semantic_summary_row = semantic_summary_df.loc[
semantic_summary_df["is_best"]
].iloc[0]
best_lexical_summary_row = lexical_summary_df.loc[
lexical_summary_df["is_best"]
].iloc[0]
target_modules = (
best_semantic_summary_row["module_name"],
best_lexical_summary_row["module_name"],
)
target_module_params = (
best_semantic_summary_row["module_params"],
best_lexical_summary_row["module_params"],
)
hybrid_summary_df = edit_summary_df_params(
hybrid_summary_df, target_modules, target_module_params
)
else:
if any([module.__name__ in hybrid_module_names for module in modules]):
logger.warning(
"You must at least one semantic module and lexical module for hybrid evaluation."
"Passing hybrid module."
)
_, hybrid_summary_df, hybrid_results, hybrid_times = (
None,
pd.DataFrame(),
[],
[],
)
summary = pd.concat(
[semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True
)
results = semantic_results + lexical_results + hybrid_results
average_times = semantic_times + lexical_times + hybrid_times
filenames = summary["filename"].tolist()
# filter by strategies
selected_result, selected_filename = find_best(results, average_times, filenames)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add summary.csv 'is_best' column
summary["is_best"] = summary["filename"] == selected_filename
# save the result files
best_result.to_parquet(
os.path.join(
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
return best_result
def evaluate_retrieval_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
"""
Evaluate retrieval node from retrieval node result dataframe.
:param result_df: The result dataframe from a retrieval node.
:param metric_inputs: List of metric input schema for AutoRAG.
:param metrics: Metric list from input strategies.
:return: Return result_df with metrics columns.
The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
"""
@evaluate_retrieval(
metric_inputs=metric_inputs,
metrics=metrics,
)
def evaluate_this_module(df: pd.DataFrame):
return (
df["retrieved_contents"].tolist(),
df["retrieved_ids"].tolist(),
df["retrieve_scores"].tolist(),
)
return evaluate_this_module(result_df)
def edit_summary_df_params(
summary_df: pd.DataFrame, target_modules, target_module_params
) -> pd.DataFrame:
def delete_ids_scores(x):
del x["ids"]
del x["scores"]
return x
summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
summary_df["new_params"] = [
{"target_modules": target_modules, "target_module_params": target_module_params}
] * len(summary_df)
summary_df["module_params"] = summary_df.apply(
lambda row: {**row["module_params"], **row["new_params"]}, axis=1
)
summary_df = summary_df.drop(columns=["new_params"])
return summary_df
def get_ids_and_scores(
node_dir: str,
filenames: List[str],
semantic_summary_df: pd.DataFrame,
lexical_summary_df: pd.DataFrame,
previous_result,
) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
project_dir = pathlib.PurePath(node_dir).parent.parent.parent
best_results_df = list(
map(
lambda filename: pd.read_parquet(
os.path.join(node_dir, filename), engine="pyarrow"
),
filenames,
)
)
ids = tuple(
map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
)
scores = tuple(
map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
)
# search non-duplicate ids
semantic_ids = deepcopy(ids[0])
lexical_ids = deepcopy(ids[1])
def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
"""
Return, for each row, the ids that appear in compare_ids but not in target_ids.
For example, to get the ids that the semantic run is missing, pass semantic_ids as target_ids and lexical_ids as compare_ids.
"""
result_ids = []
assert len(target_ids) == len(compare_ids)
for target_id_list, compare_id_list in zip(target_ids, compare_ids):
non_duplicate_list = list(set(compare_id_list) - set(target_id_list))
result_ids.append(non_duplicate_list)
return result_ids
lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids)
semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids)
new_id_tuple = (
[a + b for a, b in zip(semantic_ids, semantic_target_ids)],
[a + b for a, b in zip(lexical_ids, lexical_target_ids)],
)
# search non-duplicate ids' scores
new_semantic_scores = get_scores_by_ids(
semantic_target_ids, semantic_summary_df, project_dir, previous_result
)
new_lexical_scores = get_scores_by_ids(
lexical_target_ids, lexical_summary_df, project_dir, previous_result
)
new_score_tuple = (
[a + b for a, b in zip(scores[0], new_semantic_scores)],
[a + b for a, b in zip(scores[1], new_lexical_scores)],
)
return {
"ids": new_id_tuple,
"scores": new_score_tuple,
}
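To make the id merging above concrete, a standalone hedged illustration that mirrors the nested get_non_duplicate_ids logic without calling it:
semantic_ids = [["doc1", "doc2"]]
lexical_ids = [["doc2", "doc3"]]
# ids returned by the lexical run but missing from the semantic run
semantic_missing = [list(set(lex) - set(sem)) for sem, lex in zip(semantic_ids, lexical_ids)]  # [["doc3"]]
# each semantic id list is extended with those ids, so both runs cover the same documents
merged_semantic_ids = [a + b for a, b in zip(semantic_ids, semantic_missing)]  # [["doc1", "doc2", "doc3"]]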
def get_scores_by_ids(
ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result
) -> List[List[float]]:
module_name = get_best_row(module_summary_df)["module_name"]
module_params = get_best_row(module_summary_df)["module_params"]
module = get_support_modules(module_name)
result_df = module.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
ids=ids,
**module_params,
)
return to_list(result_df["retrieve_scores"].tolist())
def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]:
return list(set(list1).symmetric_difference(set(list2)))
def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float:
lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][
"execution_time"
]
semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][
"execution_time"
]
return lexical_execution_time + semantic_execution_time
def optimize_hybrid(
hybrid_module_func: Callable,
hybrid_module_param: Dict,
strategy: Dict,
input_metrics: List[MetricInput],
project_dir,
previous_result,
):
if (
hybrid_module_func.__name__ == "HybridRRF"
or hybrid_module_func.__name__ == "hybrid_rrf"
):
weight_range = hybrid_module_param.pop("weight_range", (4, 80))
test_weight_size = weight_range[1] - weight_range[0] + 1
elif (
hybrid_module_func.__name__ == "HybridCC"
or hybrid_module_func.__name__ == "hybrid_cc"
):
weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0))
test_weight_size = hybrid_module_param.pop("test_weight_size", 101)
else:
raise ValueError("You must input hybrid module function at hybrid_module_func.")
weight_candidates = np.linspace(
weight_range[0], weight_range[1], test_weight_size
).tolist()
result_list = []
for weight_value in weight_candidates:
result_df = hybrid_module_func.run_evaluator(
project_dir=project_dir,
previous_result=previous_result,
weight=weight_value,
**hybrid_module_param,
)
result_list.append(result_df)
# evaluate here
if strategy.get("metrics") is None:
raise ValueError("You must at least one metrics for retrieval evaluation.")
result_list = list(
map(
lambda x: evaluate_retrieval_node(
x,
input_metrics,
strategy.get("metrics"),
),
result_list,
)
)
# select best result
best_result_df, best_weight = select_best(
result_list,
strategy.get("metrics"),
metadatas=weight_candidates,
strategy_name=strategy.get("strategy", "normalize_mean"),
)
return best_result_df, best_weight
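For intuition, a hedged sketch of the weight grids produced above, using only the default ranges shown in the code:
import numpy as np

# hybrid_rrf: sweep rrf_k over the default (4, 80) range with a step of 1
rrf_candidates = np.linspace(4, 80, 80 - 4 + 1).tolist()  # [4.0, 5.0, ..., 80.0], 77 values
# hybrid_cc: 101 evenly spaced weights over the default (0.0, 1.0) range
cc_candidates = np.linspace(0.0, 1.0, 101).tolist()  # [0.0, 0.01, ..., 1.0]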

View File

@@ -0,0 +1,303 @@
import itertools
import logging
import os
from typing import List, Tuple, Optional
import numpy as np
import pandas as pd
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from autorag.evaluation.metric.util import (
calculate_l2_distance,
calculate_inner_product,
calculate_cosine_similarity,
)
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
from autorag.utils import (
validate_corpus_dataset,
cast_corpus_dataset,
cast_qa_dataset,
validate_qa_dataset,
)
from autorag.utils.util import (
get_event_loop,
process_batch,
openai_truncate_by_token,
flatten_apply,
result_to_dataframe,
pop_params,
fetch_contents,
empty_cuda_cache,
convert_inputs_to_list,
make_batch,
)
from autorag.vectordb import load_vectordb_from_yaml
from autorag.vectordb.base import BaseVectorStore
logger = logging.getLogger("AutoRAG")
class VectorDB(BaseRetrieval):
def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
"""
Initialize VectorDB retrieval node.
:param project_dir: The project directory path.
:param vectordb: The vectordb name.
The vectordb name must be configured in the config.yaml file.
If it is not configured, the default vectordb is used.
:param kwargs: Optional extra arguments.
They are not used in the init method.
"""
super().__init__(project_dir)
vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
self.vector_store = load_vectordb_from_yaml(
vectordb_config_path, vectordb, project_dir
)
self.embedding_model = self.vector_store.embedding
def __del__(self):
del self.vector_store
del self.embedding_model
empty_cuda_cache()
super().__del__()
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
queries = self.cast_to_run(previous_result)
pure_params = pop_params(self._pure, kwargs)
ids, scores = self._pure(queries, **pure_params)
contents = fetch_contents(self.corpus_df, ids)
return contents, ids, scores
def _pure(
self,
queries: List[List[str]],
top_k: int,
embedding_batch: int = 128,
ids: Optional[List[List[str]]] = None,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
VectorDB retrieval function.
You need a vector store collection that has already been ingested,
together with the embedding model that was used for that ingestion.
:param queries: 2-d list of query strings.
Each element of the list is the list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param embedding_batch: The number of queries to be processed in parallel.
This is used to prevent API errors during query embedding.
Default is 128.
:param ids: The optional list of ids that you want to retrieve.
You don't need to specify this in the general use cases.
Default is None.
:return: A 2-d list of passage ids retrieved from the vectordb and a 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
"""
# if ids are specified, fetch their scores from the vector store
if ids is not None:
return self.__get_ids_scores(queries, ids, embedding_batch)
# run async vector_db_pure function
tasks = [
vectordb_pure(query_list, top_k, self.vector_store)
for query_list in queries
]
loop = get_event_loop()
results = loop.run_until_complete(
process_batch(tasks, batch_size=embedding_batch)
)
id_result = list(map(lambda x: x[0], results))
score_result = list(map(lambda x: x[1], results))
return id_result, score_result
def __get_ids_scores(self, queries, ids, embedding_batch: int):
# truncate queries and embedding execution here.
openai_embedding_limit = 8000
if isinstance(self.embedding_model, OpenAIEmbedding):
queries = list(
map(
lambda query_list: openai_truncate_by_token(
query_list,
openai_embedding_limit,
self.embedding_model.model_name,
),
queries,
)
)
query_embeddings = flatten_apply(
run_query_embedding_batch,
queries,
embedding_model=self.embedding_model,
batch_size=embedding_batch,
)
loop = get_event_loop()
async def run_fetch(ids):
final_result = []
for id_list in ids:
if len(id_list) == 0:
final_result.append([])
else:
result = await self.vector_store.fetch(id_list)
final_result.append(result)
return final_result
content_embeddings = loop.run_until_complete(run_fetch(ids))
score_result = list(
map(
lambda query_embedding_list, content_embedding_list: get_id_scores(
query_embedding_list,
content_embedding_list,
similarity_metric=self.vector_store.similarity_metric,
),
query_embeddings,
content_embeddings,
)
)
return ids, score_result
async def vectordb_pure(
queries: List[str], top_k: int, vectordb: BaseVectorStore
) -> Tuple[List[str], List[float]]:
"""
Async VectorDB retrieval function.
It retrieves from the vector store asynchronously, one row at a time.
:param queries: A list of query strings for one row.
:param top_k: The number of passages to be retrieved.
:param vectordb: The vector store instance.
:return: A tuple containing a list of passage ids retrieved from the vectordb and a list of their scores.
"""
id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)
# Distribute passages evenly
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
# sort id_result and score_result by score
result = [
(_id, score)
for score, _id in sorted(
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
)
]
id_result, score_result = zip(*result)
return list(id_result), list(score_result)
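The final sort above pairs each id with its score and orders the pairs by score, descending; a tiny standalone illustration with made-up values:
id_result = ["doc1", "doc2", "doc3"]
score_result = [0.2, 0.9, 0.5]
pairs = sorted(zip(score_result, id_result), key=lambda pair: pair[0], reverse=True)
sorted_scores, sorted_ids = zip(*pairs)
# sorted_ids == ("doc2", "doc3", "doc1"), sorted_scores == (0.9, 0.5, 0.2)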
async def filter_exist_ids(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
ids = corpus_data["doc_id"].tolist()
# Query the collection to check if IDs already exist
existed_bool_list = await vectordb.is_exist(ids=ids)
# Keep only the passages whose ids do not exist in the vectordb yet
new_passage = corpus_data[~pd.Series(existed_bool_list)]
return new_passage
async def filter_exist_ids_from_retrieval_gt(
vectordb: BaseVectorStore,
qa_data: pd.DataFrame,
corpus_data: pd.DataFrame,
) -> pd.DataFrame:
qa_data = cast_qa_dataset(qa_data)
validate_qa_dataset(qa_data)
corpus_data = cast_corpus_dataset(corpus_data)
validate_corpus_dataset(corpus_data)
retrieval_gt = (
qa_data["retrieval_gt"]
.apply(lambda x: list(itertools.chain.from_iterable(x)))
.tolist()
)
retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
retrieval_gt = list(set(retrieval_gt))
existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
add_ids = []
for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
if not is_exist:
add_ids.append(ret_gt)
new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
return new_passage
async def vectordb_ingest(
vectordb: BaseVectorStore,
corpus_data: pd.DataFrame,
):
"""
Ingest given corpus data to the vectordb.
It truncates corpus content to 8,000 tokens when the embedding model is OpenAIEmbedding.
Plus, when the corpus content is empty (whitespace), it will be ignored.
And if there is a document id that already exists in the collection, it will be ignored.
:param vectordb: A vector store instance that you want to ingest into.
:param corpus_data: The corpus data that contains doc_id and contents columns.
"""
embedding_batch = vectordb.embedding_batch
if not corpus_data.empty:
new_contents = corpus_data["contents"].tolist()
new_ids = corpus_data["doc_id"].tolist()
content_batches = make_batch(new_contents, embedding_batch)
id_batches = make_batch(new_ids, embedding_batch)
for content_batch, id_batch in zip(content_batches, id_batches):
await vectordb.add(ids=id_batch, texts=content_batch)
def run_query_embedding_batch(
queries: List[str], embedding_model: BaseEmbedding, batch_size: int
) -> List[List[float]]:
result = []
for i in range(0, len(queries), batch_size):
batch = queries[i : i + batch_size]
embeddings = embedding_model.get_text_embedding_batch(batch)
result.extend(embeddings)
return result
@convert_inputs_to_list
def get_id_scores(  # Find the not-yet-calculated scores when fusing scores for hybrid retrieval
query_embeddings: List[List[float]],  # The embeddings of the queries from one user input row
content_embeddings: List[List[float]],
similarity_metric: str,
) -> List[float]:  # The highest score among the queries for each content; the result has the same length as the contents
"""
Calculate the highest similarity scores between query embeddings and content embeddings.
:param query_embeddings: A list of lists containing query embeddings.
:param content_embeddings: A list of lists containing content embeddings.
:param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
:return: A list of the highest similarity scores for each content embedding.
"""
metric_func_dict = {
"l2": lambda x, y: 1 - calculate_l2_distance(x, y),
"ip": calculate_inner_product,
"cosine": calculate_cosine_similarity,
}
metric_func = metric_func_dict[similarity_metric]
result = []
for content_embedding in content_embeddings:
scores = []
for query_embedding in query_embeddings:
scores.append(
metric_func(np.array(query_embedding), np.array(content_embedding))
)
result.append(max(scores))
return result
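A hedged usage sketch for get_id_scores with made-up 2-d embeddings, assuming calculate_cosine_similarity implements the standard cosine similarity:
query_embeddings = [[1.0, 0.0], [0.0, 1.0]]    # two query embeddings from one row
content_embeddings = [[0.6, 0.8], [1.0, 0.0]]  # two retrieved passage embeddings
scores = get_id_scores(query_embeddings, content_embeddings, similarity_metric="cosine")
# first content: max(cosine([1, 0], [0.6, 0.8]), cosine([0, 1], [0.6, 0.8])) = max(0.6, 0.8) = 0.8
# second content: max(1.0, 0.0) = 1.0, so scores is approximately [0.8, 1.0]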

16
autorag/nodes/util.py Normal file
View File

@@ -0,0 +1,16 @@
from typing import Optional, Dict
from autorag.support import get_support_modules
def make_generator_callable_param(generator_dict: Optional[Dict]):
if "generator_module_type" not in generator_dict.keys():
generator_dict = {
"generator_module_type": "llama_index_llm",
"llm": "openai",
"model": "gpt-4o-mini",
}
module_str = generator_dict.pop("generator_module_type")
module_class = get_support_modules(module_str)
module_param = generator_dict
return module_class, module_param
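A hedged usage sketch; the parameter values below are illustrative only:
module_class, module_param = make_generator_callable_param(
    {"generator_module_type": "llama_index_llm", "llm": "openai", "model": "gpt-4o-mini", "batch": 8}
)
# module_class is resolved via get_support_modules("llama_index_llm"),
# and module_param keeps the remaining keys:
# {"llm": "openai", "model": "gpt-4o-mini", "batch": 8}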