Fix Dockerfile build issue
0  autorag/nodes/__init__.py  Normal file
4  autorag/nodes/generator/__init__.py  Normal file
@@ -0,0 +1,4 @@
from .llama_index_llm import LlamaIndexLLM
from .openai_llm import OpenAILLM
from .vllm import Vllm
from .vllm_api import VllmAPI
103  autorag/nodes/generator/base.py  Normal file
@@ -0,0 +1,103 @@
import abc
import functools
import logging
from pathlib import Path
from typing import Union, Tuple, List

import pandas as pd
from llama_index.core.output_parsers import PydanticOutputParser

from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe

logger = logging.getLogger("AutoRAG")


class BaseGenerator(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, llm: str, *args, **kwargs):
        logger.info(f"Initialize generator node - {self.__class__.__name__}")
        self.llm = llm

    def __del__(self):
        logger.info(f"Deleting generator module - {self.__class__.__name__}")

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(f"Running generator node - {self.__class__.__name__} module...")
        assert (
            "prompts" in previous_result.columns
        ), "previous_result must contain prompts column."
        prompts = previous_result["prompts"].tolist()
        return prompts

    def structured_output(self, prompts: List[str], output_cls):
        response, _, _ = self._pure(prompts)
        parser = PydanticOutputParser(output_cls)
        result = []
        for res in response:
            try:
                result.append(parser.parse(res))
            except Exception as e:
                logger.warning(
                    f"Error parsing response: {e} \nSo returning None instead in this case."
                )
                result.append(None)
        return result

    @abc.abstractmethod
    async def astream(self, prompt: str, **kwargs):
        pass

    @abc.abstractmethod
    def stream(self, prompt: str, **kwargs):
        pass


def generator_node(func):
    @functools.wraps(func)
    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def wrapper(
        project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        This decorator turns a generator module into a node.
        It automatically extracts prompts from previous_result and runs the generator function.
        Plus, it retrieves the llm instance from autorag.generator_models.

        :param project_dir: The project directory.
        :param previous_result: The previous result that contains prompts.
        :param llm: The llm name that you want to use.
        :param kwargs: The extra parameters for initializing the llm instance.
        :return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs.
            Each column is "generated_texts", "generated_tokens", and "generated_log_probs".
        """
        logger.info(f"Running generator node - {func.__name__} module...")
        assert (
            "prompts" in previous_result.columns
        ), "previous_result must contain prompts column."
        prompts = previous_result["prompts"].tolist()
        if func.__name__ == "llama_index_llm":
            if llm not in generator_models:
                raise ValueError(
                    f"{llm} is not a valid llm name. Please check the llm name. "
                    "You can check valid llm names from autorag.generator_models."
                )
            batch = kwargs.pop("batch", 16)
            if llm == "huggingfacellm":
                model_name = kwargs.pop("model", None)
                if model_name is not None:
                    kwargs["model_name"] = model_name
                else:
                    if "model_name" not in kwargs.keys():
                        raise ValueError(
                            "`model` or `model_name` parameter must be provided for using huggingfacellm."
                        )
                kwargs["tokenizer_name"] = kwargs["model_name"]
            llm_instance = generator_models[llm](**kwargs)
            result = func(prompts=prompts, llm=llm_instance, batch=batch)
            del llm_instance
            return result
        else:
            return func(prompts=prompts, llm=llm, **kwargs)

    return wrapper
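As a quick illustration of the decorator contract above (a sketch, not part of the commit), a hypothetical no-op module wrapped with generator_node could look like this; the module name echo_llm and the "mock" llm string are made up:

import pandas as pd

from autorag.nodes.generator.base import generator_node


@generator_node
def echo_llm(prompts, llm, **kwargs):
    # Return (texts, token ids, log probs) in the shape the decorator expects.
    return prompts, [[0] for _ in prompts], [[0.0] for _ in prompts]


# previous_result only needs a "prompts" column, as the wrapper asserts.
previous_result = pd.DataFrame({"prompts": ["Hello", "World"]})
result_df = echo_llm("./project", previous_result, llm="mock")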
97  autorag/nodes/generator/llama_index_llm.py  Normal file
@@ -0,0 +1,97 @@
from typing import List, Tuple

import pandas as pd
from llama_index.core.base.llms.base import BaseLLM
from transformers import AutoTokenizer

from autorag import generator_models
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    result_to_dataframe,
    pop_params,
)


class LlamaIndexLLM(BaseGenerator):
    def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs):
        """
        Initialize the Llama Index LLM module.

        :param project_dir: The project directory.
        :param llm: The llama index LLM name to use.
        :param batch: The batch size for llm.
            Set it lower if you face errors.
            Default is 16.
        :param kwargs: The extra parameters for initializing the llm instance.
        """
        super().__init__(project_dir=project_dir, llm=llm)
        if self.llm not in generator_models.keys():
            raise ValueError(
                f"{self.llm} is not a valid llm name. Please check the llm name. "
                "You can check valid llm names from autorag.generator_models."
            )
        self.batch = batch
        llm_class = generator_models[self.llm]

        if llm_class.class_name() in [
            "HuggingFace_LLM",
            "HuggingFaceInferenceAPI",
            "TextGenerationInference",
        ]:
            model_name = kwargs.pop("model", None)
            if model_name is not None:
                kwargs["model_name"] = model_name
            else:
                if "model_name" not in kwargs.keys():
                    raise ValueError(
                        "`model` or `model_name` parameter must be provided for using huggingfacellm."
                    )
            kwargs["tokenizer_name"] = kwargs["model_name"]
        self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs))

    def __del__(self):
        super().__del__()
        del self.llm_instance

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result=previous_result)
        return self._pure(prompts)

    def _pure(
        self,
        prompts: List[str],
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Llama Index LLM module.
        It gets the LLM instance from llama index, and returns generated text for the input prompts.
        It does not generate the right log probs, but it returns pseudo log probs,
        which are not meant to be used by other modules.

        :param prompts: A list of prompts.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of the generated texts' token ids; the tokenizer used is the GPT-2 tokenizer.
            The third element is a list of the generated texts' pseudo log probs.
        """
        tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
        loop = get_event_loop()
        results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))

        generated_texts = list(map(lambda x: x.text, results))
        tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
        tokenized_ids = tokenizer(generated_texts).data["input_ids"]
        pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
        return generated_texts, tokenized_ids, pseudo_log_probs

    async def astream(self, prompt: str, **kwargs):
        async for completion_response in await self.llm_instance.astream_complete(
            prompt
        ):
            yield completion_response.text

    def stream(self, prompt: str, **kwargs):
        for completion_response in self.llm_instance.stream_complete(prompt):
            yield completion_response.text
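A minimal usage sketch for the class above (illustrative only; "mock" is assumed to be a key registered in autorag.generator_models, such as LlamaIndex's MockLLM):

import pandas as pd

from autorag.nodes.generator import LlamaIndexLLM

previous_result = pd.DataFrame({"prompts": ["Say hi.", "Say bye."]})
module = LlamaIndexLLM(project_dir="./project", llm="mock", batch=4)
result_df = module.pure(previous_result)
print(result_df["generated_texts"].tolist())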
296  autorag/nodes/generator/openai_llm.py  Normal file
@@ -0,0 +1,296 @@
import logging
from typing import List, Tuple

import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from tiktoken import Encoding

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    pop_params,
    result_to_dataframe,
)

logger = logging.getLogger("AutoRAG")

MAX_TOKEN_DICT = {  # model name : token limit
    "gpt-4.5-preview": 128_000,
    "gpt-4.5-preview-2025-02-27": 128_000,
    "o1": 200_000,
    "o1-preview": 128_000,
    "o1-preview-2024-09-12": 128_000,
    "o1-mini": 128_000,
    "o1-mini-2024-09-12": 128_000,
    "o3-mini": 200_000,
    "gpt-4o-mini": 128_000,
    "gpt-4o-mini-2024-07-18": 128_000,
    "gpt-4o": 128_000,
    "gpt-4o-2024-08-06": 128_000,
    "gpt-4o-2024-05-13": 128_000,
    "chatgpt-4o-latest": 128_000,
    "gpt-4-turbo": 128_000,
    "gpt-4-turbo-2024-04-09": 128_000,
    "gpt-4-turbo-preview": 128_000,
    "gpt-4-0125-preview": 128_000,
    "gpt-4-1106-preview": 128_000,
    "gpt-4-vision-preview": 128_000,
    "gpt-4-1106-vision-preview": 128_000,
    "gpt-4": 8_192,
    "gpt-4-0613": 8_192,
    "gpt-4-32k": 32_768,
    "gpt-4-32k-0613": 32_768,
    "gpt-3.5-turbo-0125": 16_385,
    "gpt-3.5-turbo": 16_385,
    "gpt-3.5-turbo-1106": 16_385,
    "gpt-3.5-turbo-instruct": 4_096,
    "gpt-3.5-turbo-16k": 16_385,
    "gpt-3.5-turbo-0613": 4_096,
    "gpt-3.5-turbo-16k-0613": 16_385,
}


class OpenAILLM(BaseGenerator):
    def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
        super().__init__(project_dir, llm, *args, **kwargs)
        assert batch > 0, "batch size must be greater than 0."
        self.batch = batch

        client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
        self.client = AsyncOpenAI(**client_init_params)

        if self.llm.startswith("gpt-4.5"):
            self.tokenizer = tiktoken.get_encoding("o200k_base")
        else:
            self.tokenizer = tiktoken.encoding_for_model(self.llm)

        max_token_size = MAX_TOKEN_DICT.get(self.llm)
        if max_token_size is None:
            raise ValueError(
                f"Model {self.llm} is not supported. "
                f"Please select a model from {list(MAX_TOKEN_DICT.keys())}."
            )
        self.max_token_size = max_token_size - 7  # because of chat token usage

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self,
        prompts: List[str],
        truncate: bool = True,
        **kwargs,
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        OpenAI generator module.
        Uses the official openai library to generate an answer from the given prompt.
        It returns real token ids and log probs, so use this module when you need them.

        :param prompts: A list of prompts.
        :param llm: A model name for openai.
            Default is gpt-3.5-turbo.
        :param batch: Batch size for the openai api call.
            If you get API limit errors, you should lower the batch size.
            Default is 16.
        :param truncate: Whether to truncate the input prompt.
            Default is True.
        :param api_key: OpenAI API key. You can also set this via the env variable `OPENAI_API_KEY`.
        :param kwargs: The optional parameters for the openai api call `openai.chat.completion`.
            See https://platform.openai.com/docs/api-reference/chat/create for more details.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of the generated texts' token ids.
            The third element is a list of the generated texts' log probs.
        """
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The 'logprobs' parameter has no effect. It is always set to True."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The 'n' parameter has no effect. It is always set to 1.")

        # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
        if truncate:
            prompts = list(
                map(
                    lambda prompt: truncate_by_token(
                        prompt, self.tokenizer, self.max_token_size
                    ),
                    prompts,
                )
            )

        openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
        loop = get_event_loop()
        if self.llm.startswith("o1") or self.llm.startswith("o3"):
            tasks = [
                self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts
            ]
        else:
            tasks = [
                self.get_result(prompt, **openai_chat_params) for prompt in prompts
            ]
        result = loop.run_until_complete(process_batch(tasks, self.batch))
        answer_result = list(map(lambda x: x[0], result))
        token_result = list(map(lambda x: x[1], result))
        logprob_result = list(map(lambda x: x[2], result))
        return answer_result, token_result, logprob_result

    def structured_output(self, prompts: List[str], output_cls, **kwargs):
        supported_models = [
            "gpt-4o-mini-2024-07-18",
            "gpt-4o-2024-08-06",
        ]
        if self.llm not in supported_models:
            raise ValueError(
                f"{self.llm} is not a valid model name for structured output. "
                f"Please select a model from {supported_models}."
            )

        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The 'logprobs' parameter has no effect. It is always set to False."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The 'n' parameter has no effect. It is always set to 1.")

        # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
        prompts = list(
            map(
                lambda prompt: truncate_by_token(
                    prompt, self.tokenizer, self.max_token_size
                ),
                prompts,
            )
        )

        openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs)
        loop = get_event_loop()
        tasks = [
            self.get_structured_result(prompt, output_cls, **openai_chat_params)
            for prompt in prompts
        ]
        result = loop.run_until_complete(process_batch(tasks, self.batch))
        return result

    async def astream(self, prompt: str, **kwargs):
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The 'logprobs' parameter has no effect. It is always set to False."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The 'n' parameter has no effect. It is always set to 1.")

        prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size)

        openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)

        stream = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=False,
            n=1,
            stream=True,
            **openai_chat_params,
        )
        result = ""
        async for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                result += chunk.choices[0].delta.content
                yield result

    def stream(self, prompt: str, **kwargs):
        raise NotImplementedError("stream method is not implemented yet.")

    async def get_structured_result(self, prompt: str, output_cls, **kwargs):
        logprobs = True
        if self.llm.startswith("gpt-4.5"):
            logprobs = False
        response = await self.client.beta.chat.completions.parse(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            response_format=output_cls,
            logprobs=logprobs,
            n=1,
            **kwargs,
        )
        return response.choices[0].message.parsed

    async def get_result(self, prompt: str, **kwargs):
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        logprobs = True
        if self.llm.startswith("gpt-4.5"):
            logprobs = False
        response = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=logprobs,
            n=1,
            **kwargs,
        )
        choice = response.choices[0]
        answer = choice.message.content
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        if self.llm.startswith("gpt-4.5"):
            tokens = self.tokenizer.encode(answer, allowed_special="all")
            logprobs = [0.5] * len(tokens)
            logger.warning("gpt-4.5-preview does not support logprobs yet.")
        else:
            logprobs = list(map(lambda x: x.logprob, choice.logprobs.content))
            tokens = list(
                map(
                    lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0],
                    choice.logprobs.content,
                )
            )
        assert len(tokens) == len(
            logprobs
        ), "tokens and logprobs have different lengths."
        return answer, tokens, logprobs

    async def get_result_o1(self, prompt: str, **kwargs):
        assert self.llm.startswith("o1") or self.llm.startswith(
            "o3"
        ), "This function only supports o1 or o3 models."
        # The o1 models only support the default temperature of 1.
        # See https://platform.openai.com/docs/guides/reasoning about beta limitations of o1 models.
        kwargs["temperature"] = 1
        kwargs["top_p"] = 1
        kwargs["presence_penalty"] = 0
        kwargs["frequency_penalty"] = 0
        response = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=False,
            n=1,
            **kwargs,
        )
        answer = response.choices[0].message.content
        tokens = self.tokenizer.encode(answer, allowed_special="all")
        pseudo_log_probs = [0.5] * len(tokens)
        return answer, tokens, pseudo_log_probs


def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int):
    tokens = tokenizer.encode(prompt, allowed_special="all")
    return tokenizer.decode(tokens[:max_token_size])
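A small sanity check of the truncate_by_token helper defined at the bottom of this file (a sketch, not from the commit):

import tiktoken

from autorag.nodes.generator.openai_llm import truncate_by_token

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
long_prompt = "answer this question " * 5000
short_prompt = truncate_by_token(long_prompt, encoding, max_token_size=100)
# Decoding only the first 100 token ids can never yield more than 100 tokens.
assert len(encoding.encode(short_prompt, allowed_special="all")) <= 100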
144  autorag/nodes/generator/run.py  Normal file
@@ -0,0 +1,144 @@
import os
import pathlib
from typing import List, Dict, Union

import pandas as pd

from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list


def run_generator_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    """
    Run evaluation and select the best module among generator node results,
    and save the results and summary to the generator node directory.

    :param modules: Generator modules to run.
    :param module_params: Generator module parameters.
        Including node parameters, which are used for every module in this node.
    :param previous_result: Previous result dataframe.
        Could be the prompt maker node's result.
    :param node_line_dir: This node line's directory.
    :param strategies: Strategies for the generator node.
    :return: The best result dataframe.
        It contains the previous result columns and the generator node's result columns.
    """
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent
    node_dir = os.path.join(node_line_dir, "generator")  # node name
    if not os.path.exists(node_dir):
        os.makedirs(node_dir)
    qa_data = pd.read_parquet(
        os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
    )
    if "generation_gt" not in qa_data.columns:
        raise ValueError("You must have 'generation_gt' column in qa.parquet.")

    results, execution_times = zip(
        *map(
            lambda x: measure_speed(
                x[0].run_evaluator,
                project_dir=project_dir,
                previous_result=previous_result,
                **x[1],
            ),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    # get average token usage
    token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results))

    # make rows to metric_inputs
    generation_gt = to_list(qa_data["generation_gt"].tolist())

    metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]

    metric_names, metric_params = cast_metrics(strategies.get("metrics"))
    if metric_names is None or len(metric_names) <= 0:
        raise ValueError("You must provide at least one metric for generator evaluation.")
    results = list(
        map(
            lambda result: evaluate_generator_node(
                result, metric_inputs, strategies.get("metrics")
            ),
            results,
        )
    )

    # save results to folder
    filepaths = list(
        map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(
        map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
    )  # execute save to parquet
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
            "average_output_token": token_usages,
            **{
                metric: list(map(lambda x: x[metric].mean(), results))
                for metric in metric_names
            },
        }
    )

    # filter by strategies
    if strategies.get("speed_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, average_times, strategies["speed_threshold"], filenames
        )
    if strategies.get("token_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, token_usages, strategies["token_threshold"], filenames
        )
    selected_result, selected_filename = select_best(
        results, metric_names, filenames, strategies.get("strategy", "mean")
    )
    best_result = pd.concat([previous_result, selected_result], axis=1)

    # add 'is_best' column to the summary file
    summary_df["is_best"] = summary_df["filename"] == selected_filename

    # save files
    summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
    best_result.to_parquet(
        os.path.join(
            node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
        ),
        index=False,
    )
    return best_result


def evaluate_generator_node(
    result_df: pd.DataFrame,
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
):
    @evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
    def evaluate_generation_module(df: pd.DataFrame):
        return (
            df["generated_texts"].tolist(),
            df["generated_tokens"].tolist(),
            df["generated_log_probs"].tolist(),
        )

    return evaluate_generation_module(result_df)
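For context, a hedged sketch of how this runner is typically invoked; the metric names, file paths, and directory layout below are assumptions about the surrounding project, not part of this commit:

import pandas as pd

from autorag.nodes.generator import OpenAILLM
from autorag.nodes.generator.run import run_generator_node

# The prompt maker output; it must contain a "prompts" column.
previous_result = pd.read_parquet("path/to/prompt_maker_result.parquet")
best_df = run_generator_node(
    modules=[OpenAILLM],
    module_params=[{"llm": "gpt-4o-mini", "temperature": 0.2}],
    previous_result=previous_result,
    node_line_dir="./project/0/post_retrieve_node_line",
    strategies={"metrics": ["bleu", "rouge"], "speed_threshold": 30},
)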
121  autorag/nodes/generator/vllm.py  Normal file
@@ -0,0 +1,121 @@
import gc
from copy import deepcopy
from typing import List, Tuple

import pandas as pd

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils import result_to_dataframe
from autorag.utils.util import pop_params, to_list


class Vllm(BaseGenerator):
    def __init__(self, project_dir: str, llm: str, **kwargs):
        super().__init__(project_dir, llm, **kwargs)
        try:
            from vllm import SamplingParams, LLM
        except ImportError:
            raise ImportError(
                "Please install vllm library. You can install it by running `pip install vllm`."
            )

        model_from_kwargs = kwargs.pop("model", None)
        model = llm if model_from_kwargs is None else model_from_kwargs

        input_kwargs = deepcopy(kwargs)
        sampling_params_init_params = pop_params(
            SamplingParams.from_optional, input_kwargs
        )
        self.vllm_model = LLM(model, **input_kwargs)

        # drop keys from kwargs that are not sampling params
        kwargs_keys = list(kwargs.keys())
        for key in kwargs_keys:
            if key not in sampling_params_init_params:
                kwargs.pop(key)

    def __del__(self):
        try:
            import torch
            import contextlib

            if torch.cuda.is_available():
                from vllm.distributed.parallel_state import (
                    destroy_model_parallel,
                    destroy_distributed_environment,
                )

                destroy_model_parallel()
                destroy_distributed_environment()
                del self.vllm_model.llm_engine.model_executor
                del self.vllm_model
                with contextlib.suppress(AssertionError):
                    torch.distributed.destroy_process_group()
                gc.collect()
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        except ImportError:
            del self.vllm_model

        super().__del__()

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self, prompts: List[str], **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Vllm module.
        It gets the vLLM instance and returns generated texts for the input prompts.
        You can set logprobs to get the log probs of the generated text.
        Default logprobs is 1.

        :param prompts: A list of prompts.
        :param kwargs: The extra parameters for generating the text.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of the generated texts' token ids.
            The third element is a list of the generated texts' log probs.
        """
        try:
            from vllm.outputs import RequestOutput
            from vllm.sequence import SampleLogprobs
            from vllm import SamplingParams
        except ImportError:
            raise ImportError(
                "Please install vllm library. You can install it by running `pip install vllm`."
            )

        if "logprobs" not in kwargs:
            kwargs["logprobs"] = 1

        sampling_params = pop_params(SamplingParams.from_optional, kwargs)
        generate_params = SamplingParams(**sampling_params)
        results: List[RequestOutput] = self.vllm_model.generate(
            prompts, generate_params
        )
        generated_texts = list(map(lambda x: x.outputs[0].text, results))
        generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results))
        log_probs: List[SampleLogprobs] = list(
            map(lambda x: x.outputs[0].logprobs, results)
        )
        generated_log_probs = list(
            map(
                lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))),
                zip(log_probs, generated_token_ids),
            )
        )
        return (
            to_list(generated_texts),
            to_list(generated_token_ids),
            to_list(generated_log_probs),
        )

    async def astream(self, prompt: str, **kwargs):
        raise NotImplementedError

    def stream(self, prompt: str, **kwargs):
        raise NotImplementedError
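The nested map in _pure above pairs each generated token id with its entry in the per-token logprob dictionaries. A plain-Python sketch of that lookup, using stand-in objects instead of real vLLM types:

class FakeLogprob:
    def __init__(self, logprob: float):
        self.logprob = logprob


# One dict per generated token, keyed by token id, like vLLM's SampleLogprobs.
token_ids = [11, 42]
logprob_dicts = [{11: FakeLogprob(-0.1)}, {42: FakeLogprob(-0.7)}]
extracted = [d[t].logprob for d, t in zip(logprob_dicts, token_ids)]
print(extracted)  # [-0.1, -0.7]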
176  autorag/nodes/generator/vllm_api.py  Normal file
@@ -0,0 +1,176 @@
import logging
from typing import List, Tuple
import time

import pandas as pd
import requests
from asyncio import to_thread

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe

logger = logging.getLogger("AutoRAG")

DEFAULT_MAX_TOKENS = 4096  # Default token limit


class VllmAPI(BaseGenerator):
    def __init__(
        self,
        project_dir,
        llm: str,
        uri: str,
        max_tokens: int = None,
        batch: int = 16,
        *args,
        **kwargs,
    ):
        """
        VLLM API wrapper for the OpenAI-compatible chat/completions format.

        :param project_dir: Project directory.
        :param llm: Model name (e.g., a LLaMA model).
        :param uri: VLLM API server URI.
        :param max_tokens: Maximum token limit.
            Default is 4096.
        :param batch: Request batch size.
            Default is 16.
        """
        super().__init__(project_dir, llm, *args, **kwargs)
        assert batch > 0, "Batch size must be greater than 0."
        self.uri = uri.rstrip("/")  # Set API URI
        self.batch = batch
        # Use the provided max_tokens if available, otherwise use the default
        self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
        self.max_model_len = self.get_max_model_length()
        logger.info(f"{llm} max model length: {self.max_model_len}")

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self, prompts: List[str], truncate: bool = True, **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Call the VLLM API to generate text.

        :param prompts: List of input prompts.
        :param truncate: Whether to truncate input prompts to fit within the token limit.
        :param kwargs: Additional options (e.g., temperature, top_p).
        :return: Generated texts, token lists, and log probability lists.
        """
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The 'logprobs' parameter has no effect. It is always set to True."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The 'n' parameter has no effect. It is always set to 1.")

        if truncate:
            prompts = list(map(lambda p: self.truncate_by_token(p), prompts))
        loop = get_event_loop()
        tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts]
        results = loop.run_until_complete(process_batch(tasks, self.batch))

        answer_result = list(map(lambda x: x[0], results))
        token_result = list(map(lambda x: x[1], results))
        logprob_result = list(map(lambda x: x[2], results))
        return answer_result, token_result, logprob_result

    def truncate_by_token(self, prompt: str) -> str:
        """
        Truncate a prompt to fit within the maximum token limit.
        """
        tokens = self.encoding_for_model(prompt)["tokens"]  # Simple tokenization
        return self.decoding_for_model(tokens[: self.max_model_len])["prompt"]

    def call_vllm_api(self, prompt: str, **kwargs) -> dict:
        """
        Calls the VLLM API to get chat/completions responses.

        :param prompt: Input prompt.
        :param kwargs: Additional API options (e.g., temperature, max_tokens).
        :return: API response.
        """
        payload = {
            "model": self.llm,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": kwargs.get("temperature", 0.4),
            "max_tokens": min(
                kwargs.get("max_tokens", self.max_token_size), self.max_token_size
            ),
            "logprobs": True,
            "n": 1,
        }
        start_time = time.time()  # Record request start time
        response = requests.post(f"{self.uri}/v1/chat/completions", json=payload)
        end_time = time.time()  # Record request end time

        response.raise_for_status()
        elapsed_time = end_time - start_time  # Calculate elapsed time
        logger.info(
            f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds"
        )
        return response.json()

    # Additional method: abstract method implementation
    async def astream(self, prompt: str, **kwargs):
        """
        Asynchronous streaming method, not implemented.
        """
        raise NotImplementedError("astream method is not implemented for VLLM API yet.")

    def stream(self, prompt: str, **kwargs):
        """
        Synchronous streaming method, not implemented.
        """
        raise NotImplementedError("stream method is not implemented for VLLM API yet.")

    def get_result(self, prompt: str, **kwargs):
        response = self.call_vllm_api(prompt, **kwargs)
        choice = response["choices"][0]
        answer = choice["message"]["content"]

        # Handle cases where logprobs is None
        if choice.get("logprobs") and "content" in choice["logprobs"]:
            logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"]))
            tokens = list(
                map(
                    lambda x: self.encoding_for_model(x["token"])["tokens"],
                    choice["logprobs"]["content"],
                )
            )
        else:
            logprobs = []
            tokens = []

        return answer, tokens, logprobs

    def encoding_for_model(self, answer_piece: str):
        payload = {
            "model": self.llm,
            "prompt": answer_piece,
            "add_special_tokens": True,
        }
        response = requests.post(f"{self.uri}/tokenize", json=payload)
        response.raise_for_status()
        return response.json()

    def decoding_for_model(self, tokens: list[int]):
        payload = {
            "model": self.llm,
            "tokens": tokens,
        }
        response = requests.post(f"{self.uri}/detokenize", json=payload)
        response.raise_for_status()
        return response.json()

    def get_max_model_length(self):
        response = requests.get(f"{self.uri}/v1/models")
        response.raise_for_status()
        json_data = response.json()
        return json_data["data"][0]["max_model_len"]
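A usage sketch for the wrapper above; the URI and model name are placeholders for whatever OpenAI-compatible vLLM server you are running (for example, one started with `vllm serve`):

import pandas as pd

from autorag.nodes.generator import VllmAPI

previous_result = pd.DataFrame({"prompts": ["What is retrieval-augmented generation?"]})
module = VllmAPI(
    project_dir="./project",
    llm="meta-llama/Llama-3.1-8B-Instruct",
    uri="http://localhost:8000",
    max_tokens=512,
    batch=4,
)
result_df = module.pure(previous_result, temperature=0.4)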
2  autorag/nodes/passageaugmenter/__init__.py  Normal file
@@ -0,0 +1,2 @@
from .pass_passage_augmenter import PassPassageAugmenter
from .prev_next_augmenter import PrevNextPassageAugmenter
80  autorag/nodes/passageaugmenter/base.py  Normal file
@@ -0,0 +1,80 @@
import abc
import logging
import os

import pandas as pd

from autorag.schema import BaseModule
from autorag.utils import (
    validate_qa_dataset,
    sort_by_scores,
    validate_corpus_dataset,
    cast_corpus_dataset,
)
from autorag.utils.util import select_top_k

logger = logging.getLogger("AutoRAG")


class BasePassageAugmenter(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, *args, **kwargs):
        logger.info(
            f"Initialize passage augmenter node - {self.__class__.__name__} module..."
        )
        data_dir = os.path.join(project_dir, "data")
        corpus_df = pd.read_parquet(
            os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
        )
        validate_corpus_dataset(corpus_df)
        corpus_df = cast_corpus_dataset(corpus_df)
        self.corpus_df = corpus_df

    def __del__(self):
        logger.info(
            f"Deleting passage augmenter node - {self.__class__.__name__} module..."
        )

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(
            f"Running passage augmenter node - {self.__class__.__name__} module..."
        )
        validate_qa_dataset(previous_result)

        # find ids columns
        assert (
            "retrieved_ids" in previous_result.columns
        ), "previous_result must have retrieved_ids column."
        ids = previous_result["retrieved_ids"].tolist()

        return ids

    @staticmethod
    def sort_by_scores(
        augmented_contents,
        augmented_ids,
        augmented_scores,
        top_k: int,
        reverse: bool = True,
    ):
        # sort by scores
        df = pd.DataFrame(
            {
                "contents": augmented_contents,
                "ids": augmented_ids,
                "scores": augmented_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            lambda row: sort_by_scores(row, reverse=reverse),
            axis=1,
            result_type="expand",
        )

        # select by top_k
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)

        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )
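A sketch of what the static sort_by_scores helper above does for a single query, assuming the autorag.utils helpers sort each row descending by score and trim to top_k:

from autorag.nodes.passageaugmenter.base import BasePassageAugmenter

contents = [["passage a", "passage b", "passage c"]]
ids = [["id-a", "id-b", "id-c"]]
scores = [[0.1, 0.9, 0.5]]
top_contents, top_ids, top_scores = BasePassageAugmenter.sort_by_scores(
    contents, ids, scores, top_k=2
)
print(top_ids)  # expected: [["id-b", "id-c"]]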
43  autorag/nodes/passageaugmenter/pass_passage_augmenter.py  Normal file
@@ -0,0 +1,43 @@
from typing import List

import pandas as pd

from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils import result_to_dataframe


class PassPassageAugmenter(BasePassageAugmenter):
    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        """
        Run the passage augmenter node - PassPassageAugmenter module.

        :param previous_result: The previous result Dataframe.
        :param top_k: You must input the top_k value to get the top k results.
        :param kwargs: Not affected.
        :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
        """
        top_k = kwargs.pop("top_k")

        ids = self.cast_to_run(previous_result)
        contents = previous_result["retrieved_contents"].tolist()
        scores = previous_result["retrieve_scores"].tolist()

        augmented_ids, augmented_contents, augmented_scores = self._pure(
            ids, contents, scores
        )
        return self.sort_by_scores(
            augmented_contents, augmented_ids, augmented_scores, top_k
        )

    def _pure(
        self,
        ids_list: List[List[str]],
        contents_list: List[List[str]],
        scores_list: List[List[float]],
    ):
        """
        Do not perform augmentation.
        Return given passages, scores, and ids as is.
        """
        return ids_list, contents_list, scores_list
155  autorag/nodes/passageaugmenter/prev_next_augmenter.py  Normal file
@@ -0,0 +1,155 @@
from typing import List, Union

import numpy as np
import pandas as pd

from autorag.embedding.base import EmbeddingModel
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils.util import (
    filter_dict_keys,
    fetch_contents,
    embedding_query_content,
    result_to_dataframe,
    empty_cuda_cache,
)


class PrevNextPassageAugmenter(BasePassageAugmenter):
    def __init__(
        self,
        project_dir: str,
        embedding_model: Union[str, dict] = "openai",
        *args,
        **kwargs,
    ):
        """
        Initialize the PrevNextPassageAugmenter module.

        :param project_dir: The project directory.
        :param embedding_model: The embedding model name to use for calculating cosine similarity.
            Default is openai (text-embedding-ada-002).
        :param kwargs: The extra parameters.
        """
        super().__init__(project_dir, *args, **kwargs)
        slim_corpus_df = self.corpus_df[["doc_id", "metadata"]]
        slim_corpus_df.loc[:, "metadata"] = slim_corpus_df["metadata"].apply(
            filter_dict_keys, keys=["prev_id", "next_id"]
        )
        self.slim_corpus_df = slim_corpus_df

        # init embedding model
        self.embedding_model = EmbeddingModel.load(embedding_model)()

    def __del__(self):
        del self.embedding_model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        """
        Run the passage augmenter node - PrevNextPassageAugmenter module.

        :param previous_result: The previous result Dataframe.
        :param top_k: You must input the top_k value to get the top k results.
        :param kwargs: Not affected.
        :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
        """
        top_k = kwargs.pop("top_k")

        ids = self.cast_to_run(previous_result)
        # find queries columns
        assert (
            "query" in previous_result.columns
        ), "previous_result must have query column."
        queries = previous_result["query"].tolist()

        mode = kwargs.pop("mode", "both")
        num_passages = kwargs.pop("num_passages", 1)
        augmented_ids = self._pure(ids, num_passages, mode)

        # fetch contents from corpus to use augmented ids
        augmented_contents = fetch_contents(self.corpus_df, augmented_ids)

        query_embeddings, contents_embeddings = embedding_query_content(
            queries, augmented_contents, self.embedding_model, batch=128
        )

        # get scores from calculated cosine similarity
        augmented_scores = [
            np.array(
                [
                    calculate_cosine_similarity(query_embedding, x)
                    for x in content_embeddings
                ]
            ).tolist()
            for query_embedding, content_embeddings in zip(
                query_embeddings, contents_embeddings
            )
        ]
        return self.sort_by_scores(
            augmented_contents, augmented_ids, augmented_scores, top_k
        )

    def _pure(
        self,
        ids_list: List[List[str]],
        num_passages: int = 1,
        mode: str = "both",
    ) -> List[List[str]]:
        """
        Add passages before and/or after the retrieved passage.
        For more information, visit https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PrevNextPostprocessorDemo/.

        :param ids_list: The list of lists of ids retrieved.
        :param num_passages: The number of passages to add before and after the retrieved passage.
            Default is 1.
        :param mode: The mode of augmentation.
            'prev': add passages before the retrieved passage.
            'next': add passages after the retrieved passage.
            'both': add passages before and after the retrieved passage.
            Default is 'both'.
        :return: The list of lists of augmented ids.
        """
        if mode not in ["prev", "next", "both"]:
            raise ValueError(f"mode must be 'prev', 'next', or 'both', but got {mode}")

        augmented_ids = [
            (
                lambda ids: prev_next_augmenter_pure(
                    ids, self.slim_corpus_df, mode, num_passages
                )
            )(ids)
            for ids in ids_list
        ]

        return augmented_ids


def prev_next_augmenter_pure(
    ids: List[str], corpus_df: pd.DataFrame, mode: str, num_passages: int
):
    def fetch_id_sequence(start_id, key):
        sequence = []
        current_id = start_id
        for _ in range(num_passages):
            current_id = (
                corpus_df.loc[corpus_df["doc_id"] == current_id]["metadata"]
                .values[0]
                .get(key)
            )
            if current_id is None:
                break
            sequence.append(current_id)
        return sequence

    augmented_group = []
    for id_ in ids:
        current_ids = [id_]
        if mode in ["prev", "both"]:
            current_ids = fetch_id_sequence(id_, "prev_id")[::-1] + current_ids
        if mode in ["next", "both"]:
            current_ids += fetch_id_sequence(id_, "next_id")
        augmented_group.extend(current_ids)
    return augmented_group
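A tiny, self-contained example of prev_next_augmenter_pure on a three-document corpus (illustrative data only): retrieving "b" with mode="both" and num_passages=1 pulls in its prev ("a") and next ("c") neighbors.

import pandas as pd

from autorag.nodes.passageaugmenter.prev_next_augmenter import prev_next_augmenter_pure

corpus_df = pd.DataFrame(
    {
        "doc_id": ["a", "b", "c"],
        "metadata": [
            {"prev_id": None, "next_id": "b"},
            {"prev_id": "a", "next_id": "c"},
            {"prev_id": "b", "next_id": None},
        ],
    }
)
print(prev_next_augmenter_pure(["b"], corpus_df, mode="both", num_passages=1))
# ['a', 'b', 'c']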
131  autorag/nodes/passageaugmenter/run.py  Normal file
@@ -0,0 +1,131 @@
import logging
import os
import pathlib
from typing import List, Dict

import pandas as pd

from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import apply_recursive, to_list

logger = logging.getLogger("AutoRAG")


def run_passage_augmenter_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent
    qa_df = pd.read_parquet(
        os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
    )
    retrieval_gt = qa_df["retrieval_gt"].tolist()
    retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))

    results, execution_times = zip(
        *map(
            lambda task: measure_speed(
                task[0].run_evaluator,
                project_dir=project_dir,
                previous_result=previous_result,
                **task[1],
            ),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))
    metric_inputs = [
        MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
        for ret_gt, query, gen_gt in zip(
            retrieval_gt,
            previous_result["query"].tolist(),
            previous_result["generation_gt"].tolist(),
        )
    ]

    # run metrics before filtering
    if strategies.get("metrics") is None:
        raise ValueError(
            "You must provide at least one metric for passage_augmenter evaluation."
        )
    results = list(
        map(
            lambda x: evaluate_retrieval_node(
                x,
                metric_inputs,
                strategies.get("metrics"),
            ),
            results,
        )
    )

    # save results to folder
    save_dir = os.path.join(node_line_dir, "passage_augmenter")  # node name
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    filepaths = list(
        map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(
        map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
    )  # execute save to parquet
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
            **{
                f"passage_augmenter_{metric}": list(
                    map(lambda result: result[metric].mean(), results)
                )
                for metric in strategies.get("metrics")
            },
        }
    )

    # filter by strategies
    if strategies.get("speed_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, average_times, strategies["speed_threshold"], filenames
        )
    selected_result, selected_filename = select_best(
        results,
        strategies.get("metrics"),
        filenames,
        strategies.get("strategy", "mean"),
    )
    # change metric name columns to passage_augmenter_metric_name
    selected_result = selected_result.rename(
        columns={
            metric_name: f"passage_augmenter_{metric_name}"
            for metric_name in strategies["metrics"]
        }
    )
    # drop retrieval result columns in previous_result
    previous_result = previous_result.drop(
        columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
    )
    best_result = pd.concat([previous_result, selected_result], axis=1)

    # add 'is_best' column to summary file
    summary_df["is_best"] = summary_df["filename"] == selected_filename

    # save files
    summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
    best_result.to_parquet(
        os.path.join(
            save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
        ),
        index=False,
    )
    return best_result
4  autorag/nodes/passagecompressor/__init__.py  Normal file
@@ -0,0 +1,4 @@
from .longllmlingua import LongLLMLingua
from .pass_compressor import PassCompressor
from .refine import Refine
from .tree_summarize import TreeSummarize
83  autorag/nodes/passagecompressor/base.py  Normal file
@@ -0,0 +1,83 @@
import abc
import logging
from typing import Dict

import pandas as pd
from llama_index.core.llms import LLM

from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe

logger = logging.getLogger("AutoRAG")


class BasePassageCompressor(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, *args, **kwargs):
        logger.info(
            f"Initialize passage compressor node - {self.__class__.__name__} module..."
        )

    def __del__(self):
        logger.info(
            f"Deleting passage compressor node - {self.__class__.__name__} module..."
        )

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(
            f"Running passage compressor node - {self.__class__.__name__} module..."
        )
        assert all(
            [
                column in previous_result.columns
                for column in [
                    "query",
                    "retrieved_contents",
                ]
            ]
        ), "previous_result must have query and retrieved_contents columns."
        assert len(previous_result) > 0, "previous_result must have at least one row."

        queries = previous_result["query"].tolist()
        retrieved_contents = previous_result["retrieved_contents"].tolist()
        return queries, retrieved_contents


class LlamaIndexCompressor(BasePassageCompressor, metaclass=abc.ABCMeta):
    param_list = ["prompt", "chat_prompt", "batch"]

    def __init__(self, project_dir: str, **kwargs):
        """
        Initialize the passage compressor module.

        :param project_dir: The project directory.
        :param llm: The llm name that will be used to summarize.
            Any LlamaIndex LLM model can be used here.
        :param kwargs: Extra parameters for initializing the llm.
        """
        super().__init__(project_dir)
        kwargs_dict = dict(
            filter(lambda x: x[0] not in self.param_list, kwargs.items())
        )
        llm_name = kwargs_dict.pop("llm")
        self.llm: LLM = make_llm(llm_name, kwargs_dict)

    def __del__(self):
        del self.llm
        super().__del__()

    @result_to_dataframe(["retrieved_contents"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries, retrieved_contents = self.cast_to_run(previous_result)
        param_dict = dict(filter(lambda x: x[0] in self.param_list, kwargs.items()))
        result = self._pure(queries, retrieved_contents, **param_dict)
        return list(map(lambda x: [x], result))


def make_llm(llm_name: str, kwargs: Dict) -> LLM:
    if llm_name not in generator_models:
        raise KeyError(
            f"{llm_name} is not supported. "
            "You can add it manually by calling autorag.generator_models."
        )
    return generator_models[llm_name](**kwargs)
115
autorag/nodes/passagecompressor/longllmlingua.py
Normal file
115
autorag/nodes/passagecompressor/longllmlingua.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from typing import List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagecompressor.base import BasePassageCompressor
|
||||
from autorag.utils.util import pop_params, result_to_dataframe, empty_cuda_cache
|
||||
|
||||
|
||||
# TODO: Parallel Processing Refactoring at #460
|
||||
|
||||
|
||||
class LongLLMLingua(BasePassageCompressor):
|
||||
def __init__(
|
||||
self, project_dir: str, model_name: str = "NousResearch/Llama-2-7b-hf", **kwargs
|
||||
):
|
||||
try:
|
||||
from llmlingua import PromptCompressor
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"LongLLMLingua is not installed. Please install it by running `pip install llmlingua`."
|
||||
)
|
||||
|
||||
super().__init__(project_dir)
|
||||
model_init_params = pop_params(PromptCompressor.__init__, kwargs)
|
||||
self.llm_lingua = PromptCompressor(model_name=model_name, **model_init_params)
|
||||
|
||||
def __del__(self):
|
||||
del self.llm_lingua
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, retrieved_contents = self.cast_to_run(previous_result)
|
||||
results = self._pure(queries, retrieved_contents, **kwargs)
|
||||
return list(map(lambda x: [x], results))
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents: List[List[str]],
|
||||
instructions: Optional[str] = None,
|
||||
target_token: int = 300,
|
||||
**kwargs,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Compresses the retrieved texts using LongLLMLingua.
|
||||
For more information, visit https://github.com/microsoft/LLMLingua.
|
||||
|
||||
:param queries: The queries for retrieved passages.
|
||||
:param contents: The contents of retrieved passages.
|
||||
:param model_name: The model name to use for compression.
|
||||
The default is "NousResearch/Llama-2-7b-hf".
|
||||
:param instructions: The instructions for compression.
|
||||
Default is None. When it is None, it will use default instructions.
|
||||
:param target_token: The target token for compression.
|
||||
Default is 300.
|
||||
:param kwargs: Additional keyword arguments.
|
||||
:return: The list of compressed texts.
|
||||
"""
|
||||
if instructions is None:
|
||||
instructions = "Given the context, please answer the final question"
|
||||
results = [
|
||||
llmlingua_pure(
|
||||
query, contents_, self.llm_lingua, instructions, target_token, **kwargs
|
||||
)
|
||||
for query, contents_ in zip(queries, contents)
|
||||
]
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def llmlingua_pure(
|
||||
query: str,
|
||||
contents: List[str],
|
||||
llm_lingua,
|
||||
instructions: str,
|
||||
target_token: int = 300,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""
|
||||
Return the compressed text.
|
||||
|
||||
:param query: The query for retrieved passages.
|
||||
:param contents: The contents of retrieved passages.
|
||||
:param llm_lingua: The llm instance, that will be used to compress.
|
||||
:param instructions: The instructions for compression.
|
||||
:param target_token: The target token for compression.
|
||||
Default is 300.
|
||||
:param kwargs: Additional keyword arguments.
|
||||
:return: The compressed text.
|
||||
"""
|
||||
try:
|
||||
from llmlingua import PromptCompressor
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"LongLLMLingua is not installed. Please install it by running `pip install llmlingua`."
|
||||
)
|
||||
# split by "\n\n" (recommended by LongLLMLingua authors)
|
||||
new_context_texts = [c for context in contents for c in context.split("\n\n")]
|
||||
compress_prompt_params = pop_params(PromptCompressor.compress_prompt, kwargs)
|
||||
compressed_prompt = llm_lingua.compress_prompt(
|
||||
new_context_texts,
|
||||
question=query,
|
||||
instruction=instructions,
|
||||
rank_method="longllmlingua",
|
||||
target_token=target_token,
|
||||
**compress_prompt_params,
|
||||
)
|
||||
compressed_prompt_txt = compressed_prompt["compressed_prompt"]
|
||||
|
||||
# separate out the question and instruction
|
||||
result = "\n\n".join(compressed_prompt_txt.split("\n\n")[1:-1])
|
||||
|
||||
return result
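As a rough usage sketch (not part of the diff), llmlingua_pure can also be called directly once llmlingua is installed; the model name mirrors the module default above, and the query and contents are examples only.

from llmlingua import PromptCompressor

from autorag.nodes.passagecompressor.longllmlingua import llmlingua_pure

compressor = PromptCompressor(model_name="NousResearch/Llama-2-7b-hf")
compressed = llmlingua_pure(
    query="What is the capital of France?",
    contents=["Paris is the capital of France.\n\nIt hosts the Eiffel Tower."],
    llm_lingua=compressor,
    instructions="Given the context, please answer the final question",
    target_token=300,
)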
16
autorag/nodes/passagecompressor/pass_compressor.py
Normal file
16
autorag/nodes/passagecompressor/pass_compressor.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagecompressor.base import BasePassageCompressor
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassCompressor(BasePassageCompressor):
|
||||
@result_to_dataframe(["retrieved_contents"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents = self.cast_to_run(previous_result)
|
||||
return self._pure(contents)
|
||||
|
||||
def _pure(self, contents: List[List[str]]):
|
||||
return contents
|
||||
54
autorag/nodes/passagecompressor/refine.py
Normal file
54
autorag/nodes/passagecompressor/refine.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from llama_index.core import PromptTemplate
|
||||
from llama_index.core.prompts import PromptType
|
||||
from llama_index.core.prompts.utils import is_chat_model
|
||||
from llama_index.core.response_synthesizers import Refine as rf
|
||||
|
||||
from autorag.nodes.passagecompressor.base import LlamaIndexCompressor
|
||||
from autorag.utils.util import get_event_loop, process_batch
|
||||
|
||||
|
||||
class Refine(LlamaIndexCompressor):
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents: List[List[str]],
|
||||
prompt: Optional[str] = None,
|
||||
chat_prompt: Optional[str] = None,
|
||||
batch: int = 16,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Refine a response to a query across text chunks.
|
||||
This function is a wrapper for llama_index.response_synthesizers.Refine.
|
||||
For more information, visit https://docs.llamaindex.ai/en/stable/examples/response_synthesizers/refine/.
|
||||
|
||||
:param queries: The queries for retrieved passages.
|
||||
:param contents: The contents of retrieved passages.
|
||||
:param prompt: The prompt template for refine.
|
||||
If you want to use chat prompt, you should pass chat_prompt instead.
|
||||
In the prompt, you must specify where to put 'context_msg' and 'query_str'.
|
||||
Default is None. When it is None, it will use llama index default prompt.
|
||||
:param chat_prompt: The chat prompt template for refine.
|
||||
If you want to use normal prompt, you should pass prompt instead.
|
||||
In the chat prompt, you must specify where to put 'context_msg' and 'query_str'.
|
||||
Default is None. When it is None, it will use llama index default chat prompt.
|
||||
:param batch: The batch size for llm.
|
||||
Set low if you face some errors.
|
||||
Default is 16.
|
||||
:return: The list of compressed texts.
|
||||
"""
|
||||
if prompt is not None and not is_chat_model(self.llm):
|
||||
refine_template = PromptTemplate(prompt, prompt_type=PromptType.REFINE)
|
||||
elif chat_prompt is not None and is_chat_model(self.llm):
|
||||
refine_template = PromptTemplate(chat_prompt, prompt_type=PromptType.REFINE)
|
||||
else:
|
||||
refine_template = None
|
||||
summarizer = rf(llm=self.llm, refine_template=refine_template, verbose=True)
|
||||
tasks = [
|
||||
summarizer.aget_response(query, content)
|
||||
for query, content in zip(queries, contents)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
return results
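For illustration, a custom refine prompt for this module could look like the sketch below; the wording is hypothetical, and depending on the llama_index version the refine template may also expect an '{existing_answer}' placeholder.

# Hypothetical prompt; '{context_msg}' and '{query_str}' are the placeholders
# required by the docstring above.
refine_prompt = (
    "The existing answer may be incomplete.\n"
    "Refine it with this additional context: {context_msg}\n"
    "Question: {query_str}\n"
    "Refined answer:"
)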
186
autorag/nodes/passagecompressor/run.py
Normal file
186
autorag/nodes/passagecompressor/run.py
Normal file
@@ -0,0 +1,186 @@
|
||||
import os.path
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.evaluation.metric import (
|
||||
retrieval_token_recall,
|
||||
retrieval_token_precision,
|
||||
retrieval_token_f1,
|
||||
)
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.utils.util import fetch_contents
|
||||
|
||||
|
||||
def run_passage_compressor_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among passage compressor modules.
|
||||
|
||||
:param modules: Passage compressor modules to run.
|
||||
:param module_params: Passage compressor module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
Could be retrieval, reranker modules result.
|
||||
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for passage compressor node.
|
||||
In this node, we use token-based metrics: 'retrieval_token_f1', 'retrieval_token_precision', and 'retrieval_token_recall'.
You can skip evaluation when you use only one module and a module parameter.
|
||||
:return: The best result dataframe with previous result columns.
|
||||
This node replaces 'retrieved_contents' with the compressed passages, so each row's retrieved contents will have length one.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
data_dir = os.path.join(project_dir, "data")
|
||||
save_dir = os.path.join(node_line_dir, "passage_compressor")
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
|
||||
# make retrieval contents gt
|
||||
qa_data = pd.read_parquet(os.path.join(data_dir, "qa.parquet"), engine="pyarrow")
|
||||
corpus_data = pd.read_parquet(
|
||||
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
# check qa_data have retrieval_gt
|
||||
assert all(
|
||||
len(x[0]) > 0 for x in qa_data["retrieval_gt"].tolist()
|
||||
), "Can't use passage compressor if you don't have retrieval gt values in QA dataset."
|
||||
|
||||
# run modules
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
results = list(results)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
retrieval_gt_contents = list(
|
||||
map(lambda x: fetch_contents(corpus_data, x), qa_data["retrieval_gt"].tolist())
|
||||
)
|
||||
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt_contents=ret_cont_gt)
|
||||
for ret_cont_gt in retrieval_gt_contents
|
||||
]
|
||||
|
||||
# run metrics before filtering
|
||||
if strategies.get("metrics") is None:
|
||||
raise ValueError(
|
||||
"You must at least one metrics for retrieval contents evaluation."
|
||||
"It can be 'retrieval_token_f1', 'retrieval_token_precision', 'retrieval_token_recall'."
|
||||
)
|
||||
results = list(
|
||||
map(
|
||||
lambda x: evaluate_passage_compressor_node(
|
||||
x, metric_inputs, strategies.get("metrics")
|
||||
),
|
||||
results,
|
||||
)
|
||||
)
|
||||
|
||||
# save results to folder
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
# make summary file
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
**{
|
||||
f"passage_compressor_{metric}": list(
|
||||
map(lambda result: result[metric].mean(), results)
|
||||
)
|
||||
for metric in strategies.get("metrics")
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# filter by strategies
|
||||
if strategies.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, strategies["speed_threshold"], filenames
|
||||
)
|
||||
selected_result, selected_filename = select_best(
|
||||
results,
|
||||
strategies.get("metrics"),
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
new_retrieved_contents = selected_result["retrieved_contents"]
|
||||
previous_result["retrieved_contents"] = new_retrieved_contents
|
||||
selected_result = selected_result.drop(columns=["retrieved_contents"])
|
||||
best_result = pd.concat([previous_result, selected_result], axis=1)
|
||||
|
||||
# add 'is_best' column to summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == selected_filename
|
||||
|
||||
# add prefix 'passage_compressor' to best_result columns
|
||||
best_result = best_result.rename(
|
||||
columns={
|
||||
metric_name: f"passage_compressor_{metric_name}"
|
||||
for metric_name in strategies.get("metrics")
|
||||
}
|
||||
)
|
||||
|
||||
# save the result files
|
||||
best_result.to_parquet(
|
||||
os.path.join(
|
||||
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
|
||||
),
|
||||
index=False,
|
||||
)
|
||||
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
return best_result
|
||||
|
||||
|
||||
def evaluate_passage_compressor_node(
|
||||
result_df: pd.DataFrame, metric_inputs: List[MetricInput], metrics: List[str]
|
||||
):
|
||||
metric_funcs = {
|
||||
retrieval_token_recall.__name__: retrieval_token_recall,
|
||||
retrieval_token_precision.__name__: retrieval_token_precision,
|
||||
retrieval_token_f1.__name__: retrieval_token_f1,
|
||||
}
|
||||
for metric_input, retrieved_contents in zip(
metric_inputs, result_df["retrieved_contents"].tolist()
):
metric_input.retrieved_contents = retrieved_contents
|
||||
metrics = list(filter(lambda x: x in metric_funcs.keys(), metrics))
|
||||
if len(metrics) <= 0:
|
||||
raise ValueError(f"metrics must be one of {metric_funcs.keys()}")
|
||||
metrics_scores = dict(
|
||||
map(
|
||||
lambda metric: (
|
||||
metric,
|
||||
metric_funcs[metric](
|
||||
metric_inputs=metric_inputs,
|
||||
),
|
||||
),
|
||||
metrics,
|
||||
)
|
||||
)
|
||||
result_df = pd.concat([result_df, pd.DataFrame(metrics_scores)], axis=1)
|
||||
return result_df
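For reference, a strategies dict accepted by run_passage_compressor_node might look like the sketch below; the metric names match the functions imported above, while the threshold values are arbitrary examples.

strategies = {
    "metrics": [
        "retrieval_token_f1",
        "retrieval_token_precision",
        "retrieval_token_recall",
    ],
    "speed_threshold": 10,  # optional: filter out modules slower than this on average
    "strategy": "mean",  # how select_best aggregates the metric columns
}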
56
autorag/nodes/passagecompressor/tree_summarize.py
Normal file
56
autorag/nodes/passagecompressor/tree_summarize.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from typing import List, Optional
|
||||
|
||||
from llama_index.core import PromptTemplate
|
||||
from llama_index.core.prompts import PromptType
|
||||
from llama_index.core.prompts.utils import is_chat_model
|
||||
from llama_index.core.response_synthesizers import TreeSummarize as ts
|
||||
|
||||
from autorag.nodes.passagecompressor.base import LlamaIndexCompressor
|
||||
from autorag.utils.util import get_event_loop, process_batch
|
||||
|
||||
|
||||
class TreeSummarize(LlamaIndexCompressor):
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents: List[List[str]],
|
||||
prompt: Optional[str] = None,
|
||||
chat_prompt: Optional[str] = None,
|
||||
batch: int = 16,
|
||||
) -> List[str]:
|
||||
"""
|
||||
Recursively merge retrieved texts and summarize them in a bottom-up fashion.
|
||||
This function is a wrapper for llama_index.response_synthesizers.TreeSummarize.
|
||||
For more information, visit https://docs.llamaindex.ai/en/latest/examples/response_synthesizers/tree_summarize.html.
|
||||
|
||||
:param queries: The queries for retrieved passages.
|
||||
:param contents: The contents of retrieved passages.
|
||||
:param prompt: The prompt template for summarization.
|
||||
If you want to use chat prompt, you should pass chat_prompt instead.
|
||||
In the prompt, you must specify where to put 'context_str' and 'query_str'.
|
||||
Default is None. When it is None, it will use llama index default prompt.
|
||||
:param chat_prompt: The chat prompt template for summarization.
|
||||
If you want to use normal prompt, you should pass prompt instead.
|
||||
In the chat prompt, you must specify where to put 'context_str' and 'query_str'.
|
||||
Default is None. When it is None, it will use llama index default chat prompt.
|
||||
:param batch: The batch size for llm.
|
||||
Set low if you face some errors.
|
||||
Default is 16.
|
||||
:return: The list of compressed texts.
|
||||
"""
|
||||
if prompt is not None and not is_chat_model(self.llm):
|
||||
summary_template = PromptTemplate(prompt, prompt_type=PromptType.SUMMARY)
|
||||
elif chat_prompt is not None and is_chat_model(self.llm):
|
||||
summary_template = PromptTemplate(
|
||||
chat_prompt, prompt_type=PromptType.SUMMARY
|
||||
)
|
||||
else:
|
||||
summary_template = None
|
||||
summarizer = ts(llm=self.llm, summary_template=summary_template, use_async=True)
|
||||
tasks = [
|
||||
summarizer.aget_response(query, content)
|
||||
for query, content in zip(queries, contents)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
return results
|
||||
6
autorag/nodes/passagefilter/__init__.py
Normal file
6
autorag/nodes/passagefilter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from .pass_passage_filter import PassPassageFilter
from .percentile_cutoff import PercentileCutoff
from .recency import RecencyFilter
from .similarity_percentile_cutoff import SimilarityPercentileCutoff
from .similarity_threshold_cutoff import SimilarityThresholdCutoff
from .threshold_cutoff import ThresholdCutoff
50
autorag/nodes/passagefilter/base.py
Normal file
50
autorag/nodes/passagefilter/base.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import abc
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.schema.base import BaseModule
|
||||
from autorag.utils import validate_qa_dataset
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BasePassageFilter(BaseModule, metaclass=abc.ABCMeta):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
logger.info(f"Initialize passage filter node - {self.__class__.__name__}")
|
||||
|
||||
def __del__(self):
|
||||
logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.")
|
||||
|
||||
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
logger.info(
|
||||
f"Running passage filter node - {self.__class__.__name__} module..."
|
||||
)
|
||||
validate_qa_dataset(previous_result)
|
||||
|
||||
# find queries columns
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
queries = previous_result["query"].tolist()
|
||||
|
||||
# find contents_list columns
|
||||
assert (
|
||||
"retrieved_contents" in previous_result.columns
|
||||
), "previous_result must have retrieved_contents column."
|
||||
contents = previous_result["retrieved_contents"].tolist()
|
||||
|
||||
# find scores columns
|
||||
assert (
|
||||
"retrieve_scores" in previous_result.columns
|
||||
), "previous_result must have retrieve_scores column."
|
||||
scores = previous_result["retrieve_scores"].tolist()
|
||||
|
||||
# find ids columns
|
||||
assert (
|
||||
"retrieved_ids" in previous_result.columns
|
||||
), "previous_result must have retrieved_ids column."
|
||||
ids = previous_result["retrieved_ids"].tolist()
|
||||
return queries, contents, scores, ids
|
||||
14
autorag/nodes/passagefilter/pass_passage_filter.py
Normal file
14
autorag/nodes/passagefilter/pass_passage_filter.py
Normal file
@@ -0,0 +1,14 @@
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassPassageFilter(BasePassageFilter):
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
return contents, ids, scores
|
||||
|
||||
def _pure(self, *args, **kwargs):
|
||||
pass
|
||||
58
autorag/nodes/passagefilter/percentile_cutoff.py
Normal file
58
autorag/nodes/passagefilter/percentile_cutoff.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.utils.util import sort_by_scores, select_top_k, result_to_dataframe
|
||||
|
||||
|
||||
class PercentileCutoff(BasePassageFilter):
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
return self._pure(queries, contents, scores, ids, *args, **kwargs)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
percentile: float,
|
||||
reverse: bool = False,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Keep only the top passages, where the number kept is the content length times percentile.
This is a filter and does not override scores.
If the content length times percentile is less than 1, only the single highest-scoring content is kept.
|
||||
|
||||
:param queries: The list of queries to use for filtering
|
||||
:param contents_list: The list of lists of contents to filter
|
||||
:param scores_list: The list of lists of scores retrieved
|
||||
:param ids_list: The list of lists of ids retrieved
|
||||
:param percentile: The percentile to cut off
|
||||
:param reverse: If True, the lower the score, the better
|
||||
Default is False.
|
||||
:return: Tuple of lists containing the filtered contents, ids, and scores
|
||||
"""
|
||||
num_top_k = max(1, int(len(scores_list[0]) * percentile))
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": scores_list,
|
||||
}
|
||||
)
|
||||
|
||||
reverse = not reverse
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand", reverse=reverse
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], num_top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
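A quick worked example of the cutoff arithmetic above (the scores and percentile are arbitrary):

scores = [0.9, 0.8, 0.75, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
percentile = 0.3
num_top_k = max(1, int(len(scores) * percentile))  # 10 * 0.3 -> 3 passages kept
assert num_top_k == 3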
105
autorag/nodes/passagefilter/recency.py
Normal file
105
autorag/nodes/passagefilter/recency.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, date
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.utils import fetch_contents, result_to_dataframe
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class RecencyFilter(BasePassageFilter):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
self.corpus_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents, scores, ids = self.cast_to_run(previous_result, *args, **kwargs)
|
||||
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
|
||||
times = [
|
||||
[time["last_modified_datetime"] for time in time_list]
|
||||
for time_list in metadatas
|
||||
]
|
||||
return self._pure(contents, scores, ids, times, *args, **kwargs)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
time_list: List[List[datetime]],
|
||||
threshold_datetime: Union[datetime, date],
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Filter out the contents that are older than the threshold datetime.
If all contents are filtered out, only the most recent content is kept.
If the threshold is not a datetime.datetime or datetime.date object, a ValueError is raised.
|
||||
|
||||
:param contents_list: The list of lists of contents to filter
|
||||
:param scores_list: The list of lists of scores retrieved
|
||||
:param ids_list: The list of lists of ids retrieved
|
||||
:param time_list: The list of lists of datetime retrieved
|
||||
:param threshold_datetime: The threshold to cut off.
|
||||
In recency filter, you have to use the datetime.datetime object or datetime.date object.
|
||||
All you need to do is set the date in your YAML file.
|
||||
For example, you can write "2010-09-09 3:45:06" or "2010-09-09" in the YAML file.
|
||||
:return: Tuple of lists containing the filtered contents, ids, and scores
|
||||
"""
|
||||
if not (
|
||||
isinstance(threshold_datetime, datetime)
|
||||
or isinstance(threshold_datetime, date)
|
||||
):
|
||||
raise ValueError(
|
||||
f"Threshold should be a datetime object, but got {type(threshold_datetime)}"
|
||||
)
|
||||
|
||||
if not isinstance(threshold_datetime, datetime):
|
||||
threshold_datetime = datetime.combine(
|
||||
threshold_datetime, datetime.min.time()
|
||||
)
|
||||
|
||||
time_list = [
|
||||
list(
|
||||
map(
|
||||
lambda t: datetime.combine(t, datetime.min.time())
|
||||
if not isinstance(t, datetime)
|
||||
else t,
|
||||
time,
|
||||
)
|
||||
)
|
||||
for time in time_list
|
||||
]
|
||||
|
||||
def sort_row(contents, scores, ids, time, _datetime_threshold):
|
||||
combined = list(zip(contents, scores, ids, time))
|
||||
combined_filtered = [
|
||||
item for item in combined if item[3] >= _datetime_threshold
|
||||
]
|
||||
|
||||
if combined_filtered:
|
||||
remain_contents, remain_scores, remain_ids, _ = zip(*combined_filtered)
|
||||
else:
|
||||
combined.sort(key=lambda x: x[3], reverse=True)
|
||||
remain_contents, remain_scores, remain_ids, _ = zip(*combined[:1])
|
||||
|
||||
return list(remain_contents), list(remain_ids), list(remain_scores)
|
||||
|
||||
remain_contents_list, remain_ids_list, remain_scores_list = zip(
|
||||
*map(
|
||||
sort_row,
|
||||
contents_list,
|
||||
scores_list,
|
||||
ids_list,
|
||||
time_list,
|
||||
[threshold_datetime] * len(contents_list),
|
||||
)
|
||||
)
|
||||
|
||||
return remain_contents_list, remain_ids_list, remain_scores_list
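A short sketch of the threshold promotion above; the dates are the examples from the docstring.

from datetime import date, datetime

# "2010-09-09" in YAML is parsed as a date and promoted to midnight,
# while "2010-09-09 3:45:06" is already a datetime and used as-is.
threshold = date(2010, 9, 9)
if not isinstance(threshold, datetime):
    threshold = datetime.combine(threshold, datetime.min.time())
assert threshold == datetime(2010, 9, 9, 0, 0, 0)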
138
autorag/nodes/passagefilter/run.py
Normal file
138
autorag/nodes/passagefilter/run.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import os
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.run import evaluate_retrieval_node
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.utils.util import to_list, apply_recursive
|
||||
|
||||
|
||||
def run_passage_filter_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among passage filter node results.
|
||||
|
||||
:param modules: Passage filter modules to run.
|
||||
:param module_params: Passage filter module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
Could be retrieval, reranker, passage filter modules result.
|
||||
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for passage filter node.
|
||||
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
|
||||
You can skip evaluation when you use only one module and a module parameter.
|
||||
:return: The best result dataframe with previous result columns.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
qa_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)
|
||||
retrieval_gt = qa_df["retrieval_gt"].tolist()
|
||||
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
|
||||
|
||||
# make rows to metric_inputs
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
|
||||
for ret_gt, query, gen_gt in zip(
|
||||
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
|
||||
)
|
||||
]
|
||||
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
# run metrics before filtering
|
||||
if strategies.get("metrics") is None:
|
||||
raise ValueError("You must at least one metrics for passage_filter evaluation.")
|
||||
results = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
metric_inputs,
|
||||
strategies.get("metrics"),
|
||||
),
|
||||
results,
|
||||
)
|
||||
)
|
||||
|
||||
# save results to folder
|
||||
save_dir = os.path.join(node_line_dir, "passage_filter") # node name
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
**{
|
||||
f"passage_filter_{metric}": list(
|
||||
map(lambda result: result[metric].mean(), results)
|
||||
)
|
||||
for metric in strategies.get("metrics")
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# filter by strategies
|
||||
if strategies.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, strategies["speed_threshold"], filenames
|
||||
)
|
||||
selected_result, selected_filename = select_best(
|
||||
results,
|
||||
strategies.get("metrics"),
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
selected_result = selected_result.rename(
|
||||
columns={
|
||||
metric_name: f"passage_filter_{metric_name}"
|
||||
for metric_name in strategies["metrics"]
|
||||
}
|
||||
)
|
||||
previous_result = previous_result.drop(
|
||||
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
best_result = pd.concat([previous_result, selected_result], axis=1)
|
||||
|
||||
# add 'is_best' column to summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == selected_filename
|
||||
|
||||
# save files
|
||||
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
best_result.to_parquet(
|
||||
os.path.join(
|
||||
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
|
||||
),
|
||||
index=False,
|
||||
)
|
||||
return best_result
|
||||
134
autorag/nodes/passagefilter/similarity_percentile_cutoff.py
Normal file
134
autorag/nodes/passagefilter/similarity_percentile_cutoff.py
Normal file
@@ -0,0 +1,134 @@
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.embedding.base import EmbeddingModel
|
||||
from autorag.evaluation.metric.util import calculate_cosine_similarity
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.nodes.passagefilter.similarity_threshold_cutoff import (
|
||||
embedding_query_content,
|
||||
)
|
||||
from autorag.utils import result_to_dataframe
|
||||
from autorag.utils.util import empty_cuda_cache, pop_params
|
||||
|
||||
|
||||
class SimilarityPercentileCutoff(BasePassageFilter):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
"""
|
||||
Initialize the SimilarityPercentileCutoff module
|
||||
|
||||
:param project_dir: The project directory to use for initializing the module
|
||||
:param embedding_model: The embedding model string to use for calculating similarity
|
||||
Default is "openai" which is OpenAI text-embedding-ada-002 embedding model.
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
embedding_model = kwargs.pop("embedding_model", "openai")
|
||||
self.embedding_model = EmbeddingModel.load(embedding_model)()
|
||||
|
||||
def __del__(self):
|
||||
super().__del__()
|
||||
del self.embedding_model
|
||||
|
||||
empty_cuda_cache()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
kwargs = pop_params(self._pure, kwargs)
|
||||
return self._pure(queries, contents, scores, ids, **kwargs)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
percentile: float,
|
||||
batch: int = 128,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Re-calculate each content's similarity with the query and keep only the top passages, where the number kept is the content length times percentile.
This is a filter and does not override scores: the output scores are the original retrieval scores, not the query-content similarities.
If the content length times percentile is less than 1, only the single highest-similarity content is kept.
|
||||
|
||||
:param queries: The list of queries to use for filtering
|
||||
:param contents_list: The list of lists of contents to filter
|
||||
:param scores_list: The list of lists of scores retrieved
|
||||
:param ids_list: The list of lists of ids retrieved
|
||||
:param percentile: The percentile to cut off
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 128.
|
||||
:return: Tuple of lists containing the filtered contents, ids, and scores
|
||||
"""
|
||||
query_embeddings, content_embeddings = embedding_query_content(
|
||||
queries, contents_list, self.embedding_model, batch
|
||||
)
|
||||
|
||||
results = list(
|
||||
map(
|
||||
lambda x: self.__row_pure(x[0], x[1], x[2], x[3], x[4], percentile),
|
||||
zip(
|
||||
query_embeddings,
|
||||
content_embeddings,
|
||||
contents_list,
|
||||
ids_list,
|
||||
scores_list,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
remain_content_list = list(map(lambda x: x[0], results))
|
||||
remain_ids_list = list(map(lambda x: x[1], results))
|
||||
remain_scores_list = list(map(lambda x: x[2], results))
|
||||
|
||||
return remain_content_list, remain_ids_list, remain_scores_list
|
||||
|
||||
@staticmethod
|
||||
def __row_pure(
|
||||
query_embedding: str,
|
||||
content_embeddings: List[List[float]],
|
||||
content_list: List[str],
|
||||
ids_list: List[str],
|
||||
scores_list: List[float],
|
||||
percentile: float,
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
"""
|
||||
Return tuple of lists containing the filtered contents, ids, and scores
|
||||
|
||||
:param query_embedding: Query embedding
|
||||
:param content_embeddings: Each content embedding
|
||||
:param content_list: Each content
|
||||
:param ids_list: Each id
|
||||
:param scores_list: Each score
|
||||
:param percentile: The percentile to cut off
|
||||
:return: Tuple of lists containing the filtered contents, ids, and scores
|
||||
"""
|
||||
num_top_k = int(len(content_embeddings) * percentile)
|
||||
|
||||
if num_top_k == 0:
|
||||
num_top_k = 1
|
||||
|
||||
similarities = np.array(
|
||||
list(
|
||||
map(
|
||||
lambda x: calculate_cosine_similarity(query_embedding, x),
|
||||
content_embeddings,
|
||||
)
|
||||
)
|
||||
).tolist()
|
||||
|
||||
content_id_score_similarity = list(
|
||||
zip(ids_list, content_list, scores_list, similarities)
|
||||
)
|
||||
|
||||
sorted_content_id_score_similarity = sorted(
|
||||
content_id_score_similarity, key=lambda x: x[3], reverse=True
|
||||
)[:num_top_k]
|
||||
|
||||
content_result, id_result, score_result, _ = zip(
|
||||
*sorted_content_id_score_similarity
|
||||
)
|
||||
return list(content_result), list(id_result), list(score_result)
|
||||
112
autorag/nodes/passagefilter/similarity_threshold_cutoff.py
Normal file
112
autorag/nodes/passagefilter/similarity_threshold_cutoff.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.embedding.base import EmbeddingModel
|
||||
from autorag.evaluation.metric.util import calculate_cosine_similarity
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.utils.util import (
|
||||
embedding_query_content,
|
||||
empty_cuda_cache,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
)
|
||||
|
||||
|
||||
class SimilarityThresholdCutoff(BasePassageFilter):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
"""
|
||||
Initialize the SimilarityThresholdCutoff module
|
||||
|
||||
:param project_dir: The project directory to use for initializing the module
|
||||
:param embedding_model: The embedding model string to use for calculating similarity
|
||||
Default is "openai" which is OpenAI text-embedding-ada-002 embedding model.
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
embedding_model = kwargs.get("embedding_model", "openai")
|
||||
self.embedding_model = EmbeddingModel.load(embedding_model)()
|
||||
|
||||
def __del__(self):
|
||||
del self.embedding_model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
kwargs = pop_params(self._pure, kwargs)
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
return self._pure(queries, contents, scores, ids, *args, **kwargs)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
threshold: float,
|
||||
batch: int = 128,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Re-calculate each content's similarity with the query and filter out the contents that are below the threshold.
|
||||
If all contents are filtered out, only the single highest-similarity content is kept.
|
||||
This is a filter and does not override scores.
|
||||
The output of scores is not coming from query-content similarity.
|
||||
|
||||
:param queries: The list of queries to use for filtering
|
||||
:param contents_list: The list of lists of contents to filter
|
||||
:param scores_list: The list of lists of scores retrieved
|
||||
:param ids_list: The list of lists of ids retrieved
|
||||
:param threshold: The threshold to cut off
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 128.
|
||||
:return: Tuple of lists containing the filtered contents, ids, and scores
|
||||
"""
|
||||
query_embeddings, content_embeddings = embedding_query_content(
|
||||
queries, contents_list, self.embedding_model, batch
|
||||
)
|
||||
|
||||
remain_indices = list(
|
||||
map(
|
||||
lambda x: self.__row_pure(x[0], x[1], threshold),
|
||||
zip(query_embeddings, content_embeddings),
|
||||
)
|
||||
)
|
||||
|
||||
remain_content_list = list(
|
||||
map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices)
|
||||
)
|
||||
remain_scores_list = list(
|
||||
map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices)
|
||||
)
|
||||
remain_ids_list = list(
|
||||
map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices)
|
||||
)
|
||||
return remain_content_list, remain_ids_list, remain_scores_list
|
||||
|
||||
@staticmethod
|
||||
def __row_pure(
|
||||
query_embedding: str, content_embeddings: List[List[float]], threshold: float
|
||||
) -> List[int]:
|
||||
"""
|
||||
Return indices that have to remain.
|
||||
Return at least one index if there is nothing to remain.
|
||||
|
||||
:param query_embedding: Query embedding
|
||||
:param content_embeddings: Each content embedding
|
||||
:param threshold: The threshold to cut off
|
||||
:return: Indices to remain at the contents
|
||||
"""
|
||||
|
||||
similarities = np.array(
|
||||
list(
|
||||
map(
|
||||
lambda x: calculate_cosine_similarity(query_embedding, x),
|
||||
content_embeddings,
|
||||
)
|
||||
)
|
||||
)
|
||||
result = np.where(similarities >= threshold)[0].tolist()
|
||||
if len(result) > 0:
|
||||
return result
|
||||
return [np.argmax(similarities)]
|
||||
78
autorag/nodes/passagefilter/threshold_cutoff.py
Normal file
78
autorag/nodes/passagefilter/threshold_cutoff.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagefilter.base import BasePassageFilter
|
||||
from autorag.utils.util import convert_inputs_to_list, result_to_dataframe
|
||||
|
||||
|
||||
class ThresholdCutoff(BasePassageFilter):
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
return self._pure(contents, scores, ids, *args, **kwargs)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
threshold: float,
|
||||
reverse: bool = False,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Filters the contents, scores, and ids based on a previous result's score.
|
||||
Keeps at least one item per query if all scores are below the threshold.
|
||||
|
||||
:param contents_list: List of content strings for each query.
|
||||
:param scores_list: List of scores for each content.
|
||||
:param ids_list: List of ids for each content.
|
||||
:param threshold: The minimum score to keep an item.
|
||||
:param reverse: If True, the lower the score, the better.
|
||||
Default is False.
|
||||
:return: Filtered lists of contents, ids, and scores.
|
||||
"""
|
||||
remain_indices = list(
|
||||
map(lambda x: self.__row_pure(x, threshold, reverse), scores_list)
|
||||
)
|
||||
|
||||
remain_content_list = list(
|
||||
map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices)
|
||||
)
|
||||
remain_scores_list = list(
|
||||
map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices)
|
||||
)
|
||||
remain_ids_list = list(
|
||||
map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices)
|
||||
)
|
||||
|
||||
return remain_content_list, remain_ids_list, remain_scores_list
|
||||
|
||||
@convert_inputs_to_list
|
||||
def __row_pure(
|
||||
self, scores_list: List[float], threshold: float, reverse: bool = False
|
||||
) -> List[int]:
|
||||
"""
|
||||
Return indices that have to remain.
|
||||
Return at least one index if there is nothing to remain.
|
||||
|
||||
:param scores_list: Each score
|
||||
:param threshold: The threshold to cut off
|
||||
:param reverse: If True, the lower the score, the better
|
||||
Default is False.
|
||||
:return: Indices to remain at the contents
|
||||
"""
|
||||
assert isinstance(scores_list, list), "scores_list must be a list."
|
||||
|
||||
if reverse:
|
||||
remain_indices = [
|
||||
i for i, score in enumerate(scores_list) if score <= threshold
|
||||
]
|
||||
default_index = scores_list.index(min(scores_list))
|
||||
else:
|
||||
remain_indices = [
|
||||
i for i, score in enumerate(scores_list) if score >= threshold
|
||||
]
|
||||
default_index = scores_list.index(max(scores_list))
|
||||
|
||||
return remain_indices if remain_indices else [default_index]
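A tiny example of the fallback logic in __row_pure above (scores and threshold are arbitrary):

scores = [0.2, 0.7, 0.5, 0.1]
threshold = 0.5
# Indices with score >= threshold remain; if none pass, the single best index is kept.
remain = [i for i, s in enumerate(scores) if s >= threshold] or [scores.index(max(scores))]
assert remain == [1, 2]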
18
autorag/nodes/passagereranker/__init__.py
Normal file
18
autorag/nodes/passagereranker/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from .cohere import CohereReranker
|
||||
from .colbert import ColbertReranker
|
||||
from .flag_embedding import FlagEmbeddingReranker
|
||||
from .flag_embedding_llm import FlagEmbeddingLLMReranker
|
||||
from .jina import JinaReranker
|
||||
from .koreranker import KoReranker
|
||||
from .monot5 import MonoT5
|
||||
from .pass_reranker import PassReranker
|
||||
from .rankgpt import RankGPT
|
||||
from .sentence_transformer import SentenceTransformerReranker
|
||||
from .time_reranker import TimeReranker
|
||||
from .upr import Upr
|
||||
from .openvino import OpenVINOReranker
|
||||
from .voyageai import VoyageAIReranker
|
||||
from .mixedbreadai import MixedbreadAIReranker
|
||||
from .flashrank import FlashRankReranker

from .dragonkue2 import DragonKue2  # Added 250313 - 김용연
|
||||
55
autorag/nodes/passagereranker/base.py
Normal file
55
autorag/nodes/passagereranker/base.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import abc
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.schema import BaseModule
|
||||
from autorag.utils import validate_qa_dataset
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BasePassageReranker(BaseModule, metaclass=abc.ABCMeta):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
logger.info(
|
||||
f"Initialize passage reranker node - {self.__class__.__name__} module..."
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
logger.info(
|
||||
f"Deleting passage reranker node - {self.__class__.__name__} module..."
|
||||
)
|
||||
|
||||
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
logger.info(
|
||||
f"Running passage reranker node - {self.__class__.__name__} module..."
|
||||
)
|
||||
validate_qa_dataset(previous_result)
|
||||
|
||||
# find queries columns
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
queries = previous_result["query"].tolist()
|
||||
|
||||
# find contents_list columns
|
||||
assert (
|
||||
"retrieved_contents" in previous_result.columns
|
||||
), "previous_result must have retrieved_contents column."
|
||||
contents = previous_result["retrieved_contents"].tolist()
|
||||
|
||||
# find scores columns
|
||||
assert (
|
||||
"retrieve_scores" in previous_result.columns
|
||||
), "previous_result must have retrieve_scores column."
|
||||
scores = previous_result["retrieve_scores"].tolist()
|
||||
|
||||
# find ids columns
|
||||
assert (
|
||||
"retrieved_ids" in previous_result.columns
|
||||
), "previous_result must have retrieved_ids column."
|
||||
ids = previous_result["retrieved_ids"].tolist()
|
||||
|
||||
return queries, contents, scores, ids
|
||||
119
autorag/nodes/passagereranker/cohere.py
Normal file
119
autorag/nodes/passagereranker/cohere.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
|
||||
import cohere
|
||||
import pandas as pd
|
||||
from cohere import RerankResponseResultsItem
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
|
||||
|
||||
|
||||
class CohereReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
"""
|
||||
Initialize Cohere rerank node.
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param api_key: The API key for Cohere rerank.
|
||||
You can set it in the environment variable COHERE_API_KEY.
|
||||
Or, you can directly set it on the config YAML file using this parameter.
|
||||
Default is env variable "COHERE_API_KEY".
|
||||
:param kwargs: Extra arguments, which are ignored.
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
api_key = kwargs.pop("api_key", None)
|
||||
api_key = os.getenv("COHERE_API_KEY", None) if api_key is None else api_key
|
||||
if api_key is None:
|
||||
api_key = os.getenv("CO_API_KEY", None)
|
||||
if api_key is None:
|
||||
raise KeyError(
|
||||
"Please set the API key for Cohere rerank in the environment variable COHERE_API_KEY "
|
||||
"or directly set it on the config YAML file."
|
||||
)
|
||||
|
||||
self.cohere_client = cohere.AsyncClientV2(api_key=api_key)
|
||||
|
||||
def __del__(self):
|
||||
del self.cohere_client
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
model = kwargs.pop("model", "rerank-v3.5")
|
||||
return self._pure(queries, contents, scores, ids, top_k, batch, model)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
model: str = "rerank-v3.5",
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with Cohere rerank models.
|
||||
You can get the API key from https://cohere.com/rerank and set it in the environment variable COHERE_API_KEY.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param scores_list: The list of lists of scores retrieved from the initial ranking
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:param model: The model name for Cohere rerank.
|
||||
You can choose between "rerank-v3.5", "rerank-english-v3.0", and "rerank-multilingual-v3.0".
|
||||
Default is "rerank-v3.5".
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
# Run async cohere_rerank_pure function
|
||||
tasks = [
|
||||
cohere_rerank_pure(self.cohere_client, model, query, document, ids, top_k)
|
||||
for query, document, ids in zip(queries, contents_list, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
content_result = list(map(lambda x: x[0], results))
|
||||
id_result = list(map(lambda x: x[1], results))
|
||||
score_result = list(map(lambda x: x[2], results))
|
||||
|
||||
return content_result, id_result, score_result
|
||||
|
||||
|
||||
async def cohere_rerank_pure(
|
||||
cohere_client: cohere.AsyncClient,
|
||||
model: str,
|
||||
query: str,
|
||||
documents: List[str],
|
||||
ids: List[str],
|
||||
top_k: int,
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
"""
|
||||
Rerank a list of contents with Cohere rerank models.
|
||||
|
||||
:param cohere_client: The Cohere AsyncClient to use for reranking
|
||||
:param model: The model name for Cohere rerank
|
||||
:param query: The query to use for reranking
|
||||
:param documents: The list of contents to rerank
|
||||
:param ids: The list of ids corresponding to the documents
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
rerank_results = await cohere_client.rerank(
|
||||
model=model,
|
||||
query=query,
|
||||
documents=documents,
|
||||
top_n=top_k,
|
||||
return_documents=False,
|
||||
)
|
||||
results: List[RerankResponseResultsItem] = rerank_results.results
|
||||
reranked_scores: List[float] = list(map(lambda x: x.relevance_score, results))
|
||||
indices = list(map(lambda x: x.index, results))
|
||||
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
|
||||
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
|
||||
return reranked_contents, reranked_ids, reranked_scores
|
||||
213
autorag/nodes/passagereranker/colbert.py
Normal file
213
autorag/nodes/passagereranker/colbert.py
Normal file
@@ -0,0 +1,213 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
flatten_apply,
|
||||
sort_by_scores,
|
||||
select_top_k,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class ColbertReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model_name: str = "colbert-ir/colbertv2.0",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize a colbert rerank model for reranking.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param model_name: The model name for Colbert rerank.
|
||||
You can choose a colbert model for reranking.
|
||||
The default is "colbert-ir/colbertv2.0".
|
||||
:param kwargs: Extra parameter for the model.
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
from transformers import AutoModel, AutoTokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
|
||||
)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model_params = pop_params(AutoModel.from_pretrained, kwargs)
|
||||
self.model = AutoModel.from_pretrained(model_name, **model_params).to(
|
||||
self.device
|
||||
)
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with Colbert rerank models.
|
||||
You can get more information about a Colbert model at https://huggingface.co/colbert-ir/colbertv2.0.
|
||||
It uses a BERT-based model, so we recommend using a CUDA GPU for faster reranking.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 64.
|
||||
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
|
||||
# get query and content embeddings
|
||||
query_embedding_list = get_colbert_embedding_batch(
|
||||
queries, self.model, self.tokenizer, batch
|
||||
)
|
||||
content_embedding_list = flatten_apply(
|
||||
get_colbert_embedding_batch,
|
||||
contents_list,
|
||||
model=self.model,
|
||||
tokenizer=self.tokenizer,
|
||||
batch_size=batch,
|
||||
)
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"ids": ids_list,
|
||||
"query_embedding": query_embedding_list,
|
||||
"contents": contents_list,
|
||||
"content_embedding": content_embedding_list,
|
||||
}
|
||||
)
|
||||
temp_df = df.explode("content_embedding")
|
||||
temp_df["score"] = temp_df.apply(
|
||||
lambda x: get_colbert_score(x["query_embedding"], x["content_embedding"]),
|
||||
axis=1,
|
||||
)
|
||||
df["scores"] = (
|
||||
temp_df.groupby(level=0, sort=False)["score"].apply(list).tolist()
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def get_colbert_embedding_batch(
|
||||
input_strings: List[str], model, tokenizer, batch_size: int
|
||||
) -> List[np.array]:
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
|
||||
)
|
||||
encoding = tokenizer(
|
||||
input_strings,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=model.config.max_position_embeddings,
|
||||
)
|
||||
|
||||
input_batches = slice_tokenizer_result(encoding, batch_size)
|
||||
result_embedding = []
|
||||
with torch.no_grad():
|
||||
for encoding_batch in input_batches:
|
||||
result_embedding.append(model(**encoding_batch).last_hidden_state)
|
||||
total_tensor = torch.cat(
|
||||
result_embedding, dim=0
|
||||
) # shape [batch_size, token_length, embedding_dim]
|
||||
tensor_results = list(total_tensor.chunk(total_tensor.size()[0]))
|
||||
|
||||
if torch.cuda.is_available():
|
||||
return list(map(lambda x: x.detach().cpu().numpy(), tensor_results))
|
||||
else:
|
||||
return list(map(lambda x: x.detach().numpy(), tensor_results))
|
||||
|
||||
|
||||
def slice_tokenizer_result(tokenizer_output, batch_size):
|
||||
input_ids_batches = slice_tensor(tokenizer_output["input_ids"], batch_size)
|
||||
attention_mask_batches = slice_tensor(
|
||||
tokenizer_output["attention_mask"], batch_size
|
||||
)
|
||||
token_type_ids_batches = slice_tensor(
|
||||
tokenizer_output.get("token_type_ids", None), batch_size
|
||||
)
|
||||
return [
|
||||
{
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"token_type_ids": token_type_ids,
|
||||
}
|
||||
for input_ids, attention_mask, token_type_ids in zip(
|
||||
input_ids_batches, attention_mask_batches, token_type_ids_batches
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def slice_tensor(input_tensor, batch_size):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Pytorch is not installed. Please install pytorch to use Colbert reranker."
|
||||
)
|
||||
# Calculate the number of full batches
|
||||
num_full_batches = input_tensor.size(0) // batch_size
|
||||
|
||||
# Slice the tensor into batches
|
||||
tensor_list = [
|
||||
input_tensor[i * batch_size : (i + 1) * batch_size]
|
||||
for i in range(num_full_batches)
|
||||
]
|
||||
|
||||
# Handle the last batch if it's smaller than batch_size
|
||||
remainder = input_tensor.size(0) % batch_size
|
||||
if remainder:
|
||||
tensor_list.append(input_tensor[-remainder:])
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
tensor_list = list(map(lambda x: x.to(device), tensor_list))
|
||||
|
||||
return tensor_list
|
||||
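# Illustrative sketch (not part of the original file), assuming torch is installed:
# slice_tensor splits along dim 0 into full batches plus a smaller remainder batch,
# moving each chunk to the GPU when one is available.
if __name__ == "__main__":
    import torch

    _batches = slice_tensor(torch.arange(10).unsqueeze(1), batch_size=4)
    # Two full batches of 4 rows and a remainder batch of 2 rows.
    assert [b.shape[0] for b in _batches] == [4, 4, 2]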
|
||||
|
||||
def get_colbert_score(query_embedding: np.array, content_embedding: np.array) -> float:
|
||||
if query_embedding.ndim == 3 and content_embedding.ndim == 3:
|
||||
query_embedding = query_embedding.reshape(-1, query_embedding.shape[-1])
|
||||
content_embedding = content_embedding.reshape(-1, content_embedding.shape[-1])
|
||||
|
||||
sim_matrix = np.dot(query_embedding, content_embedding.T) / (
|
||||
np.linalg.norm(query_embedding, axis=1)[:, np.newaxis]
|
||||
* np.linalg.norm(content_embedding, axis=1)
|
||||
)
|
||||
max_sim_scores = np.max(sim_matrix, axis=1)
|
||||
return float(np.mean(max_sim_scores))
|
||||
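# Illustrative sketch (not part of the original file): the score above is the
# ColBERT-style MaxSim -- for each query token take the maximum cosine similarity
# over all content tokens, then average over the query tokens.
if __name__ == "__main__":
    import numpy as np

    _query_emb = np.array([[1.0, 0.0], [0.0, 1.0]])    # 2 query token embeddings
    _content_emb = np.array([[1.0, 0.0], [1.0, 1.0]])  # 2 content token embeddings
    print(get_colbert_score(_query_emb, _content_emb))  # mean of per-token maxima, ~0.85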
138
autorag/nodes/passagereranker/dragonkue2.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# 250313 Added reranker module_type - 김용연
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class DragonKue2(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
except ImportError:
|
||||
raise ImportError("For using dragonkue2, please install torch first.")
|
||||
|
||||
model_path = "dragonkue/bge-reranker-v2-m3-ko"
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
|
||||
self.model.eval()
|
||||
# Determine the device to run the model on (GPU if available, otherwise CPU)
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.model.to(self.device)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using dragonkue/bge-reranker-v2-m3-ko.
bge-reranker-v2-m3-ko is a Korean-language reranker (https://huggingface.co/dragonkue/bge-reranker-v2-m3-ko).
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 64.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
scores_nps = flatten_apply(
|
||||
dragonku2_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
rerank_scores = list(
|
||||
map(
|
||||
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
|
||||
)
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def dragonku2_run_model(input_texts, model, tokenizer, device, batch_size: int):  # 250313 added - 김용연
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("For using drangonku2, please install torch first.")
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
inputs = tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
max_length=512,
|
||||
)
|
||||
inputs = inputs.to(device)
|
||||
with torch.no_grad():
|
||||
scores = (
|
||||
model(**inputs, return_dict=True)
|
||||
.logits.view(
|
||||
-1,
|
||||
)
|
||||
.float()
|
||||
)
|
||||
scores_np = scores.cpu().numpy()
|
||||
results.extend(scores_np)
|
||||
return results
|
||||
|
||||
|
||||
def exp_normalize(x):
|
||||
b = x.max()
|
||||
y = np.exp(x - b)
|
||||
return y / y.sum()
|
||||
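# Illustrative sketch (not part of the original file): exp_normalize is a
# numerically stable softmax over the raw cross-encoder logits, so the scores
# for one query sum to 1.0.
if __name__ == "__main__":
    _probs = exp_normalize(np.array([2.0, 0.5, -1.0]))
    print(_probs, _probs.sum())  # largest logit gets the largest share; sum is 1.0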
112
autorag/nodes/passagereranker/flag_embedding.py
Normal file
@@ -0,0 +1,112 @@
|
||||
from typing import List, Tuple, Iterable
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class FlagEmbeddingReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self, project_dir, model_name: str = "BAAI/bge-reranker-large", *args, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the FlagEmbeddingReranker module.
|
||||
|
||||
:param project_dir: The project directory.
|
||||
:param model_name: The name of the BAAI Reranker normal-model name.
|
||||
Default is "BAAI/bge-reranker-large"
|
||||
:param kwargs: Extra parameter for FlagEmbedding.FlagReranker
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
from FlagEmbedding import FlagReranker
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"FlagEmbeddingReranker requires the 'FlagEmbedding' package to be installed."
|
||||
)
|
||||
model_params = pop_params(FlagReranker.__init__, kwargs)
|
||||
model_params.pop("model_name_or_path", None)
|
||||
self.model = FlagReranker(model_name_or_path=model_name, **model_params)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using BAAI normal-Reranker model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 64.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
rerank_scores = flatten_apply(
|
||||
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def flag_embedding_run_model(input_texts, model, batch_size: int):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("FlagEmbeddingReranker requires PyTorch to be installed.")
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
with torch.no_grad():
|
||||
pred_scores = model.compute_score(sentence_pairs=batch_texts)
|
||||
if batch_size == 1 or not isinstance(pred_scores, Iterable):
|
||||
results.append(pred_scores)
|
||||
else:
|
||||
results.extend(pred_scores)
|
||||
return results
|
||||
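# Illustrative usage sketch (not part of the original file), assuming the
# FlagEmbedding package is installed: compute_score takes [query, passage]
# pairs and returns one relevance score per pair, which is exactly what
# flag_embedding_run_model batches above.
if __name__ == "__main__":
    from FlagEmbedding import FlagReranker

    _reranker = FlagReranker(model_name_or_path="BAAI/bge-reranker-large")
    _scores = _reranker.compute_score(
        sentence_pairs=[["what is a panda?", "The giant panda is a bear native to China."]]
    )
    print(_scores)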
101
autorag/nodes/passagereranker/flag_embedding_llm.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model
|
||||
from autorag.utils.util import (
|
||||
flatten_apply,
|
||||
sort_by_scores,
|
||||
select_top_k,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class FlagEmbeddingLLMReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir,
|
||||
model_name: str = "BAAI/bge-reranker-v2-gemma",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the FlagEmbeddingLLMReranker module.
|
||||
|
||||
:param project_dir: The project directory.
|
||||
:param model_name: The name of the BAAI Reranker LLM-based-model name.
|
||||
Default is "BAAI/bge-reranker-v2-gemma"
|
||||
:param kwargs: Extra parameter for FlagEmbedding.FlagReranker
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
from FlagEmbedding import FlagLLMReranker
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"FlagEmbeddingLLMReranker requires the 'FlagEmbedding' package to be installed."
|
||||
)
|
||||
model_params = pop_params(FlagLLMReranker.__init__, kwargs)
|
||||
model_params.pop("model_name_or_path", None)
|
||||
self.model = FlagLLMReranker(model_name_or_path=model_name, **model_params)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using BAAI LLM-based-Reranker model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 64.
|
||||
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
rerank_scores = flatten_apply(
|
||||
flag_embedding_run_model, nested_list, model=self.model, batch_size=batch
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
245
autorag/nodes/passagereranker/flashrank.py
Normal file
@@ -0,0 +1,245 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import zipfile
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
import collections
|
||||
from typing import List, Dict, Tuple
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe
|
||||
from autorag.utils.util import (
|
||||
flatten_apply,
|
||||
sort_by_scores,
|
||||
select_top_k,
|
||||
make_batch,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
model_url = "https://huggingface.co/prithivida/flashrank/resolve/main/{}.zip"
|
||||
|
||||
model_file_map = {
|
||||
"ms-marco-TinyBERT-L-2-v2": "flashrank-TinyBERT-L-2-v2.onnx",
|
||||
"ms-marco-MiniLM-L-12-v2": "flashrank-MiniLM-L-12-v2_Q.onnx",
|
||||
"ms-marco-MultiBERT-L-12": "flashrank-MultiBERT-L12_Q.onnx",
|
||||
"rank-T5-flan": "flashrank-rankt5_Q.onnx",
|
||||
"ce-esci-MiniLM-L12-v2": "flashrank-ce-esci-MiniLM-L12-v2_Q.onnx",
|
||||
"miniReranker_arabic_v1": "miniReranker_arabic_v1.onnx",
|
||||
}
|
||||
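# Illustrative sketch (not part of the original file): the reranker downloads the
# zip built from model_url and then loads the ONNX file listed in model_file_map.
if __name__ == "__main__":
    _name = "ms-marco-TinyBERT-L-2-v2"
    print(model_url.format(_name))
    # -> https://huggingface.co/prithivida/flashrank/resolve/main/ms-marco-TinyBERT-L-2-v2.zip
    print(model_file_map[_name])  # -> flashrank-TinyBERT-L-2-v2.onnx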
|
||||
|
||||
class FlashRankReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self, project_dir: str, model: str = "ms-marco-TinyBERT-L-2-v2", *args, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize FlashRank rerank node.
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param model: The model name for FlashRank rerank.
|
||||
You can get the list of available models from https://github.com/PrithivirajDamodaran/FlashRank.
|
||||
Default is "ms-marco-TinyBERT-L-2-v2".
|
||||
"rank_zephyr_7b_v1_full" is not supported due to a parallel inference issue.
|
||||
:param kwargs: Extra arguments that are not affected
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
from tokenizers import Tokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Tokenizer is not installed. Please install tokenizers to use FlashRank reranker."
|
||||
)
|
||||
|
||||
cache_dir = kwargs.pop("cache_dir", "/tmp")
|
||||
max_length = kwargs.pop("max_length", 512)
|
||||
|
||||
self.cache_dir: Path = Path(cache_dir)
|
||||
self.model_dir: Path = self.cache_dir / model
|
||||
self._prepare_model_dir(model)
|
||||
model_file = model_file_map[model]
|
||||
|
||||
try:
|
||||
import onnxruntime as ort
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"onnxruntime is not installed. Please install onnxruntime to use FlashRank reranker."
|
||||
)
|
||||
|
||||
self.session = ort.InferenceSession(str(self.model_dir / model_file))
|
||||
self.tokenizer: Tokenizer = self._get_tokenizer(max_length)
|
||||
|
||||
def __del__(self):
|
||||
del self.session
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
def _prepare_model_dir(self, model_name: str):
|
||||
if not self.cache_dir.exists():
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if not self.model_dir.exists():
|
||||
self._download_model_files(model_name)
|
||||
|
||||
def _download_model_files(self, model_name: str):
|
||||
local_zip_file = self.cache_dir / f"{model_name}.zip"
|
||||
formatted_model_url = model_url.format(model_name)
|
||||
|
||||
with requests.get(formatted_model_url, stream=True) as r:
|
||||
r.raise_for_status()
|
||||
total_size = int(r.headers.get("content-length", 0))
|
||||
with (
|
||||
open(local_zip_file, "wb") as f,
|
||||
tqdm(
|
||||
desc=local_zip_file.name,
|
||||
total=total_size,
|
||||
unit="iB",
|
||||
unit_scale=True,
|
||||
unit_divisor=1024,
|
||||
) as bar,
|
||||
):
|
||||
for chunk in r.iter_content(chunk_size=8192):
|
||||
size = f.write(chunk)
|
||||
bar.update(size)
|
||||
|
||||
with zipfile.ZipFile(local_zip_file, "r") as zip_ref:
|
||||
zip_ref.extractall(self.cache_dir)
|
||||
os.remove(local_zip_file)
|
||||
|
||||
def _get_tokenizer(self, max_length: int = 512):
|
||||
try:
|
||||
from tokenizers import AddedToken, Tokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Pytorch is not installed. Please install pytorch to use FlashRank reranker."
|
||||
)
|
||||
config = json.load(open(str(self.model_dir / "config.json")))
|
||||
tokenizer_config = json.load(
|
||||
open(str(self.model_dir / "tokenizer_config.json"))
|
||||
)
|
||||
tokens_map = json.load(open(str(self.model_dir / "special_tokens_map.json")))
|
||||
tokenizer = Tokenizer.from_file(str(self.model_dir / "tokenizer.json"))
|
||||
|
||||
tokenizer.enable_truncation(
|
||||
max_length=min(tokenizer_config["model_max_length"], max_length)
|
||||
)
|
||||
tokenizer.enable_padding(
|
||||
pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"]
|
||||
)
|
||||
|
||||
for token in tokens_map.values():
|
||||
if isinstance(token, str):
|
||||
tokenizer.add_special_tokens([token])
|
||||
elif isinstance(token, dict):
|
||||
tokenizer.add_special_tokens([AddedToken(**token)])
|
||||
|
||||
vocab_file = self.model_dir / "vocab.txt"
|
||||
if vocab_file.exists():
|
||||
tokenizer.vocab = self._load_vocab(vocab_file)
|
||||
tokenizer.ids_to_tokens = collections.OrderedDict(
|
||||
[(ids, tok) for tok, ids in tokenizer.vocab.items()]
|
||||
)
|
||||
return tokenizer
|
||||
|
||||
def _load_vocab(self, vocab_file: Path) -> Dict[str, int]:
|
||||
vocab = collections.OrderedDict()
|
||||
with open(vocab_file, "r", encoding="utf-8") as reader:
|
||||
tokens = reader.readlines()
|
||||
for index, token in enumerate(tokens):
|
||||
token = token.rstrip("\n")
|
||||
vocab[token] = index
|
||||
return vocab
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with FlashRank rerank models.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
flashrank_run_model,
|
||||
nested_list,
|
||||
session=self.session,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def flashrank_run_model(input_texts, tokenizer, session, batch_size: int):
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
|
||||
for batch_texts in tqdm(batch_input_texts):
|
||||
input_text = tokenizer.encode_batch(batch_texts)
|
||||
input_ids = np.array([e.ids for e in input_text])
|
||||
token_type_ids = np.array([e.type_ids for e in input_text])
|
||||
attention_mask = np.array([e.attention_mask for e in input_text])
|
||||
|
||||
use_token_type_ids = token_type_ids is not None and not np.all(
|
||||
token_type_ids == 0
|
||||
)
|
||||
|
||||
onnx_input = {
|
||||
"input_ids": input_ids.astype(np.int64),
|
||||
"attention_mask": attention_mask.astype(np.int64),
|
||||
}
|
||||
if use_token_type_ids:
|
||||
onnx_input["token_type_ids"] = token_type_ids.astype(np.int64)
|
||||
|
||||
outputs = session.run(None, onnx_input)
|
||||
|
||||
logits = outputs[0]
|
||||
|
||||
if logits.shape[1] == 1:
|
||||
scores = 1 / (1 + np.exp(-logits.flatten()))
|
||||
else:
|
||||
exp_logits = np.exp(logits)
|
||||
scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1)
|
||||
results.extend(scores)
|
||||
return results
|
||||
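# Illustrative sketch (not part of the original file): the ONNX logits are turned
# into probabilities exactly as in flashrank_run_model above -- a sigmoid for
# single-logit heads, otherwise the softmax probability of the positive class.
if __name__ == "__main__":
    _single = np.array([[2.0], [-1.0]])
    print(1 / (1 + np.exp(-_single.flatten())))  # sigmoid scores per pair
    _pair = np.array([[0.1, 2.3]])
    _exp = np.exp(_pair)
    print(_exp[:, 1] / np.sum(_exp, axis=1))  # P(relevant) from a two-class head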
115
autorag/nodes/passagereranker/jina.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
|
||||
import aiohttp
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
|
||||
|
||||
JINA_API_URL = "https://api.jina.ai/v1/rerank"
|
||||
|
||||
|
||||
class JinaReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs):
|
||||
"""
|
||||
Initialize Jina rerank node.
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param api_key: The API key for Jina rerank.
|
||||
You can set it in the environment variable JINAAI_API_KEY.
|
||||
Or, you can directly set it on the config YAML file using this parameter.
|
||||
Default is env variable "JINAAI_API_KEY".
|
||||
:param kwargs: Extra arguments that are not affected
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
if api_key is None:
|
||||
api_key = os.getenv("JINAAI_API_KEY", None)
|
||||
if api_key is None:
|
||||
raise ValueError(
|
||||
"API key is not provided."
|
||||
"You can set it as an argument or as an environment variable 'JINAAI_API_KEY'"
|
||||
)
|
||||
self.session = aiohttp.ClientSession(loop=get_event_loop())
|
||||
self.session.headers.update(
|
||||
{"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"}
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
self.session.close()
|
||||
del self.session
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 8)
|
||||
model = kwargs.pop("model", "jina-reranker-v1-base-en")
|
||||
return self._pure(queries, contents, ids, top_k, model, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
model: str = "jina-reranker-v1-base-en",
|
||||
batch: int = 8,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with Jina rerank models.
|
||||
You can get the API key from https://jina.ai/reranker and set it in the environment variable JINAAI_API_KEY.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param model: The model name for Jina rerank.
|
||||
You can choose between "jina-reranker-v1-base-en" and "jina-colbert-v1-en".
|
||||
Default is "jina-reranker-v1-base-en".
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
tasks = [
|
||||
jina_reranker_pure(
|
||||
self.session, query, contents, ids, top_k=top_k, model=model
|
||||
)
|
||||
for query, contents, ids in zip(queries, contents_list, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch))
|
||||
|
||||
content_result, id_result, score_result = zip(*results)
|
||||
|
||||
return list(content_result), list(id_result), list(score_result)
|
||||
|
||||
|
||||
async def jina_reranker_pure(
|
||||
session,
|
||||
query: str,
|
||||
contents: List[str],
|
||||
ids: List[str],
|
||||
top_k: int,
|
||||
model: str = "jina-reranker-v1-base-en",
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
async with session.post(
|
||||
JINA_API_URL,
|
||||
json={
|
||||
"query": query,
|
||||
"documents": contents,
|
||||
"model": model,
|
||||
"top_n": top_k,
|
||||
},
|
||||
) as resp:
|
||||
resp_json = await resp.json()
|
||||
if "results" not in resp_json:
|
||||
raise RuntimeError(f"Invalid response from Jina API: {resp_json['detail']}")
|
||||
|
||||
results = resp_json["results"]
|
||||
indices = list(map(lambda x: x["index"], results))
|
||||
score_result = list(map(lambda x: x["relevance_score"], results))
|
||||
id_result = list(map(lambda x: ids[x], indices))
|
||||
content_result = list(map(lambda x: contents[x], indices))
|
||||
|
||||
return content_result, id_result, score_result
|
||||
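# Illustrative sketch (not part of the original file): a toy response in the shape
# jina_reranker_pure expects, showing how each "index" is mapped back onto the
# original contents and ids while keeping the API's relevance ordering.
if __name__ == "__main__":
    _contents = ["doc a", "doc b", "doc c"]
    _ids = ["id-a", "id-b", "id-c"]
    _results = [{"index": 2, "relevance_score": 0.91}, {"index": 0, "relevance_score": 0.40}]
    print([_contents[r["index"]] for r in _results])  # ['doc c', 'doc a']
    print([_ids[r["index"]] for r in _results])       # ['id-c', 'id-a']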
136
autorag/nodes/passagereranker/koreranker.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class KoReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
||||
except ImportError:
|
||||
raise ImportError("For using KoReranker, please install torch first.")
|
||||
|
||||
model_path = "Dongjin-kr/ko-reranker"
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
|
||||
self.model.eval()
|
||||
# Determine the device to run the model on (GPU if available, otherwise CPU)
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.model.to(self.device)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using ko-reranker.
|
||||
ko-reranker is a Korean-language reranker (https://huggingface.co/Dongjin-kr/ko-reranker).
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
Default is 64.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
scores_nps = flatten_apply(
|
||||
koreranker_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
)
|
||||
|
||||
rerank_scores = list(
|
||||
map(
|
||||
lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
|
||||
)
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def koreranker_run_model(input_texts, model, tokenizer, device, batch_size: int):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("For using KoReranker, please install torch first.")
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
inputs = tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
max_length=512,
|
||||
)
|
||||
inputs = inputs.to(device)
|
||||
with torch.no_grad():
|
||||
scores = (
|
||||
model(**inputs, return_dict=True)
|
||||
.logits.view(
|
||||
-1,
|
||||
)
|
||||
.float()
|
||||
)
|
||||
scores_np = scores.cpu().numpy()
|
||||
results.extend(scores_np)
|
||||
return results
|
||||
|
||||
|
||||
def exp_normalize(x):
|
||||
b = x.max()
|
||||
y = np.exp(x - b)
|
||||
return y / y.sum()
|
||||
126
autorag/nodes/passagereranker/mixedbreadai.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
from mixedbread_ai.client import AsyncMixedbreadAI
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
result_to_dataframe,
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
pop_params,
|
||||
)
|
||||
|
||||
|
||||
class MixedbreadAIReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize mixedbread-ai rerank node.
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param api_key: The API key for MixedbreadAI rerank.
|
||||
You can set it in the environment variable MXBAI_API_KEY.
|
||||
Or, you can directly set it on the config YAML file using this parameter.
|
||||
Default is env variable "MXBAI_API_KEY".
|
||||
:param kwargs: Extra arguments that are not affected
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
api_key = kwargs.pop("api_key", None)
|
||||
api_key = os.getenv("MXBAI_API_KEY", None) if api_key is None else api_key
|
||||
if api_key is None:
|
||||
raise KeyError(
|
||||
"Please set the API key for Mixedbread AI rerank in the environment variable MXBAI_API_KEY "
|
||||
"or directly set it on the config YAML file."
|
||||
)
|
||||
self.client = AsyncMixedbreadAI(api_key=api_key)
|
||||
|
||||
def __del__(self):
|
||||
del self.client
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 8)
|
||||
model = kwargs.pop("model", "mixedbread-ai/mxbai-rerank-large-v1")
|
||||
rerank_params = pop_params(self.client.reranking, kwargs)
|
||||
return self._pure(queries, contents, ids, top_k, model, batch, **rerank_params)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
|
||||
batch: int = 8,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with mixedbread-ai rerank models.
|
||||
You can get the API key from https://www.mixedbread.ai/api-reference#quick-start-guide and set it in the environment variable MXBAI_API_KEY.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param model: The model name for mixedbread-ai rerank.
|
||||
You can choose between "mixedbread-ai/mxbai-rerank-large-v1", "mixedbread-ai/mxbai-rerank-base-v1" and "mixedbread-ai/mxbai-rerank-xsmall-v1".
|
||||
Default is "mixedbread-ai/mxbai-rerank-large-v1".
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
tasks = [
|
||||
mixedbreadai_rerank_pure(
|
||||
self.client, query, contents, ids, top_k=top_k, model=model
|
||||
)
|
||||
for query, contents, ids in zip(queries, contents_list, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch))
|
||||
|
||||
content_result, id_result, score_result = zip(*results)
|
||||
|
||||
return list(content_result), list(id_result), list(score_result)
|
||||
|
||||
|
||||
async def mixedbreadai_rerank_pure(
|
||||
client: AsyncMixedbreadAI,
|
||||
query: str,
|
||||
documents: List[str],
|
||||
ids: List[str],
|
||||
top_k: int,
|
||||
model: str = "mixedbread-ai/mxbai-rerank-large-v1",
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
"""
|
||||
Rerank a list of contents with mixedbread-ai rerank models.
|
||||
|
||||
:param client: The mixedbread-ai client to use for reranking
|
||||
:param query: The query to use for reranking
|
||||
:param documents: The list of contents to rerank
|
||||
:param ids: The list of ids corresponding to the documents
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param model: The model name for mixedbread-ai rerank.
|
||||
You can choose between "mixedbread-ai/mxbai-rerank-large-v1" and "mixedbread-ai/mxbai-rerank-base-v1".
|
||||
Default is "mixedbread-ai/mxbai-rerank-large-v1".
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
|
||||
results = await client.reranking(
|
||||
query=query,
|
||||
input=documents,
|
||||
top_k=top_k,
|
||||
model=model,
|
||||
)
|
||||
reranked_scores: List[float] = list(map(lambda x: x.score, results.data))
|
||||
reranked_scores_float = list(map(float, reranked_scores))
|
||||
indices = list(map(lambda x: x.index, results.data))
|
||||
reranked_contents = list(map(lambda x: documents[x], indices))
|
||||
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
|
||||
return reranked_contents, reranked_ids, reranked_scores_float
|
||||
190
autorag/nodes/passagereranker/monot5.py
Normal file
@@ -0,0 +1,190 @@
|
||||
from itertools import chain
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
prediction_tokens = {
|
||||
"castorini/monot5-base-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-base-msmarco-10k": ["▁false", "▁true"],
|
||||
"castorini/monot5-large-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-large-msmarco-10k": ["▁false", "▁true"],
|
||||
"castorini/monot5-base-med-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-3b-med-msmarco": ["▁false", "▁true"],
|
||||
"castorini/monot5-3b-msmarco-10k": ["▁false", "▁true"],
|
||||
"unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-10k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-100k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2": ["▁não", "▁sim"],
|
||||
"unicamp-dl/mt5-base-en-pt-msmarco-v2": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-en-pt-msmarco-v1": ["▁no", "▁yes"],
|
||||
"unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-10k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-pt-msmarco-100k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1": ["▁não", "▁sim"],
|
||||
"unicamp-dl/mt5-3B-mmarco-en-pt": ["▁", "▁true"],
|
||||
"unicamp-dl/mt5-13b-mmarco-100k": ["▁", "▁true"],
|
||||
}
|
||||
|
||||
|
||||
class MonoT5(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model_name: str = "castorini/monot5-3b-msmarco-10k",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the MonoT5 reranker.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param model_name: The name of the MonoT5 model to use for reranking
|
||||
Note: default model name is 'castorini/monot5-3b-msmarco-10k'
|
||||
A '/' in the model name would break the path of the result file,
so the model name is passed with '_' instead of '/' and converted back internally.
|
||||
:param kwargs: The extra arguments for the MonoT5 reranker
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
except ImportError:
|
||||
raise ImportError("For using MonoT5 Reranker, please install torch first.")
|
||||
# replace '_' to '/'
|
||||
if "_" in model_name:
|
||||
model_name = model_name.replace("_", "/")
|
||||
# Load the tokenizer and model from the pre-trained MonoT5 model
|
||||
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
|
||||
model_params = pop_params(T5ForConditionalGeneration.from_pretrained, kwargs)
|
||||
self.model = T5ForConditionalGeneration.from_pretrained(
|
||||
model_name, **model_params
|
||||
).eval()
|
||||
|
||||
# Determine the device to run the model on (GPU if available, otherwise CPU)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.model.to(self.device)
|
||||
|
||||
token_false, token_true = prediction_tokens[model_name]
|
||||
self.token_false_id = self.tokenizer.convert_tokens_to_ids(token_false)
|
||||
self.token_true_id = self.tokenizer.convert_tokens_to_ids(token_true)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 3)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using MonoT5.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
# Build "Query: ... Document: ..." prompts for the MonoT5 cross-encoder
|
||||
|
||||
nested_list = [
|
||||
list(map(lambda x: [f"Query: {query} Document: {x}"], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
monot5_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
token_false_id=self.token_false_id,
|
||||
token_true_id=self.token_true_id,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def monot5_run_model(
|
||||
input_texts,
|
||||
model,
|
||||
batch_size: int,
|
||||
tokenizer,
|
||||
device,
|
||||
token_false_id,
|
||||
token_true_id,
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("For using MonoT5 Reranker, please install torch first.")
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
flattened_batch_texts = list(chain.from_iterable(batch_texts))
|
||||
input_encodings = tokenizer(
|
||||
flattened_batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=512,
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
with torch.no_grad():
|
||||
outputs = model.generate(
|
||||
input_ids=input_encodings["input_ids"],
|
||||
attention_mask=input_encodings["attention_mask"],
|
||||
output_scores=True,
|
||||
return_dict_in_generate=True,
|
||||
)
|
||||
|
||||
# Extract logits for the 'false' and 'true' tokens from the model's output
|
||||
logits = outputs.scores[-1][:, [token_false_id, token_true_id]]
|
||||
# Calculate the softmax probability of the 'true' token
|
||||
probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1]
|
||||
results.extend(probs.tolist())
|
||||
return results
|
||||
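# Illustrative sketch (not part of the original file), assuming torch is installed:
# the relevance score is the softmax probability of the "true" token against the
# "false" token, as computed at the end of monot5_run_model above.
if __name__ == "__main__":
    import torch

    _logits = torch.tensor([[-1.2, 2.4], [0.5, -0.3]])  # [false, true] per passage
    print(torch.nn.functional.softmax(_logits, dim=-1)[:, 1])  # ~[0.97, 0.31]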
191
autorag/nodes/passagereranker/openvino.py
Normal file
@@ -0,0 +1,191 @@
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
|
||||
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class OpenVINOReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model: str = "BAAI/bge-reranker-large",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(project_dir)
|
||||
|
||||
try:
|
||||
from huggingface_hub import HfApi
|
||||
from transformers import AutoTokenizer
|
||||
except ImportError as e:
|
||||
raise ValueError(
|
||||
"Could not import huggingface_hub python package. "
|
||||
"Please install it with: "
|
||||
"`pip install -U huggingface_hub`."
|
||||
) from e
|
||||
|
||||
def require_model_export(
|
||||
model_id: str, revision: Any = None, subfolder: Any = None
|
||||
) -> bool:
|
||||
model_dir = Path(model_id)
|
||||
if subfolder is not None:
|
||||
model_dir = model_dir / subfolder
|
||||
if model_dir.is_dir():
|
||||
return (
|
||||
not (model_dir / "openvino_model.xml").exists()
|
||||
or not (model_dir / "openvino_model.bin").exists()
|
||||
)
|
||||
hf_api = HfApi()
|
||||
try:
|
||||
model_info = hf_api.model_info(model_id, revision=revision or "main")
|
||||
normalized_subfolder = (
|
||||
None if subfolder is None else Path(subfolder).as_posix()
|
||||
)
|
||||
model_files = [
|
||||
file.rfilename
|
||||
for file in model_info.siblings
|
||||
if normalized_subfolder is None
|
||||
or file.rfilename.startswith(normalized_subfolder)
|
||||
]
|
||||
ov_model_path = (
|
||||
"openvino_model.xml"
|
||||
if subfolder is None
|
||||
else f"{normalized_subfolder}/openvino_model.xml"
|
||||
)
|
||||
return (
|
||||
ov_model_path not in model_files
|
||||
or ov_model_path.replace(".xml", ".bin") not in model_files
|
||||
)
|
||||
except Exception:
|
||||
return True
|
||||
|
||||
try:
|
||||
from optimum.intel.openvino import OVModelForSequenceClassification
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install optimum package to use OpenVINOReranker"
|
||||
"pip install 'optimum[openvino,nncf]'"
|
||||
)
|
||||
|
||||
model_kwargs = pop_params(
|
||||
OVModelForSequenceClassification.from_pretrained, kwargs
|
||||
)
|
||||
|
||||
if require_model_export(model):
|
||||
# use remote model
|
||||
self.model = OVModelForSequenceClassification.from_pretrained(
|
||||
model, export=True, **model_kwargs
|
||||
)
|
||||
else:
|
||||
# use local model
|
||||
self.model = OVModelForSequenceClassification.from_pretrained(
|
||||
model, **model_kwargs
|
||||
)
|
||||
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 3)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using an OpenVINO reranker model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
# Build [query, passage] pairs for the cross-encoder
|
||||
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
openvino_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def openvino_run_model(
|
||||
input_texts,
|
||||
model,
|
||||
batch_size: int,
|
||||
tokenizer,
|
||||
):
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
input_tensors = tokenizer(
|
||||
batch_texts,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
outputs = model(**input_tensors, return_dict=True)
|
||||
if outputs[0].shape[1] > 1:
|
||||
scores = outputs[0][:, 1]
|
||||
else:
|
||||
scores = outputs[0].flatten()
|
||||
|
||||
scores = list(map(float, (1 / (1 + np.exp(-np.array(scores))))))
|
||||
results.extend(scores)
|
||||
return results
|
||||
31
autorag/nodes/passagereranker/pass_reranker.py
Normal file
@@ -0,0 +1,31 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassReranker(BasePassageReranker):
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
top_k = kwargs.pop("top_k")
|
||||
|
||||
_, contents_list, scores_list, ids_list = self.cast_to_run(previous_result)
|
||||
return self._pure(contents_list, scores_list, ids_list, top_k)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
):
|
||||
"""
|
||||
Do not perform reranking.
|
||||
Return the given top-k passages as is.
|
||||
"""
|
||||
contents_list = list(map(lambda x: x[:top_k], contents_list))
|
||||
scores_list = list(map(lambda x: x[:top_k], scores_list))
|
||||
ids_list = list(map(lambda x: x[:top_k], ids_list))
|
||||
return contents_list, ids_list, scores_list
|
||||
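# Illustrative sketch (not part of the original file): PassReranker only truncates
# the already-ranked lists to top_k, keeping the original order.
if __name__ == "__main__":
    _contents = [["a", "b", "c"], ["d", "e", "f"]]
    print([row[:2] for row in _contents])  # [['a', 'b'], ['d', 'e']]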
170
autorag/nodes/passagereranker/rankgpt.py
Normal file
@@ -0,0 +1,170 @@
|
||||
from typing import List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from llama_index.core.base.llms.types import ChatMessage, ChatResponse
|
||||
from llama_index.core.llms import LLM
|
||||
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
|
||||
from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode
|
||||
from llama_index.core.utils import print_text
|
||||
from llama_index.llms.openai import OpenAI
|
||||
|
||||
from autorag import generator_models
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class RankGPT(BasePassageReranker):
|
||||
def __init__(
|
||||
self, project_dir: str, llm: Optional[Union[str, LLM]] = None, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the RankGPT reranker.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param llm: The LLM model to use for RankGPT rerank.
|
||||
It is a LlamaIndex LLM instance.
|
||||
Default is the OpenAI model with gpt-4o-mini.
|
||||
:param kwargs: The keyword arguments for the LLM model.
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
if llm is None:
|
||||
self.llm = OpenAI(model="gpt-4o-mini")
|
||||
else:
|
||||
if not isinstance(llm, LLM):
|
||||
llm_class = generator_models[llm]
|
||||
llm_param = pop_params(llm_class.__init__, kwargs)
|
||||
self.llm = llm_class(**llm_param)
|
||||
else:
|
||||
self.llm = llm
|
||||
|
||||
def __del__(self):
|
||||
del self.llm
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.get("top_k", 1)
|
||||
verbose = kwargs.get("verbose", False)
|
||||
rankgpt_rerank_prompt = kwargs.get("rankgpt_rerank_prompt", None)
|
||||
batch = kwargs.get("batch", 16)
|
||||
return self._pure(
|
||||
queries=queries,
|
||||
contents_list=contents,
|
||||
scores_list=scores,
|
||||
ids_list=ids,
|
||||
top_k=top_k,
|
||||
verbose=verbose,
|
||||
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
|
||||
batch=batch,
|
||||
)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
verbose: bool = False,
|
||||
rankgpt_rerank_prompt: Optional[str] = None,
|
||||
batch: int = 16,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank given context paragraphs using RankGPT.
|
||||
Return pseudo scores, since the actual scores are not available on RankGPT.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param scores_list: The list of lists of scores retrieved from the initial ranking
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param verbose: Whether to print intermediate steps.
|
||||
:param rankgpt_rerank_prompt: The prompt template for RankGPT rerank.
|
||||
Default is RankGPT's default prompt.
|
||||
:param batch: The number of queries to be processed in a batch.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
query_bundles = list(map(lambda query: QueryBundle(query_str=query), queries))
|
||||
nodes_list = [
|
||||
list(
|
||||
map(
|
||||
lambda x: NodeWithScore(node=TextNode(text=x[0]), score=x[1]),
|
||||
zip(content_list, score_list),
|
||||
)
|
||||
)
|
||||
for content_list, score_list in zip(contents_list, scores_list)
|
||||
]
|
||||
|
||||
reranker = AsyncRankGPTRerank(
|
||||
top_n=top_k,
|
||||
llm=self.llm,
|
||||
verbose=verbose,
|
||||
rankgpt_rerank_prompt=rankgpt_rerank_prompt,
|
||||
)
|
||||
|
||||
tasks = [
|
||||
reranker.async_postprocess_nodes(nodes, query, ids)
|
||||
for nodes, query, ids in zip(nodes_list, query_bundles, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
rerank_result = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
content_result = [
|
||||
list(map(lambda x: x.node.text, res[0])) for res in rerank_result
|
||||
]
|
||||
score_result = [
|
||||
np.linspace(1.0, 0.0, len(res[0])).tolist() for res in rerank_result
|
||||
]
|
||||
id_result = [res[1] for res in rerank_result]
|
||||
|
||||
del reranker
|
||||
|
||||
return content_result, id_result, score_result
|
||||
|
||||
|
||||
class AsyncRankGPTRerank(RankGPTRerank):
|
||||
async def async_run_llm(self, messages: Sequence[ChatMessage]) -> ChatResponse:
|
||||
return await self.llm.achat(messages)
|
||||
|
||||
async def async_postprocess_nodes(
|
||||
self,
|
||||
nodes: List[NodeWithScore],
|
||||
query_bundle: QueryBundle,
|
||||
ids: Optional[List[str]] = None,
|
||||
) -> Tuple[List[NodeWithScore], List[str]]:
|
||||
if ids is None:
|
||||
ids = [str(i) for i in range(len(nodes))]
|
||||
|
||||
items = {
|
||||
"query": query_bundle.query_str,
|
||||
"hits": [{"content": node.get_content()} for node in nodes],
|
||||
}
|
||||
|
||||
messages = self.create_permutation_instruction(item=items)
|
||||
permutation = await self.async_run_llm(messages=messages)
|
||||
if permutation.message is not None and permutation.message.content is not None:
|
||||
rerank_ranks = self._receive_permutation(
|
||||
items, str(permutation.message.content)
|
||||
)
|
||||
if self.verbose:
|
||||
print_text(f"After Reranking, new rank list for nodes: {rerank_ranks}")
|
||||
|
||||
initial_results: List[NodeWithScore] = []
|
||||
id_results = []
|
||||
|
||||
for idx in rerank_ranks:
|
||||
initial_results.append(
|
||||
NodeWithScore(node=nodes[idx].node, score=nodes[idx].score)
|
||||
)
|
||||
id_results.append(ids[idx])
|
||||
return initial_results[: self.top_n], id_results[: self.top_n]
|
||||
else:
|
||||
return nodes[: self.top_n], ids[: self.top_n]
|
||||
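# Illustrative-only sketch (not part of the diff): RankGPT returns an ordering
# rather than real relevance scores, so _pure above assigns evenly spaced
# pseudo scores with np.linspace. The passages are toy values.
import numpy as np

reranked_contents = ["passage A", "passage C", "passage B"]
pseudo_scores = np.linspace(1.0, 0.0, len(reranked_contents)).tolist()
print(list(zip(reranked_contents, pseudo_scores)))
# [('passage A', 1.0), ('passage C', 0.5), ('passage B', 0.0)]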
145
autorag/nodes/passagereranker/run.py
Normal file
145
autorag/nodes/passagereranker/run.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.run import evaluate_retrieval_node
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.utils.util import apply_recursive, to_list
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def run_passage_reranker_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among passage reranker node results.
|
||||
|
||||
:param modules: Passage reranker modules to run.
|
||||
:param module_params: Passage reranker module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
It could be the result of a retrieval or reranker module.
|
||||
It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for passage reranker node.
|
||||
In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'.
|
||||
Evaluation can be skipped when you use only one module with one set of module parameters.
|
||||
:return: The best result dataframe with previous result columns.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
qa_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)
|
||||
retrieval_gt = qa_df["retrieval_gt"].tolist()
|
||||
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
|
||||
|
||||
# make rows to metric_inputs
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
|
||||
for ret_gt, query, gen_gt in zip(
|
||||
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
|
||||
)
|
||||
]
|
||||
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
# run metrics before filtering
|
||||
if strategies.get("metrics") is None:
|
||||
raise ValueError(
|
||||
"You must at least one metrics for passage_reranker evaluation."
|
||||
)
|
||||
results = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
metric_inputs,
|
||||
strategies.get("metrics"),
|
||||
),
|
||||
results,
|
||||
)
|
||||
)
|
||||
|
||||
# save results to folder
|
||||
save_dir = os.path.join(node_line_dir, "passage_reranker") # node name
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
**{
|
||||
f"passage_reranker_{metric}": list(
|
||||
map(lambda result: result[metric].mean(), results)
|
||||
)
|
||||
for metric in strategies.get("metrics")
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# filter by strategies
|
||||
if strategies.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, strategies["speed_threshold"], filenames
|
||||
)
|
||||
selected_result, selected_filename = select_best(
|
||||
results,
|
||||
strategies.get("metrics"),
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
# change metric name columns to passage_reranker_metric_name
|
||||
selected_result = selected_result.rename(
|
||||
columns={
|
||||
metric_name: f"passage_reranker_{metric_name}"
|
||||
for metric_name in strategies["metrics"]
|
||||
}
|
||||
)
|
||||
# drop retrieval result columns in previous_result
|
||||
previous_result = previous_result.drop(
|
||||
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
best_result = pd.concat([previous_result, selected_result], axis=1)
|
||||
|
||||
# add 'is_best' column to summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == selected_filename
|
||||
|
||||
# save files
|
||||
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
best_result.to_parquet(
|
||||
os.path.join(
|
||||
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
|
||||
),
|
||||
index=False,
|
||||
)
|
||||
return best_result
|
||||
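# Hedged sketch of the inputs run_passage_reranker_node above expects; the module
# choices, paths, and thresholds are illustrative assumptions, not values from the diff.
strategies = {
    "metrics": ["retrieval_f1", "retrieval_recall", "retrieval_precision"],
    "speed_threshold": 10,  # optional: filter out modules whose average time exceeds this
    "strategy": "mean",     # how metric columns are aggregated when selecting the best module
}
# best_df = run_passage_reranker_node(
#     modules=[SentenceTransformerReranker, TimeReranker],  # assumed module classes
#     module_params=[{"top_k": 5}, {"top_k": 5}],
#     previous_result=retrieval_result_df,  # needs query, retrieved_contents, retrieved_ids, retrieve_scores
#     node_line_dir="./project/node_line_1",  # assumed path
#     strategies=strategies,
# )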
129
autorag/nodes/passagereranker/sentence_transformer.py
Normal file
129
autorag/nodes/passagereranker/sentence_transformer.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import (
|
||||
flatten_apply,
|
||||
make_batch,
|
||||
select_top_k,
|
||||
sort_by_scores,
|
||||
pop_params,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class SentenceTransformerReranker(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the Sentence Transformer reranker node.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param model_name: The name of the Sentence Transformer model to use for reranking
|
||||
Default is "cross-encoder/ms-marco-MiniLM-L-2-v2"
|
||||
:param kwargs: The CrossEncoder parameters
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
try:
|
||||
import torch
|
||||
from sentence_transformers import CrossEncoder
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
|
||||
)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model_params = pop_params(CrossEncoder.__init__, kwargs)
|
||||
self.model = CrossEncoder(model_name, device=self.device, **model_params)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
|
||||
|
||||
:param previous_result: The previous result
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: pd DataFrame containing the reranked contents, ids, and scores
|
||||
"""
|
||||
queries, contents_list, scores_list, ids_list = self.cast_to_run(
|
||||
previous_result
|
||||
)
|
||||
top_k = kwargs.get("top_k", 1)
|
||||
batch = kwargs.get("batch", 64)
|
||||
return self._pure(queries, contents_list, ids_list, top_k, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using a Sentence Transformer model.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
list(map(lambda x: [query, x], content_list))
|
||||
for query, content_list in zip(queries, contents_list)
|
||||
]
|
||||
rerank_scores = flatten_apply(
|
||||
sentence_transformer_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def sentence_transformer_run_model(input_texts, model, batch_size: int):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You have to install AutoRAG[gpu] to use SentenceTransformerReranker"
|
||||
)
|
||||
batch_input_texts = make_batch(input_texts, batch_size)
|
||||
results = []
|
||||
for batch_texts in batch_input_texts:
|
||||
with torch.no_grad():
|
||||
pred_scores = model.predict(sentences=batch_texts, apply_softmax=True)
|
||||
results.extend(pred_scores.tolist())
|
||||
return results
|
||||
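# Toy illustration (not part of the diff) of the [query, passage] pairs that
# SentenceTransformerReranker._pure builds before CrossEncoder scoring.
queries = ["what is autorag?"]
contents_list = [["AutoRAG automates RAG evaluation.", "An unrelated passage."]]

nested_pairs = [
    [[query, content] for content in contents]
    for query, contents in zip(queries, contents_list)
]
print(nested_pairs[0][0])  # ['what is autorag?', 'AutoRAG automates RAG evaluation.']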
1
autorag/nodes/passagereranker/tart/__init__.py
Normal file
1
autorag/nodes/passagereranker/tart/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .tart import Tart
|
||||
152
autorag/nodes/passagereranker/tart/modeling_enc_t5.py
Normal file
152
autorag/nodes/passagereranker/tart/modeling_enc_t5.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
import copy
|
||||
|
||||
from transformers.modeling_outputs import SequenceClassifierOutput
|
||||
from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack
|
||||
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
|
||||
|
||||
from autorag.utils.util import empty_cuda_cache
|
||||
|
||||
|
||||
class EncT5ForSequenceClassification(T5PreTrainedModel):
|
||||
_keys_to_ignore_on_load_missing = [
|
||||
r"encoder\.embed_tokens\.weight",
|
||||
]
|
||||
|
||||
def __init__(self, config: T5Config, dropout=0.1):
|
||||
super().__init__(config)
|
||||
try:
|
||||
from torch import nn
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
self.num_labels = config.num_labels
|
||||
self.config = config
|
||||
|
||||
self.shared = nn.Embedding(config.vocab_size, config.d_model)
|
||||
|
||||
encoder_config = copy.deepcopy(config)
|
||||
encoder_config.use_cache = False
|
||||
encoder_config.is_encoder_decoder = False
|
||||
self.encoder = T5Stack(encoder_config, self.shared)
|
||||
|
||||
self.dropout = nn.Dropout(dropout)
|
||||
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
|
||||
|
||||
# Initialize weights and apply final processing
|
||||
self.post_init()
|
||||
|
||||
# Model parallel
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
|
||||
def parallelize(self, device_map=None):
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
self.device_map = (
|
||||
get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
|
||||
if device_map is None
|
||||
else device_map
|
||||
)
|
||||
assert_device_map(self.device_map, len(self.encoder.block))
|
||||
self.encoder.parallelize(self.device_map)
|
||||
self.classifier = self.classifier.to(self.encoder.first_device)
|
||||
self.model_parallel = True
|
||||
|
||||
def deparallelize(self):
|
||||
self.encoder.deparallelize()
|
||||
self.encoder = self.encoder.to("cpu")
|
||||
self.model_parallel = False
|
||||
self.device_map = None
|
||||
empty_cuda_cache()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.shared
|
||||
|
||||
def set_input_embeddings(self, new_embeddings):
|
||||
self.shared = new_embeddings
|
||||
self.encoder.set_input_embeddings(new_embeddings)
|
||||
|
||||
def get_encoder(self):
|
||||
return self.encoder
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
"""
|
||||
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
||||
class PreTrainedModel
|
||||
"""
|
||||
for layer, heads in heads_to_prune.items():
|
||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids=None,
|
||||
attention_mask=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
labels=None,
|
||||
output_attentions=None,
|
||||
output_hidden_states=None,
|
||||
return_dict=None,
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
||||
except ImportError:
|
||||
raise ImportError("Please install PyTorch to use TART reranker.")
|
||||
return_dict = (
|
||||
return_dict if return_dict is not None else self.config.use_return_dict
|
||||
)
|
||||
|
||||
outputs = self.encoder(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
inputs_embeds=inputs_embeds,
|
||||
head_mask=head_mask,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
pooled_output = hidden_states[:, 0, :] # Take bos token (equiv. to <s>)
|
||||
|
||||
pooled_output = self.dropout(pooled_output)
|
||||
logits = self.classifier(pooled_output)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
if self.config.problem_type is None:
|
||||
if self.num_labels == 1:
|
||||
self.config.problem_type = "regression"
|
||||
elif self.num_labels > 1 and (
|
||||
labels.dtype == torch.long or labels.dtype == torch.int
|
||||
):
|
||||
self.config.problem_type = "single_label_classification"
|
||||
else:
|
||||
self.config.problem_type = "multi_label_classification"
|
||||
|
||||
if self.config.problem_type == "regression":
|
||||
loss_fct = MSELoss()
|
||||
if self.num_labels == 1:
|
||||
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
||||
else:
|
||||
loss = loss_fct(logits, labels)
|
||||
elif self.config.problem_type == "single_label_classification":
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
||||
elif self.config.problem_type == "multi_label_classification":
|
||||
loss_fct = BCEWithLogitsLoss()
|
||||
loss = loss_fct(logits, labels)
|
||||
if not return_dict:
|
||||
output = (logits,) + outputs[1:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SequenceClassifierOutput(
|
||||
loss=loss,
|
||||
logits=logits,
|
||||
hidden_states=outputs.hidden_states,
|
||||
attentions=outputs.attentions,
|
||||
)
|
||||
139
autorag/nodes/passagereranker/tart/tart.py
Normal file
139
autorag/nodes/passagereranker/tart/tart.py
Normal file
@@ -0,0 +1,139 @@
|
||||
from itertools import chain
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.nodes.passagereranker.tart.modeling_enc_t5 import (
|
||||
EncT5ForSequenceClassification,
|
||||
)
|
||||
from autorag.nodes.passagereranker.tart.tokenization_enc_t5 import EncT5Tokenizer
|
||||
from autorag.utils.util import (
|
||||
make_batch,
|
||||
sort_by_scores,
|
||||
flatten_apply,
|
||||
select_top_k,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class Tart(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch first to use TART reranker."
|
||||
)
|
||||
model_name = "facebook/tart-full-flan-t5-xl"
|
||||
self.model = EncT5ForSequenceClassification.from_pretrained(model_name)
|
||||
self.tokenizer = EncT5Tokenizer.from_pretrained(model_name)
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.model = self.model.to(self.device)
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
instruction = kwargs.pop("instruction", "Find passage to answer given question")
|
||||
batch = kwargs.pop("batch", 64)
|
||||
return self._pure(queries, contents, ids, top_k, instruction, batch)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
instruction: str = "Find passage to answer given question",
|
||||
batch: int = 64,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using Tart.
|
||||
TART (https://github.com/facebookresearch/tart) is an instruction-aware reranker.
|
||||
You can rerank the passages with a natural-language instruction using the Tart module.
|
||||
The default model is facebook/tart-full-flan-t5-xl.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param instruction: The instruction for reranking.
|
||||
Note: the default instruction is "Find passage to answer given question",
|
||||
which is the default instruction from the TART paper.
|
||||
If you want to use a different instruction, you can pass it through this parameter.
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
nested_list = [
|
||||
[["{} [SEP] {}".format(instruction, query)] for _ in contents]
|
||||
for query, contents in zip(queries, contents_list)
|
||||
]
|
||||
|
||||
rerank_scores = flatten_apply(
|
||||
tart_run_model,
|
||||
nested_list,
|
||||
model=self.model,
|
||||
batch_size=batch,
|
||||
tokenizer=self.tokenizer,
|
||||
device=self.device,
|
||||
contents_list=contents_list,
|
||||
)
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
"scores": rerank_scores,
|
||||
}
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
sort_by_scores, axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
def tart_run_model(
|
||||
input_texts, contents_list, model, batch_size: int, tokenizer, device
|
||||
):
|
||||
try:
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch first to use TART reranker."
|
||||
)
|
||||
flattened_texts = list(chain.from_iterable(input_texts))
|
||||
flattened_contents = list(chain.from_iterable(contents_list))
|
||||
batch_input_texts = make_batch(flattened_texts, batch_size)
|
||||
batch_contents_list = make_batch(flattened_contents, batch_size)
|
||||
results = []
|
||||
for batch_texts, batch_contents in zip(batch_input_texts, batch_contents_list):
|
||||
feature = tokenizer(
|
||||
batch_texts,
|
||||
batch_contents,
|
||||
padding=True,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
with torch.no_grad():
|
||||
pred_scores = model(**feature).logits
|
||||
normalized_scores = [
|
||||
float(score[1]) for score in F.softmax(pred_scores, dim=1)
|
||||
]
|
||||
results.extend(normalized_scores)
|
||||
return results
|
||||
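# Toy illustration of the "{instruction} [SEP] {query}" input format that Tart._pure
# builds above; the query text is an assumption for the example.
instruction = "Find passage to answer given question"
query = "When was AutoRAG released?"
tart_query = "{} [SEP] {}".format(instruction, query)
print(tart_query)  # Find passage to answer given question [SEP] When was AutoRAG released?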
112
autorag/nodes/passagereranker/tart/tokenization_enc_t5.py
Normal file
112
autorag/nodes/passagereranker/tart/tokenization_enc_t5.py
Normal file
@@ -0,0 +1,112 @@
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from transformers import T5Tokenizer
|
||||
|
||||
|
||||
class EncT5Tokenizer(T5Tokenizer):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
bos_token="<s>",
|
||||
eos_token="</s>",
|
||||
unk_token="<unk>",
|
||||
pad_token="<pad>",
|
||||
extra_ids=100,
|
||||
additional_special_tokens=None,
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
|
||||
super().__init__(
|
||||
vocab_file=vocab_file,
|
||||
bos_token=bos_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
pad_token=pad_token,
|
||||
extra_ids=extra_ids,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
sp_model_kwargs=sp_model_kwargs,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self,
|
||||
token_ids_0: List[int],
|
||||
token_ids_1: Optional[List[int]] = None,
|
||||
already_has_special_tokens: bool = False,
|
||||
) -> List[int]:
|
||||
"""
|
||||
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer `prepare_for_model` method.
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
Returns:
|
||||
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
"""
|
||||
if already_has_special_tokens:
|
||||
return super().get_special_tokens_mask(
|
||||
token_ids_0=token_ids_0,
|
||||
token_ids_1=token_ids_1,
|
||||
already_has_special_tokens=True,
|
||||
)
|
||||
|
||||
# normal case: some special tokens
|
||||
if token_ids_1 is None:
|
||||
return [1] + ([0] * len(token_ids_0)) + [1]
|
||||
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
|
||||
use of token type ids, therefore a list of zeros is returned.
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`List[int]`: List of zeros.
|
||||
"""
|
||||
bos = [self.bos_token_id]
|
||||
eos = [self.eos_token_id]
|
||||
|
||||
if token_ids_1 is None:
|
||||
return len(bos + token_ids_0 + eos) * [0]
|
||||
return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0]
|
||||
|
||||
def build_inputs_with_special_tokens(
|
||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||
) -> List[int]:
|
||||
"""
|
||||
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
|
||||
adding special tokens. A sequence has the following format:
|
||||
- single sequence: `<s> X </s>`
|
||||
- pair of sequences: `<s> A </s> B </s>`
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs to which the special tokens will be added.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
||||
"""
|
||||
if token_ids_1 is None:
|
||||
return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
||||
else:
|
||||
return (
|
||||
[self.bos_token_id]
|
||||
+ token_ids_0
|
||||
+ [self.eos_token_id]
|
||||
+ token_ids_1
|
||||
+ [self.eos_token_id]
|
||||
)
|
||||
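# Minimal sketch of the special-token layout EncT5Tokenizer builds above, using
# made-up token ids instead of a real vocabulary.
bos_token_id, eos_token_id = 0, 1
token_ids_0 = [10, 11, 12]  # sequence A
token_ids_1 = [20, 21]      # sequence B

single = [bos_token_id] + token_ids_0 + [eos_token_id]                               # <s> A </s>
pair = [bos_token_id] + token_ids_0 + [eos_token_id] + token_ids_1 + [eos_token_id]  # <s> A </s> B </s>
print(single)  # [0, 10, 11, 12, 1]
print(pair)    # [0, 10, 11, 12, 1, 20, 21, 1]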
72
autorag/nodes/passagereranker/time_reranker.py
Normal file
72
autorag/nodes/passagereranker/time_reranker.py
Normal file
@@ -0,0 +1,72 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe, fetch_contents
|
||||
|
||||
|
||||
class TimeReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
self.corpus_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
_, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata")
|
||||
times = [
|
||||
[time["last_modified_datetime"] for time in time_list]
|
||||
for time_list in metadatas
|
||||
]
|
||||
top_k = kwargs.pop("top_k")
|
||||
return self._pure(contents, scores, ids, top_k, times)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
contents_list: List[List[str]],
|
||||
scores_list: List[List[float]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
time_list: List[List[datetime]],
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank the passages based solely on the datetime of each passage.
|
||||
It uses the 'last_modified_datetime' key in the corpus metadata,
|
||||
so the metadata in the corpus data file must be in the format {'last_modified_datetime': datetime.datetime}.
|
||||
|
||||
:param contents_list: The list of lists of contents
|
||||
:param scores_list: The list of lists of scores from the initial ranking
|
||||
:param ids_list: The list of lists of ids
|
||||
:param top_k: The number of passages to be retrieved after reranking
|
||||
:param time_list: The metadata list of lists of datetime.datetime
|
||||
It automatically extracts the 'last_modified_datetime' key from the metadata in the corpus data.
|
||||
:return: The reranked contents, ids, and scores
|
||||
"""
|
||||
|
||||
def sort_row(contents, scores, ids, time, top_k):
|
||||
combined = list(zip(contents, scores, ids, time))
|
||||
combined.sort(key=lambda x: x[3], reverse=True)
|
||||
sorted_contents, sorted_scores, sorted_ids, _ = zip(*combined)
|
||||
return (
|
||||
list(sorted_contents)[:top_k],
|
||||
list(sorted_scores)[:top_k],
|
||||
list(sorted_ids)[:top_k],
|
||||
)
|
||||
|
||||
reranked_contents, reranked_scores, reranked_ids = zip(
|
||||
*map(
|
||||
sort_row,
|
||||
contents_list,
|
||||
scores_list,
|
||||
ids_list,
|
||||
time_list,
|
||||
[top_k] * len(contents_list),
|
||||
)
|
||||
)
|
||||
|
||||
return list(reranked_contents), list(reranked_ids), list(reranked_scores)
|
||||
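# Toy walk-through (assumed values) of the metadata TimeReranker relies on and of
# the newest-first sort that sort_row performs above.
from datetime import datetime

contents = ["old passage", "new passage"]
scores = [0.9, 0.2]
ids = ["id-1", "id-2"]
times = [datetime(2022, 1, 1), datetime(2024, 6, 1)]  # from metadata['last_modified_datetime']

combined = sorted(zip(contents, scores, ids, times), key=lambda x: x[3], reverse=True)
top_k = 1
print([content for content, _, _, _ in combined][:top_k])  # ['new passage']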
160
autorag/nodes/passagereranker/upr.py
Normal file
160
autorag/nodes/passagereranker/upr.py
Normal file
@@ -0,0 +1,160 @@
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils import result_to_dataframe
|
||||
from autorag.utils.util import select_top_k, sort_by_scores, empty_cuda_cache
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class Upr(BasePassageReranker):
|
||||
def __init__(
|
||||
self,
|
||||
project_dir: str,
|
||||
use_bf16: bool = False,
|
||||
prefix_prompt: str = "Passage: ",
|
||||
suffix_prompt: str = "Please write a question based on this passage.",
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the UPR reranker node.
|
||||
|
||||
:param project_dir: The project directory
|
||||
:param use_bf16: Whether to use bfloat16 for the model. Default is False.
|
||||
:param prefix_prompt: The prefix prompt for the language model that generates question for reranking.
|
||||
Default is "Passage: ".
|
||||
The prefix prompt serves as the initial context or instruction for the language model.
|
||||
It sets the stage for what is expected in the output.
|
||||
:param suffix_prompt: The suffix prompt for the language model that generates question for reranking.
|
||||
Default is "Please write a question based on this passage.".
|
||||
The suffix prompt provides a cue or a closing instruction to the language model,
|
||||
signaling how to conclude the generated text or what format to follow at the end.
|
||||
:param kwargs: Extra arguments
|
||||
"""
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
|
||||
self.scorer = UPRScorer(
|
||||
suffix_prompt=suffix_prompt, prefix_prompt=prefix_prompt, use_bf16=use_bf16
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
del self.scorer
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, _, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
return self._pure(queries, contents, ids, top_k)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents based on their relevance to a query using UPR.
|
||||
UPR (https://github.com/DevSinghSachan/unsupervised-passage-reranking) is an unsupervised passage reranker.
|
||||
The language model scores each passage by the likelihood of generating the query from that passage, and the passages are reranked by that likelihood.
|
||||
The default model is t5-large.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
|
||||
:return: tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"query": queries,
|
||||
"contents": contents_list,
|
||||
"ids": ids_list,
|
||||
}
|
||||
)
|
||||
|
||||
df["scores"] = df.apply(
|
||||
lambda row: self.scorer.compute(
|
||||
query=row["query"], contents=row["contents"]
|
||||
),
|
||||
axis=1,
|
||||
)
|
||||
df[["contents", "ids", "scores"]] = df.apply(
|
||||
lambda x: sort_by_scores(x, reverse=False), axis=1, result_type="expand"
|
||||
)
|
||||
results = select_top_k(df, ["contents", "ids", "scores"], top_k)
|
||||
return (
|
||||
results["contents"].tolist(),
|
||||
results["ids"].tolist(),
|
||||
results["scores"].tolist(),
|
||||
)
|
||||
|
||||
|
||||
class UPRScorer:
|
||||
def __init__(self, suffix_prompt: str, prefix_prompt: str, use_bf16: bool = False):
|
||||
try:
|
||||
import torch
|
||||
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch to use UPRReranker."
|
||||
)
|
||||
model_name = "t5-large"
|
||||
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
self.tokenizer = T5Tokenizer.from_pretrained(model_name)
|
||||
self.model = T5ForConditionalGeneration.from_pretrained(
|
||||
model_name, torch_dtype=torch.bfloat16 if use_bf16 else torch.float32
|
||||
).to(self.device)
|
||||
self.suffix_prompt = suffix_prompt
|
||||
self.prefix_prompt = prefix_prompt
|
||||
|
||||
def compute(self, query: str, contents: List[str]) -> List[float]:
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"torch is not installed. Please install torch to use UPRReranker."
|
||||
)
|
||||
query_token = self.tokenizer(
|
||||
query, max_length=128, truncation=True, return_tensors="pt"
|
||||
)
|
||||
prompts = list(
|
||||
map(
|
||||
lambda content: f"{self.prefix_prompt} {content} {self.suffix_prompt}",
|
||||
contents,
|
||||
)
|
||||
)
|
||||
prompt_token_outputs = self.tokenizer(
|
||||
prompts,
|
||||
padding="longest",
|
||||
max_length=512,
|
||||
pad_to_multiple_of=8,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
query_input_ids = torch.repeat_interleave(
|
||||
query_token["input_ids"], len(contents), dim=0
|
||||
).to(self.device)
|
||||
|
||||
with torch.no_grad():
|
||||
logits = self.model(
|
||||
input_ids=prompt_token_outputs["input_ids"].to(self.device),
|
||||
attention_mask=prompt_token_outputs["attention_mask"].to(self.device),
|
||||
labels=query_input_ids,
|
||||
).logits
|
||||
log_softmax = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
nll = -log_softmax.gather(2, query_input_ids.unsqueeze(2)).squeeze(2)
|
||||
avg_nll = torch.sum(nll, dim=1)
|
||||
return avg_nll.tolist()
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
del self.tokenizer
|
||||
empty_cuda_cache()
|
||||
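# Toy sketch of how UPR scores are used above: each passage receives the summed
# negative log-likelihood of the query, so lower is better, which is why
# sort_by_scores is called with reverse=False. The numbers are invented.
passage_nll = {"relevant passage": 12.3, "off-topic passage": 47.8}

reranked = sorted(passage_nll.items(), key=lambda item: item[1])  # ascending NLL
print([passage for passage, _ in reranked])  # ['relevant passage', 'off-topic passage']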
109
autorag/nodes/passagereranker/voyageai.py
Normal file
109
autorag/nodes/passagereranker/voyageai.py
Normal file
@@ -0,0 +1,109 @@
|
||||
import os
|
||||
from typing import List, Tuple
|
||||
import pandas as pd
|
||||
import voyageai
|
||||
|
||||
from autorag.nodes.passagereranker.base import BasePassageReranker
|
||||
from autorag.utils.util import result_to_dataframe, get_event_loop, process_batch
|
||||
|
||||
|
||||
class VoyageAIReranker(BasePassageReranker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir)
|
||||
api_key = kwargs.pop("api_key", None)
|
||||
api_key = os.getenv("VOYAGE_API_KEY", None) if api_key is None else api_key
|
||||
if api_key is None:
|
||||
raise KeyError(
|
||||
"Please set the API key for VoyageAI rerank in the environment variable VOYAGE_API_KEY "
|
||||
"or directly set it on the config YAML file."
|
||||
)
|
||||
|
||||
self.voyage_client = voyageai.AsyncClient(api_key=api_key)
|
||||
|
||||
def __del__(self):
|
||||
del self.voyage_client
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries, contents, scores, ids = self.cast_to_run(previous_result)
|
||||
top_k = kwargs.pop("top_k")
|
||||
batch = kwargs.pop("batch", 8)
|
||||
model = kwargs.pop("model", "rerank-2")
|
||||
truncation = kwargs.pop("truncation", True)
|
||||
return self._pure(queries, contents, ids, top_k, model, batch, truncation)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[str],
|
||||
contents_list: List[List[str]],
|
||||
ids_list: List[List[str]],
|
||||
top_k: int,
|
||||
model: str = "rerank-2",
|
||||
batch: int = 8,
|
||||
truncation: bool = True,
|
||||
) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Rerank a list of contents with VoyageAI rerank models.
|
||||
You can get the API key from https://docs.voyageai.com/docs/api-key-and-installation and set it in the environment variable VOYAGE_API_KEY.
|
||||
|
||||
:param queries: The list of queries to use for reranking
|
||||
:param contents_list: The list of lists of contents to rerank
|
||||
:param ids_list: The list of lists of ids retrieved from the initial ranking
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param model: The model name for VoyageAI rerank.
|
||||
You can choose between "rerank-2" and "rerank-2-lite".
|
||||
Default is "rerank-2".
|
||||
:param batch: The number of queries to be processed in a batch
|
||||
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
tasks = [
|
||||
voyageai_rerank_pure(
|
||||
self.voyage_client, model, query, contents, ids, top_k, truncation
|
||||
)
|
||||
for query, contents, ids in zip(queries, contents_list, ids_list)
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(process_batch(tasks, batch))
|
||||
|
||||
content_result, id_result, score_result = zip(*results)
|
||||
|
||||
return list(content_result), list(id_result), list(score_result)
|
||||
|
||||
|
||||
async def voyageai_rerank_pure(
|
||||
voyage_client: voyageai.AsyncClient,
|
||||
model: str,
|
||||
query: str,
|
||||
documents: List[str],
|
||||
ids: List[str],
|
||||
top_k: int,
|
||||
truncation: bool = True,
|
||||
) -> Tuple[List[str], List[str], List[float]]:
|
||||
"""
|
||||
Rerank a list of contents with VoyageAI rerank models.
|
||||
|
||||
:param voyage_client: The Voyage Client to use for reranking
|
||||
:param model: The model name for VoyageAI rerank
|
||||
:param query: The query to use for reranking
|
||||
:param documents: The list of contents to rerank
|
||||
:param ids: The list of ids corresponding to the documents
|
||||
:param top_k: The number of passages to be retrieved
|
||||
:param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents.
|
||||
:return: Tuple of lists containing the reranked contents, ids, and scores
|
||||
"""
|
||||
rerank_results = await voyage_client.rerank(
|
||||
model=model,
|
||||
query=query,
|
||||
documents=documents,
|
||||
top_k=top_k,
|
||||
truncation=truncation,
|
||||
)
|
||||
reranked_scores: List[float] = list(
|
||||
map(lambda x: x.relevance_score, rerank_results.results)
|
||||
)
|
||||
indices = list(map(lambda x: x.index, rerank_results.results))
|
||||
reranked_contents: List[str] = list(map(lambda i: documents[i], indices))
|
||||
reranked_ids: List[str] = list(map(lambda i: ids[i], indices))
|
||||
return reranked_contents, reranked_ids, reranked_scores
|
||||
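# Hedged sketch of wiring up the VoyageAI reranker above: the API key comes from the
# VOYAGE_API_KEY environment variable or the config YAML. The key, project directory,
# and top_k values are placeholders, not values from the diff.
import os

os.environ.setdefault("VOYAGE_API_KEY", "your-voyage-api-key")  # placeholder
# reranker = VoyageAIReranker(project_dir="./project")  # assumed project layout
# result_df = reranker.pure(previous_result, top_k=3, model="rerank-2", truncation=True)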
3
autorag/nodes/promptmaker/__init__.py
Normal file
3
autorag/nodes/promptmaker/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from .long_context_reorder import LongContextReorder
|
||||
from .window_replacement import WindowReplacement
|
||||
from .fstring import Fstring
|
||||
34
autorag/nodes/promptmaker/base.py
Normal file
34
autorag/nodes/promptmaker/base.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import logging
|
||||
from abc import ABCMeta
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.schema.base import BaseModule
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BasePromptMaker(BaseModule, metaclass=ABCMeta):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
logger.info(
|
||||
f"Initialize prompt maker node - {self.__class__.__name__} module..."
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.")
|
||||
|
||||
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
logger.info(f"Running prompt maker node - {self.__class__.__name__} module...")
|
||||
# get query and retrieved contents from previous_result
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
assert (
|
||||
"retrieved_contents" in previous_result.columns
|
||||
), "previous_result must have retrieved_contents column."
|
||||
query = previous_result["query"].tolist()
|
||||
retrieved_contents = previous_result["retrieved_contents"].tolist()
|
||||
prompt = kwargs.pop("prompt")
|
||||
return query, retrieved_contents, prompt
|
||||
49
autorag/nodes/promptmaker/fstring.py
Normal file
49
autorag/nodes/promptmaker/fstring.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.promptmaker.base import BasePromptMaker
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class Fstring(BasePromptMaker):
|
||||
@result_to_dataframe(["prompts"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
query, retrieved_contents, prompt = self.cast_to_run(
|
||||
previous_result, *args, **kwargs
|
||||
)
|
||||
return self._pure(prompt, query, retrieved_contents)
|
||||
|
||||
def _pure(
|
||||
self, prompt: str, queries: List[str], retrieved_contents: List[List[str]]
|
||||
) -> List[str]:
|
||||
"""
|
||||
Make a prompt using f-string from a query and retrieved_contents.
|
||||
You must type a prompt or prompt list at a config YAML file like this:
|
||||
|
||||
.. Code:: yaml
|
||||
nodes:
|
||||
- node_type: prompt_maker
|
||||
modules:
|
||||
- module_type: fstring
|
||||
prompt: [Answer this question: {query} \n\n {retrieved_contents},
|
||||
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
|
||||
|
||||
:param prompt: A prompt string.
|
||||
:param queries: List of query strings.
|
||||
:param retrieved_contents: List of retrieved contents.
|
||||
:return: Prompts that are made by f-string.
|
||||
"""
|
||||
|
||||
def fstring_row(
|
||||
_prompt: str, _query: str, _retrieved_contents: List[str]
|
||||
) -> str:
|
||||
contents_str = "\n\n".join(_retrieved_contents)
|
||||
return _prompt.format(query=_query, retrieved_contents=contents_str)
|
||||
|
||||
return list(
|
||||
map(
|
||||
lambda x: fstring_row(prompt, x[0], x[1]),
|
||||
zip(queries, retrieved_contents),
|
||||
)
|
||||
)
|
||||
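# Minimal illustration (toy values) of the prompt that Fstring._pure above produces
# for a single row, matching the YAML prompt shape shown in the docstring.
prompt = "Answer this question: {query} \n\n {retrieved_contents}"
query = "What is AutoRAG?"
retrieved_contents = ["AutoRAG automates RAG optimization.", "It evaluates many modules."]

filled = prompt.format(query=query, retrieved_contents="\n\n".join(retrieved_contents))
print(filled)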
83
autorag/nodes/promptmaker/long_context_reorder.py
Normal file
83
autorag/nodes/promptmaker/long_context_reorder.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import logging
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.promptmaker.base import BasePromptMaker
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class LongContextReorder(BasePromptMaker):
|
||||
@result_to_dataframe(["prompts"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
query, retrieved_contents, prompt = self.cast_to_run(
|
||||
previous_result, *args, **kwargs
|
||||
)
|
||||
assert (
|
||||
"retrieve_scores" in previous_result.columns
|
||||
), "previous_result must have retrieve_scores column."
|
||||
retrieve_scores = previous_result["retrieve_scores"].tolist()
|
||||
return self._pure(prompt, query, retrieved_contents, retrieve_scores)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
prompt: str,
|
||||
queries: List[str],
|
||||
retrieved_contents: List[List[str]],
|
||||
retrieve_scores: List[List[float]],
|
||||
) -> List[str]:
|
||||
"""
|
||||
Models struggle to access significant details found
|
||||
in the center of extended contexts. A study
|
||||
(https://arxiv.org/abs/2307.03172) observed that the best
|
||||
performance typically arises when crucial data is positioned
|
||||
at the start or conclusion of the input context. Additionally,
|
||||
as the input context lengthens, performance drops notably, even
|
||||
in models designed for long contexts.
|
||||
|
||||
.. Code:: yaml
|
||||
nodes:
|
||||
- node_type: prompt_maker
|
||||
modules:
|
||||
- module_type: long_context_reorder
|
||||
prompt: [Answer this question: {query} \n\n {retrieved_contents},
|
||||
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
|
||||
|
||||
:param prompt: A prompt string.
|
||||
:param queries: List of query strings.
|
||||
:param retrieved_contents: List of retrieved contents.
|
||||
:param retrieve_scores: List of `retrieve scores`.
|
||||
:return: Prompts that are made by long context reorder.
|
||||
"""
|
||||
|
||||
def long_context_reorder_row(
|
||||
_prompt: str,
|
||||
_query: str,
|
||||
_retrieved_contents: List[str],
|
||||
_retrieve_scores: List[float],
|
||||
) -> str:
|
||||
if isinstance(_retrieved_contents, np.ndarray):
|
||||
_retrieved_contents = _retrieved_contents.tolist()
|
||||
if not len(_retrieved_contents) == len(_retrieve_scores):
|
||||
logger.info("If you use a summarizer, the reorder will not proceed.")
|
||||
return _prompt.format(
|
||||
query=_query, retrieved_contents="\n\n".join(_retrieved_contents)
|
||||
)
|
||||
content_scores = list(zip(_retrieved_contents, _retrieve_scores))
|
||||
sorted_content_scores = sorted(
|
||||
content_scores, key=lambda x: x[1], reverse=True
|
||||
)
|
||||
content_result, score_result = zip(*sorted_content_scores)
|
||||
_retrieved_contents.append(content_result[0])
|
||||
contents_str = "\n\n".join(_retrieved_contents)
|
||||
return _prompt.format(query=_query, retrieved_contents=contents_str)
|
||||
|
||||
return list(
|
||||
map(
|
||||
lambda x: long_context_reorder_row(prompt, x[0], x[1], x[2]),
|
||||
zip(queries, retrieved_contents, retrieve_scores),
|
||||
)
|
||||
)
|
||||
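# Toy walk-through of the reorder step implemented above: contents are ranked by
# retrieve score and the highest-scoring passage is appended again at the end, so
# the most relevant text also appears at the tail of the prompt context.
contents = ["passage A", "passage B", "passage C"]
scores = [0.2, 0.9, 0.5]

ranked = [c for c, _ in sorted(zip(contents, scores), key=lambda x: x[1], reverse=True)]
reordered = contents + [ranked[0]]
print(reordered)  # ['passage A', 'passage B', 'passage C', 'passage B']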
280
autorag/nodes/promptmaker/run.py
Normal file
280
autorag/nodes/promptmaker/run.py
Normal file
@@ -0,0 +1,280 @@
|
||||
import os
|
||||
import pathlib
|
||||
from copy import deepcopy
|
||||
from typing import List, Dict, Optional, Union
|
||||
|
||||
import pandas as pd
|
||||
import tokenlog
|
||||
|
||||
from autorag.evaluation import evaluate_generation
|
||||
from autorag.evaluation.util import cast_metrics
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.support import get_support_modules
|
||||
from autorag.utils import validate_qa_dataset
|
||||
from autorag.utils.util import make_combinations, explode, split_dataframe
|
||||
|
||||
|
||||
def run_prompt_maker_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run prompt maker node.
|
||||
With this function, you can select the best prompt maker module.
|
||||
By default, when you use only one module, the evaluation will be skipped.
|
||||
If you want to select the best prompt among modules, you can use strategies.
|
||||
When you use them, you must pass 'generator_modules' and their parameters in strategies,
|
||||
because this node uses generator modules and generation metrics to evaluate the prompts.
|
||||
It is recommended to use a single generator module and parameter set for evaluation,
|
||||
but you can use multiple modules and parameter sets.
|
||||
When you don't set a generator module in strategies, the default generator module will be used.
|
||||
The default generator module is llama_index_llm with openai gpt-3.5-turbo model.
|
||||
|
||||
:param modules: Prompt maker module classes to run.
|
||||
:param module_params: Prompt maker module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
Could be query expansion's best result or qa data.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for prompt maker node.
|
||||
:return: The best result dataframe.
|
||||
It contains the previous result columns and the prompt maker's result column, which is 'prompts'.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
node_dir = os.path.join(node_line_dir, "prompt_maker")
|
||||
if not os.path.exists(node_dir):
|
||||
os.makedirs(node_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
|
||||
# run modules
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
# get average token usage
|
||||
token_usages = []
|
||||
for i, result in enumerate(results):
|
||||
token_logger = tokenlog.getLogger(
|
||||
f"prompt_maker_{i}", strategies.get("tokenizer", "gpt2")
|
||||
)
|
||||
token_logger.query_batch(result["prompts"].tolist())
|
||||
token_usages.append(token_logger.get_token_usage() / len(result))
|
||||
|
||||
# save results to folder
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
# make summary file
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
"average_prompt_token": token_usages,
|
||||
}
|
||||
)
|
||||
|
||||
metric_names, metric_params = cast_metrics(strategies.get("metrics"))
|
||||
|
||||
# Run evaluation when there are more than one module.
|
||||
if len(modules) > 1:
|
||||
# pop general keys from strategies (e.g. metrics, speed_threshold)
|
||||
general_key = ["metrics", "speed_threshold", "token_threshold", "tokenizer"]
|
||||
general_strategy = dict(
|
||||
filter(lambda x: x[0] in general_key, strategies.items())
|
||||
)
|
||||
extra_strategy = dict(
|
||||
filter(lambda x: x[0] not in general_key, strategies.items())
|
||||
)
|
||||
|
||||
# first, filter by threshold if it is enabled.
|
||||
if general_strategy.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, general_strategy["speed_threshold"], filenames
|
||||
)
|
||||
|
||||
# Calculate tokens and save to summary
|
||||
if general_strategy.get("token_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, token_usages, general_strategy["token_threshold"], filenames
|
||||
)
|
||||
|
||||
# run metrics before filtering
|
||||
if metric_names is None or len(metric_names) <= 0:
|
||||
raise ValueError(
|
||||
"You must at least one metrics for prompt maker evaluation."
|
||||
)
|
||||
|
||||
# get generator modules from strategy
|
||||
generator_callables, generator_params = make_generator_callable_params(
|
||||
extra_strategy
|
||||
)
|
||||
|
||||
# get generation_gt
|
||||
qa_data = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)
|
||||
validate_qa_dataset(qa_data)
|
||||
generation_gt = qa_data["generation_gt"].tolist()
|
||||
generation_gt = list(map(lambda x: x.tolist(), generation_gt))
|
||||
|
||||
metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]
|
||||
|
||||
all_prompts = []
|
||||
for result in results:
|
||||
all_prompts.extend(result["prompts"].tolist())
|
||||
|
||||
evaluation_result_all = evaluate_one_prompt_maker_node(
|
||||
all_prompts,
|
||||
generator_callables,
|
||||
generator_params,
|
||||
metric_inputs * len(results),
|
||||
general_strategy["metrics"],
|
||||
project_dir,
|
||||
strategy_name=strategies.get("strategy", "mean"),
|
||||
)
|
||||
evaluation_results = split_dataframe(
|
||||
evaluation_result_all, chunk_size=len(results[0])
|
||||
)
|
||||
|
||||
evaluation_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
**{
|
||||
f"prompt_maker_{metric_name}": list(
|
||||
map(lambda x: x[metric_name].mean(), evaluation_results)
|
||||
)
|
||||
for metric_name in metric_names
|
||||
},
|
||||
}
|
||||
)
|
||||
summary_df = pd.merge(
|
||||
on="filename", left=summary_df, right=evaluation_df, how="left"
|
||||
)
|
||||
|
||||
best_result, best_filename = select_best(
|
||||
evaluation_results,
|
||||
metric_names,
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
# change metric name columns to prompt_maker_metric_name
|
||||
best_result = best_result.rename(
|
||||
columns={
|
||||
metric_name: f"prompt_maker_{metric_name}"
|
||||
for metric_name in metric_names
|
||||
}
|
||||
)
|
||||
best_result = best_result.drop(columns=["generated_texts"])
|
||||
else:
|
||||
best_result, best_filename = results[0], filenames[0]
|
||||
|
||||
# add 'is_best' column at summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == best_filename
|
||||
|
||||
best_result = pd.concat([previous_result, best_result], axis=1)
|
||||
|
||||
# save files
|
||||
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
|
||||
best_result.to_parquet(
|
||||
os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"),
|
||||
index=False,
|
||||
)
|
||||
|
||||
return best_result
|
||||
|
||||
|
||||
def make_generator_callable_params(strategy_dict: Dict):
|
||||
node_dict = deepcopy(strategy_dict)
|
||||
generator_module_list: Optional[List[Dict]] = node_dict.pop(
|
||||
"generator_modules", None
|
||||
)
|
||||
if generator_module_list is None:
|
||||
generator_module_list = [
|
||||
{
|
||||
"module_type": "llama_index_llm",
|
||||
"llm": "openai",
|
||||
"model": "gpt-3.5-turbo",
|
||||
}
|
||||
]
|
||||
node_params = node_dict
|
||||
modules = list(
|
||||
map(
|
||||
lambda module_dict: get_support_modules(module_dict.pop("module_type")),
|
||||
generator_module_list,
|
||||
)
|
||||
)
|
||||
param_combinations = list(
|
||||
map(
|
||||
lambda module_dict: make_combinations({**module_dict, **node_params}),
|
||||
generator_module_list,
|
||||
)
|
||||
)
|
||||
return explode(modules, param_combinations)
|
||||
|
||||
|
||||
def evaluate_one_prompt_maker_node(
|
||||
prompts: List[str],
|
||||
generator_classes: List,
|
||||
generator_params: List[Dict],
|
||||
metric_inputs: List[MetricInput],
|
||||
metrics: Union[List[str], List[Dict]],
|
||||
project_dir,
|
||||
strategy_name: str,
|
||||
) -> pd.DataFrame:
|
||||
input_df = pd.DataFrame({"prompts": prompts})
|
||||
generator_results = list(
|
||||
map(
|
||||
lambda x: x[0].run_evaluator(
|
||||
project_dir=project_dir, previous_result=input_df, **x[1]
|
||||
),
|
||||
zip(generator_classes, generator_params),
|
||||
)
|
||||
)
|
||||
evaluation_results = list(
|
||||
map(
|
||||
lambda x: evaluate_generator_result(x[0], metric_inputs, metrics),
|
||||
zip(generator_results, generator_classes),
|
||||
)
|
||||
)
|
||||
metric_names = (
|
||||
list(map(lambda x: x["metric_name"], metrics))
|
||||
if isinstance(metrics[0], dict)
|
||||
else metrics
|
||||
)
|
||||
best_result, _ = select_best(
|
||||
evaluation_results, metric_names, strategy_name=strategy_name
|
||||
)
|
||||
best_result = pd.concat([input_df, best_result], axis=1)
|
||||
return best_result # it has 'generated_texts' column
|
||||
|
||||
|
||||
def evaluate_generator_result(
|
||||
result_df: pd.DataFrame,
|
||||
metric_inputs: List[MetricInput],
|
||||
metrics: Union[List[str], List[Dict]],
|
||||
) -> pd.DataFrame:
|
||||
@evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
|
||||
def evaluate(df):
|
||||
return df["generated_texts"].tolist()
|
||||
|
||||
return evaluate(result_df)
|
||||
85
autorag/nodes/promptmaker/window_replacement.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.promptmaker.base import BasePromptMaker
|
||||
from autorag.utils import result_to_dataframe, fetch_contents
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class WindowReplacement(BasePromptMaker):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
super().__init__(project_dir, *args, **kwargs)
|
||||
# load corpus
|
||||
data_dir = os.path.join(project_dir, "data")
|
||||
self.corpus_data = pd.read_parquet(
|
||||
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
@result_to_dataframe(["prompts"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
query, retrieved_contents, prompt = self.cast_to_run(
|
||||
previous_result, *args, **kwargs
|
||||
)
|
||||
retrieved_ids = previous_result["retrieved_ids"].tolist()
|
||||
# get metadata from corpus
|
||||
retrieved_metadata = fetch_contents(
|
||||
self.corpus_data, retrieved_ids, column_name="metadata"
|
||||
)
|
||||
return self._pure(prompt, query, retrieved_contents, retrieved_metadata)
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
prompt: str,
|
||||
queries: List[str],
|
||||
retrieved_contents: List[List[str]],
|
||||
retrieved_metadata: List[List[Dict]],
|
||||
) -> List[str]:
|
||||
"""
|
||||
Replace retrieved_contents with the sentence-window text stored in the metadata to create a prompt.
(Only available for a corpus chunked with the sentence window method.)
|
||||
You must specify a prompt or a list of prompts in the config YAML file like this:
|
||||
|
||||
.. Code:: yaml
|
||||
nodes:
|
||||
- node_type: prompt_maker
|
||||
modules:
|
||||
- module_type: window_replacement
|
||||
prompt: [Answer this question: {query} \n\n {retrieved_contents},
|
||||
Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}]
|
||||
|
||||
:param prompt: A prompt string.
|
||||
:param queries: List of query strings.
|
||||
:param retrieved_contents: List of retrieved contents.
|
||||
:param retrieved_metadata: List of retrieved metadata.
|
||||
:return: Prompts that are made by window_replacement.
|
||||
"""
|
||||
|
||||
def window_replacement_row(
|
||||
_prompt: str,
|
||||
_query: str,
|
||||
_retrieved_contents,
|
||||
_retrieved_metadata: List[Dict],
|
||||
) -> str:
|
||||
window_list = []
|
||||
for content, metadata in zip(_retrieved_contents, _retrieved_metadata):
|
||||
if "window" in metadata:
|
||||
window_list.append(metadata["window"])
|
||||
else:
|
||||
window_list.append(content)
|
||||
logger.info(
|
||||
"Only available for corpus chunked with Sentence window method."
|
||||
"window_replacement will not proceed."
|
||||
)
|
||||
contents_str = "\n\n".join(window_list)
|
||||
return _prompt.format(query=_query, retrieved_contents=contents_str)
|
||||
|
||||
return list(
|
||||
map(
|
||||
lambda x: window_replacement_row(prompt, x[0], x[1], x[2]),
|
||||
zip(queries, retrieved_contents, retrieved_metadata),
|
||||
)
|
||||
)
|
||||
4
autorag/nodes/queryexpansion/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .hyde import HyDE
|
||||
from .multi_query_expansion import MultiQueryExpansion
|
||||
from .pass_query_expansion import PassQueryExpansion
|
||||
from .query_decompose import QueryDecompose
|
||||
62
autorag/nodes/queryexpansion/base.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import abc
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.util import make_generator_callable_param
|
||||
from autorag.schema import BaseModule
|
||||
from autorag.utils import validate_qa_dataset
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BaseQueryExpansion(BaseModule, metaclass=abc.ABCMeta):
|
||||
def __init__(self, project_dir: Union[str, Path], *args, **kwargs):
|
||||
logger.info(
|
||||
f"Initialize query expansion node - {self.__class__.__name__} module..."
|
||||
)
|
||||
# set generator module for query expansion
|
||||
generator_class, generator_param = make_generator_callable_param(kwargs)
|
||||
self.generator = generator_class(project_dir, **generator_param)
|
||||
|
||||
def __del__(self):
|
||||
del self.generator
|
||||
logger.info(
|
||||
f"Delete query expansion node - {self.__class__.__name__} module..."
|
||||
)
|
||||
|
||||
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
logger.info(
|
||||
f"Running query expansion node - {self.__class__.__name__} module..."
|
||||
)
|
||||
validate_qa_dataset(previous_result)
|
||||
|
||||
# find queries columns
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
queries = previous_result["query"].tolist()
|
||||
return queries
|
||||
|
||||
@staticmethod
|
||||
def _check_expanded_query(queries: List[str], expanded_queries: List[List[str]]):
|
||||
return list(
|
||||
map(
|
||||
lambda query, expanded_query_list: check_expanded_query(
|
||||
query, expanded_query_list
|
||||
),
|
||||
queries,
|
||||
expanded_queries,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def check_expanded_query(query: str, expanded_query_list: List[str]):
|
||||
# fall back to the original query when the expanded query is empty
|
||||
expanded_query_list = list(map(lambda x: x.strip(), expanded_query_list))
|
||||
return [
|
||||
expanded_query if expanded_query else query
|
||||
for expanded_query in expanded_query_list
|
||||
]
|
||||
43
autorag/nodes/queryexpansion/hyde.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
hyde_prompt = "Please write a passage to answer the question"
|
||||
|
||||
|
||||
class HyDE(BaseQueryExpansion):
|
||||
@result_to_dataframe(["queries"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries = self.cast_to_run(previous_result, *args, **kwargs)
|
||||
|
||||
# pop prompt from kwargs
|
||||
prompt = kwargs.pop("prompt", hyde_prompt)
|
||||
kwargs.pop("generator_module_type", None)
|
||||
|
||||
expanded_queries = self._pure(queries, prompt, **kwargs)
|
||||
return self._check_expanded_query(queries, expanded_queries)
|
||||
|
||||
def _pure(self, queries: List[str], prompt: str = hyde_prompt, **generator_params):
|
||||
"""
|
||||
HyDE is inspired by "Precise Zero-shot Dense Retrieval without Relevance Labels" (https://arxiv.org/pdf/2212.10496.pdf).
The LLM creates a hypothetical passage for each query.
Then, passages are retrieved using the hypothetical passage as the query.
|
||||
:param queries: List[str], queries to retrieve.
|
||||
:param prompt: Prompt to use when generating hypothetical passage
|
||||
:return: List[List[str]], List of hyde results.
|
||||
"""
|
||||
full_prompts = list(
|
||||
map(
|
||||
lambda x: (prompt if bool(prompt) else hyde_prompt)
|
||||
+ f"\nQuestion: {x}\nPassage:",
|
||||
queries,
|
||||
)
|
||||
)
|
||||
input_df = pd.DataFrame({"prompts": full_prompts})
|
||||
result_df = self.generator.pure(previous_result=input_df, **generator_params)
|
||||
answers = result_df["generated_texts"].tolist()
|
||||
results = list(map(lambda x: [x], answers))
|
||||
return results
|
||||
57
autorag/nodes/queryexpansion/multi_query_expansion.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
multi_query_expansion_prompt = """You are an AI language model assistant.
|
||||
Your task is to generate 3 different versions of the given user
|
||||
question to retrieve relevant documents from a vector database.
|
||||
By generating multiple perspectives on the user question,
|
||||
your goal is to help the user overcome some of the limitations
|
||||
of distance-based similarity search. Provide these alternative
|
||||
questions separated by newlines. Original question: {query}"""
|
||||
|
||||
|
||||
class MultiQueryExpansion(BaseQueryExpansion):
|
||||
@result_to_dataframe(["queries"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries = self.cast_to_run(previous_result, *args, **kwargs)
|
||||
|
||||
# pop prompt from kwargs
|
||||
prompt = kwargs.pop("prompt", multi_query_expansion_prompt)
|
||||
kwargs.pop("generator_module_type", None)
|
||||
|
||||
expanded_queries = self._pure(queries, prompt, **kwargs)
|
||||
return self._check_expanded_query(queries, expanded_queries)
|
||||
|
||||
def _pure(
|
||||
self, queries, prompt: str = multi_query_expansion_prompt, **kwargs
|
||||
) -> List[List[str]]:
|
||||
"""
|
||||
Expand a list of queries using a multi-query expansion approach.
|
||||
The LLM generates 3 different versions of each input query.
|
||||
|
||||
:param queries: List[str], queries to decompose.
|
||||
:param prompt: str, prompt to use for multi-query expansion.
|
||||
default prompt comes from langchain MultiQueryRetriever default query prompt.
|
||||
:return: List[List[str]], list of expansion query.
|
||||
"""
|
||||
full_prompts = list(map(lambda x: prompt.format(query=x), queries))
|
||||
input_df = pd.DataFrame({"prompts": full_prompts})
|
||||
result_df = self.generator.pure(previous_result=input_df, **kwargs)
|
||||
answers = result_df["generated_texts"].tolist()
|
||||
results = list(
|
||||
map(lambda x: get_multi_query_expansion(x[0], x[1]), zip(queries, answers))
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def get_multi_query_expansion(query: str, answer: str) -> List[str]:
|
||||
try:
|
||||
queries = answer.split("\n")
|
||||
queries.insert(0, query)
|
||||
return queries
|
||||
except:
|
||||
return [query]
|
||||
22
autorag/nodes/queryexpansion/pass_query_expansion.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassQueryExpansion(BaseQueryExpansion):
|
||||
@result_to_dataframe(["queries"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
"""
|
||||
Do not perform query expansion.
|
||||
Return the same queries without any expansion.
The result will be a 2-d list, and the column name will be 'queries'.
|
||||
"""
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
queries = previous_result["query"].tolist()
|
||||
return list(map(lambda x: [x], queries))
|
||||
|
||||
def _pure(self, *args, **kwargs):
|
||||
pass
|
||||
111
autorag/nodes/queryexpansion/query_decompose.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.queryexpansion.base import BaseQueryExpansion
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
decompose_prompt = """Decompose a question in self-contained sub-questions. Use \"The question needs no decomposition\" when no decomposition is needed.
|
||||
|
||||
Example 1:
|
||||
|
||||
Question: Is Hamlet more common on IMDB than Comedy of Errors?
|
||||
Decompositions:
|
||||
1: How many listings of Hamlet are there on IMDB?
|
||||
2: How many listing of Comedy of Errors is there on IMDB?
|
||||
|
||||
Example 2:
|
||||
|
||||
Question: Are birds important to badminton?
|
||||
|
||||
Decompositions:
|
||||
The question needs no decomposition
|
||||
|
||||
Example 3:
|
||||
|
||||
Question: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?
|
||||
|
||||
Decompositions:
|
||||
1: What is the minimum driving age in the US?
|
||||
2: What is the minimum age for someone to be employed in the US?
|
||||
|
||||
Example 4:
|
||||
|
||||
Question: Are all cucumbers the same texture?
|
||||
|
||||
Decompositions:
|
||||
The question needs no decomposition
|
||||
|
||||
Example 5:
|
||||
|
||||
Question: Hydrogen's atomic number squared exceeds number of Spice Girls?
|
||||
|
||||
Decompositions:
|
||||
1: What is the atomic number of hydrogen?
|
||||
2: How many Spice Girls are there?
|
||||
|
||||
Example 6:
|
||||
|
||||
Question: {question}
|
||||
|
||||
Decompositions:
|
||||
"""
|
||||
|
||||
|
||||
class QueryDecompose(BaseQueryExpansion):
|
||||
@result_to_dataframe(["queries"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries = self.cast_to_run(previous_result, *args, **kwargs)
|
||||
|
||||
# pop prompt from kwargs
|
||||
prompt = kwargs.pop("prompt", decompose_prompt)
|
||||
kwargs.pop("generator_module_type", None)
|
||||
|
||||
expanded_queries = self._pure(queries, prompt, **kwargs)
|
||||
return self._check_expanded_query(queries, expanded_queries)
|
||||
|
||||
def _pure(
|
||||
self, queries: List[str], prompt: str = decompose_prompt, *args, **kwargs
|
||||
) -> List[List[str]]:
|
||||
"""
|
||||
Decompose each query into smaller, self-contained sub-questions.
|
||||
:param queries: List[str], queries to decompose.
|
||||
:param prompt: str, prompt to use for query decomposition.
|
||||
default prompt comes from Visconde's StrategyQA few-shot prompt.
|
||||
:return: List[List[str]], list of decomposed query. Return input query if query is not decomposable.
|
||||
"""
|
||||
full_prompts = []
|
||||
for query in queries:
|
||||
if bool(prompt):
|
||||
full_prompt = f"prompt: {prompt}\n\n question: {query}"
|
||||
else:
|
||||
full_prompt = decompose_prompt.format(question=query)
|
||||
full_prompts.append(full_prompt)
|
||||
input_df = pd.DataFrame({"prompts": full_prompts})
|
||||
result_df = self.generator.pure(previous_result=input_df, *args, **kwargs)
|
||||
answers = result_df["generated_texts"].tolist()
|
||||
results = list(
|
||||
map(lambda x: get_query_decompose(x[0], x[1]), zip(queries, answers))
|
||||
)
|
||||
return results
|
||||
|
||||
|
||||
def get_query_decompose(query: str, answer: str) -> List[str]:
|
||||
"""
|
||||
Parse the answer of the decomposition prompt into a list of sub-questions.
:param query: str, the original query.
:param answer: str, the answer from the query decomposition generator.
:return: List[str], list of decomposed sub-questions. Returns the input query if it is not decomposable.
|
||||
"""
|
||||
if answer.lower() == "the question needs no decomposition":
|
||||
return [query]
|
||||
try:
|
||||
lines = [line.strip() for line in answer.splitlines() if line.strip()]
|
||||
if lines[0].startswith("Decompositions:"):
|
||||
lines.pop(0)
|
||||
questions = [line.split(":", 1)[1].strip() for line in lines if ":" in line]
|
||||
if not questions:
|
||||
return [query]
|
||||
return questions
|
||||
except:
|
||||
return [query]
|
||||
276
autorag/nodes/queryexpansion/run.py
Normal file
@@ -0,0 +1,276 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
from copy import deepcopy
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.run import evaluate_retrieval_node
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.support import get_support_modules
|
||||
from autorag.utils.util import make_combinations, explode
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def run_query_expansion_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among query expansion node results.
|
||||
Initially, retrieval is run using expanded_queries, the result of the query_expansion module.
|
||||
The retrieval module is run as a combination of the retrieval_modules in strategies.
|
||||
If there are multiple retrieval_modules, run them all and choose the best result.
|
||||
If no retrieval_modules are given, bm25 is used as the default.
|
||||
In this way, the best retrieval result is selected for each query expansion module, and then the best module is selected among them.
|
||||
|
||||
:param modules: Query expansion modules to run.
|
||||
:param module_params: Query expansion module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
In this case, it would be qa data.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for query expansion node.
|
||||
:return: The best result dataframe.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
node_dir = os.path.join(node_line_dir, "query_expansion")
|
||||
if not os.path.exists(node_dir):
|
||||
os.makedirs(node_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
|
||||
# run query expansion
|
||||
results, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(results[0]), execution_times))
|
||||
|
||||
# save results to folder
|
||||
pseudo_module_params = deepcopy(module_params)
|
||||
for i, module_param in enumerate(pseudo_module_params):
|
||||
if "prompt" in module_params:
|
||||
module_param["prompt"] = str(i)
|
||||
filepaths = list(
|
||||
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
|
||||
)
|
||||
list(
|
||||
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
|
||||
) # execute save to parquet
|
||||
filenames = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
# make summary file
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
"module_name": list(map(lambda module: module.__name__, modules)),
|
||||
"module_params": module_params,
|
||||
"execution_time": average_times,
|
||||
}
|
||||
)
|
||||
|
||||
# Run evaluation when there are more than one module.
|
||||
if len(modules) > 1:
|
||||
# pop general keys from strategies (e.g. metrics, speed_threshold)
|
||||
general_key = ["metrics", "speed_threshold", "strategy"]
|
||||
general_strategy = dict(
|
||||
filter(lambda x: x[0] in general_key, strategies.items())
|
||||
)
|
||||
extra_strategy = dict(
|
||||
filter(lambda x: x[0] not in general_key, strategies.items())
|
||||
)
|
||||
|
||||
# first, filter by threshold if it is enabled.
|
||||
if general_strategy.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, general_strategy["speed_threshold"], filenames
|
||||
)
|
||||
|
||||
# check metrics in strategy
|
||||
if general_strategy.get("metrics") is None:
|
||||
raise ValueError(
|
||||
"You must at least one metrics for query expansion evaluation."
|
||||
)
|
||||
|
||||
if extra_strategy.get("top_k") is None:
|
||||
extra_strategy["top_k"] = 10 # default value
|
||||
|
||||
# get retrieval modules from strategy
|
||||
retrieval_callables, retrieval_params = make_retrieval_callable_params(
|
||||
extra_strategy
|
||||
)
|
||||
|
||||
# get retrieval_gt
|
||||
retrieval_gt = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)["retrieval_gt"].tolist()
|
||||
|
||||
# make rows to metric_inputs
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
|
||||
for ret_gt, query, gen_gt in zip(
|
||||
retrieval_gt,
|
||||
previous_result["query"].tolist(),
|
||||
previous_result["generation_gt"].tolist(),
|
||||
)
|
||||
]
|
||||
|
||||
# run evaluation
|
||||
evaluation_results = list(
|
||||
map(
|
||||
lambda result: evaluate_one_query_expansion_node(
|
||||
retrieval_callables,
|
||||
retrieval_params,
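# Attach this module's expanded queries to each MetricInput in place
# (setattr returns None, so the `or` expression yields the updated MetricInput).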
|
||||
[
|
||||
setattr(metric_input, "queries", queries) or metric_input
|
||||
for metric_input, queries in zip(
|
||||
metric_inputs, result["queries"].to_list()
|
||||
)
|
||||
],
|
||||
general_strategy["metrics"],
|
||||
project_dir,
|
||||
previous_result,
|
||||
general_strategy.get("strategy", "mean"),
|
||||
),
|
||||
results,
|
||||
)
|
||||
)
|
||||
|
||||
evaluation_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filenames,
|
||||
**{
|
||||
f"query_expansion_{metric_name}": list(
|
||||
map(lambda x: x[metric_name].mean(), evaluation_results)
|
||||
)
|
||||
for metric_name in general_strategy["metrics"]
|
||||
},
|
||||
}
|
||||
)
|
||||
summary_df = pd.merge(
|
||||
on="filename", left=summary_df, right=evaluation_df, how="left"
|
||||
)
|
||||
|
||||
best_result, best_filename = select_best(
|
||||
evaluation_results,
|
||||
general_strategy["metrics"],
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
# change metric name columns to query_expansion_metric_name
|
||||
best_result = best_result.rename(
|
||||
columns={
|
||||
metric_name: f"query_expansion_{metric_name}"
|
||||
for metric_name in strategies["metrics"]
|
||||
}
|
||||
)
|
||||
best_result = best_result.drop(
|
||||
columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
else:
|
||||
best_result, best_filename = results[0], filenames[0]
|
||||
best_result = pd.concat([previous_result, best_result], axis=1)
|
||||
|
||||
# add 'is_best' column at summary file
|
||||
summary_df["is_best"] = summary_df["filename"] == best_filename
|
||||
|
||||
# save files
|
||||
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
|
||||
best_result.to_parquet(
|
||||
os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"),
|
||||
index=False,
|
||||
)
|
||||
|
||||
return best_result
|
||||
|
||||
|
||||
def evaluate_one_query_expansion_node(
|
||||
retrieval_funcs: List,
|
||||
retrieval_params: List[Dict],
|
||||
metric_inputs: List[MetricInput],
|
||||
metrics: List[str],
|
||||
project_dir,
|
||||
previous_result: pd.DataFrame,
|
||||
strategy_name: str,
|
||||
) -> pd.DataFrame:
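# Run every retrieval (callable, params) combination with the expanded queries,
# evaluate each run with the retrieval metrics, and return the best retrieval result
# concatenated with the previous result.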
|
||||
previous_result["queries"] = [
|
||||
metric_input.queries for metric_input in metric_inputs
|
||||
]
|
||||
retrieval_results = list(
|
||||
map(
|
||||
lambda x: x[0].run_evaluator(
|
||||
project_dir=project_dir, previous_result=previous_result, **x[1]
|
||||
),
|
||||
zip(retrieval_funcs, retrieval_params),
|
||||
)
|
||||
)
|
||||
evaluation_results = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
metric_inputs,
|
||||
metrics,
|
||||
),
|
||||
retrieval_results,
|
||||
)
|
||||
)
|
||||
best_result, _ = select_best(
|
||||
evaluation_results, metrics, strategy_name=strategy_name
|
||||
)
|
||||
best_result = pd.concat([previous_result, best_result], axis=1)
|
||||
return best_result
|
||||
|
||||
|
||||
def make_retrieval_callable_params(strategy_dict: Dict):
|
||||
"""
|
||||
strategy_dict looks like this:
|
||||
|
||||
.. Code:: json
|
||||
|
||||
{
|
||||
"metrics": ["retrieval_f1", "retrieval_recall"],
|
||||
"top_k": 50,
|
||||
"retrieval_modules": [
|
||||
{"module_type": "bm25"},
|
||||
{"module_type": "vectordb", "embedding_model": ["openai", "huggingface"]}
|
||||
]
|
||||
}
|
||||
|
||||
"""
|
||||
node_dict = deepcopy(strategy_dict)
|
||||
retrieval_module_list: Optional[List[Dict]] = node_dict.pop(
|
||||
"retrieval_modules", None
|
||||
)
|
||||
if retrieval_module_list is None:
|
||||
retrieval_module_list = [
|
||||
{
|
||||
"module_type": "bm25",
|
||||
}
|
||||
]
|
||||
node_params = node_dict
|
||||
modules = list(
|
||||
map(
|
||||
lambda module_dict: get_support_modules(module_dict.pop("module_type")),
|
||||
retrieval_module_list,
|
||||
)
|
||||
)
|
||||
param_combinations = list(
|
||||
map(
|
||||
lambda module_dict: make_combinations({**module_dict, **node_params}),
|
||||
retrieval_module_list,
|
||||
)
|
||||
)
|
||||
return explode(modules, param_combinations)
|
||||
4
autorag/nodes/retrieval/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .bm25 import BM25
|
||||
from .hybrid_cc import HybridCC
|
||||
from .hybrid_rrf import HybridRRF
|
||||
from .vectordb import VectorDB
|
||||
127
autorag/nodes/retrieval/base.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import abc
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Union, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.schema import BaseModule
|
||||
from autorag.support import get_support_modules
|
||||
from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset
|
||||
from autorag.utils.util import pop_params
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
logger.info(f"Initialize retrieval node - {self.__class__.__name__}")
|
||||
|
||||
self.resources_dir = os.path.join(project_dir, "resources")
|
||||
data_dir = os.path.join(project_dir, "data")
|
||||
# fetch data from corpus_data
|
||||
self.corpus_df = pd.read_parquet(
|
||||
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
def __del__(self):
|
||||
logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...")
|
||||
|
||||
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
logger.info(f"Running retrieval node - {self.__class__.__name__} module...")
|
||||
validate_qa_dataset(previous_result)
|
||||
# find queries columns & type cast queries
|
||||
assert (
|
||||
"query" in previous_result.columns
|
||||
), "previous_result must have query column."
|
||||
if "queries" not in previous_result.columns:
|
||||
previous_result["queries"] = previous_result["query"]
|
||||
previous_result.loc[:, "queries"] = previous_result["queries"].apply(
|
||||
cast_queries
|
||||
)
|
||||
queries = previous_result["queries"].tolist()
|
||||
return queries
|
||||
|
||||
|
||||
class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
|
||||
def __init__(
|
||||
self, project_dir: str, target_modules, target_module_params, *args, **kwargs
|
||||
):
|
||||
super().__init__(project_dir)
|
||||
self.target_modules = list(
|
||||
map(
|
||||
lambda x, y: get_support_modules(x)(
|
||||
**y,
|
||||
project_dir=project_dir,
|
||||
),
|
||||
target_modules,
|
||||
target_module_params,
|
||||
)
|
||||
)
|
||||
self.target_module_params = target_module_params
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
result_dfs: List[pd.DataFrame] = list(
|
||||
map(
|
||||
lambda x, y: x.pure(
|
||||
**y,
|
||||
previous_result=previous_result,
|
||||
),
|
||||
self.target_modules,
|
||||
self.target_module_params,
|
||||
)
|
||||
)
|
||||
ids = tuple(
|
||||
map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs)
|
||||
)
|
||||
scores = tuple(
|
||||
map(
|
||||
lambda df: df["retrieve_scores"].apply(list).tolist(),
|
||||
result_dfs,
|
||||
)
|
||||
)
|
||||
|
||||
_pure_params = pop_params(self._pure, kwargs)
|
||||
if "ids" in _pure_params or "scores" in _pure_params:
|
||||
raise ValueError(
|
||||
"With specifying ids or scores, you must use HybridRRF.run_evaluator instead."
|
||||
)
|
||||
ids, scores = self._pure(ids=ids, scores=scores, **_pure_params)
|
||||
contents = fetch_contents(self.corpus_df, ids)
|
||||
return contents, ids, scores
|
||||
|
||||
|
||||
def cast_queries(queries: Union[str, List[str]]) -> List[str]:
|
||||
if isinstance(queries, str):
|
||||
return [queries]
|
||||
elif isinstance(queries, List):
|
||||
return queries
|
||||
else:
|
||||
raise ValueError(f"queries must be str or list, but got {type(queries)}")
|
||||
|
||||
|
||||
def evenly_distribute_passages(
|
||||
ids: List[List[str]], scores: List[List[float]], top_k: int
|
||||
) -> Tuple[List[str], List[float]]:
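# Spread the top_k slots evenly across the per-query results; the first `remainder`
# queries contribute one extra passage. For example (illustrative), top_k=5 over two
# query results keeps 3 passages from the first and 2 from the second.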
|
||||
assert len(ids) == len(scores), "ids and scores must have same length."
|
||||
query_cnt = len(ids)
|
||||
avg_len = top_k // query_cnt
|
||||
remainder = top_k % query_cnt
|
||||
|
||||
new_ids = []
|
||||
new_scores = []
|
||||
for i in range(query_cnt):
|
||||
if i < remainder:
|
||||
new_ids.extend(ids[i][: avg_len + 1])
|
||||
new_scores.extend(scores[i][: avg_len + 1])
|
||||
else:
|
||||
new_ids.extend(ids[i][:avg_len])
|
||||
new_scores.extend(scores[i][:avg_len])
|
||||
|
||||
return new_ids, new_scores
|
||||
|
||||
|
||||
def get_bm25_pkl_name(bm25_tokenizer: str):
|
||||
bm25_tokenizer = bm25_tokenizer.replace("/", "")
|
||||
return f"bm25_{bm25_tokenizer}.pkl"
|
||||
365
autorag/nodes/retrieval/bm25.py
Normal file
@@ -0,0 +1,365 @@
|
||||
import asyncio
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords
|
||||
from nltk import PorterStemmer
|
||||
from rank_bm25 import BM25Okapi
|
||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
||||
|
||||
from autorag.nodes.retrieval.base import (
|
||||
evenly_distribute_passages,
|
||||
BaseRetrieval,
|
||||
get_bm25_pkl_name,
|
||||
)
|
||||
from autorag.utils import validate_corpus_dataset, fetch_contents
|
||||
from autorag.utils.util import (
|
||||
get_event_loop,
|
||||
normalize_string,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
)
|
||||
|
||||
|
||||
def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]:
|
||||
try:
|
||||
from kiwipiepy import Kiwi, Token
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
|
||||
"Please install kiwipiepy by running 'pip install kiwipiepy'. "
|
||||
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
|
||||
)
|
||||
texts = list(map(lambda x: x.strip().lower(), texts))
|
||||
kiwi = Kiwi()
|
||||
tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts)
|
||||
return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list]
|
||||
|
||||
|
||||
def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]:
|
||||
try:
|
||||
from konlpy.tag import Kkma
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You need to install konlpy to use 'ko_kkma' tokenizer. "
|
||||
"Please install konlpy by running 'pip install konlpy'. "
|
||||
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
|
||||
)
|
||||
tokenizer = Kkma()
|
||||
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
|
||||
return tokenized_list
|
||||
|
||||
|
||||
def tokenize_ko_okt(texts: List[str]) -> List[List[str]]:
|
||||
try:
|
||||
from konlpy.tag import Okt
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You need to install konlpy to use 'ko_kkma' tokenizer. "
|
||||
"Please install konlpy by running 'pip install konlpy'. "
|
||||
"Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
|
||||
)
|
||||
tokenizer = Okt()
|
||||
tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
|
||||
return tokenized_list
|
||||
|
||||
|
||||
def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]:
|
||||
def tokenize_remove_stopword(text: str, stemmer) -> List[str]:
|
||||
text = text.lower()
|
||||
words = list(simple_extract_keywords(text))
|
||||
return [stemmer.stem(word) for word in words]
|
||||
|
||||
stemmer = PorterStemmer()
|
||||
tokenized_list: List[List[str]] = list(
|
||||
map(lambda x: tokenize_remove_stopword(x, stemmer), texts)
|
||||
)
|
||||
return tokenized_list
|
||||
|
||||
|
||||
def tokenize_space(texts: List[str]) -> List[List[str]]:
|
||||
def tokenize_space_text(text: str) -> List[str]:
|
||||
text = normalize_string(text)
|
||||
return re.split(r"\s+", text.strip())
|
||||
|
||||
return list(map(tokenize_space_text, texts))
|
||||
|
||||
|
||||
def load_bm25_corpus(bm25_path: str) -> Dict:
|
||||
if bm25_path is None:
|
||||
return {}
|
||||
with open(bm25_path, "rb") as f:
|
||||
bm25_corpus = pickle.load(f)
|
||||
return bm25_corpus
|
||||
|
||||
|
||||
def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]:
|
||||
try:
|
||||
from sudachipy import dictionary, tokenizer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"You need to install SudachiPy to use 'sudachipy' tokenizer. "
|
||||
"Please install SudachiPy by running 'pip install sudachipy'."
|
||||
)
|
||||
|
||||
# Initialize SudachiPy with the default tokenizer
|
||||
tokenizer_obj = dictionary.Dictionary(dict="core").create()
|
||||
|
||||
# Choose the tokenizer mode: NORMAL, SEARCH, A
|
||||
mode = tokenizer.Tokenizer.SplitMode.A
|
||||
|
||||
# Tokenize the input texts
|
||||
tokenized_list = []
|
||||
for text in texts:
|
||||
tokens = tokenizer_obj.tokenize(text, mode)
|
||||
tokenized_list.append([token.surface() for token in tokens])
|
||||
|
||||
return tokenized_list
|
||||
|
||||
|
||||
BM25_TOKENIZER = {
|
||||
"porter_stemmer": tokenize_porter_stemmer,
|
||||
"ko_kiwi": tokenize_ko_kiwi,
|
||||
"space": tokenize_space,
|
||||
"ko_kkma": tokenize_ko_kkma,
|
||||
"ko_okt": tokenize_ko_okt,
|
||||
"sudachipy": tokenize_ja_sudachipy,
|
||||
}
|
||||
|
||||
|
||||
class BM25(BaseRetrieval):
|
||||
def __init__(self, project_dir: str, *args, **kwargs):
|
||||
"""
|
||||
Initialize BM25 module.
|
||||
(Retrieval)
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param bm25_tokenizer: The tokenizer name that is used for BM25.
|
||||
It supports 'porter_stemmer', 'ko_kiwi', and huggingface `AutoTokenizer`.
|
||||
You can pass huggingface tokenizer name.
|
||||
Default is porter_stemmer.
|
||||
:param kwargs: The optional arguments.
|
||||
"""
|
||||
|
||||
super().__init__(project_dir)
|
||||
# check if bm25_path and file exist
|
||||
bm25_tokenizer = kwargs.get("bm25_tokenizer", None)
|
||||
if bm25_tokenizer is None:
|
||||
bm25_tokenizer = "porter_stemmer"
|
||||
bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer))
|
||||
|
||||
assert (
|
||||
bm25_path is not None
|
||||
), "bm25_path must be specified for using bm25 retrieval."
|
||||
assert os.path.exists(
|
||||
bm25_path
|
||||
), f"bm25_path {bm25_path} does not exist. Please ingest first."
|
||||
|
||||
self.bm25_corpus = load_bm25_corpus(bm25_path)
|
||||
assert (
|
||||
"tokens" and "passage_id" in list(self.bm25_corpus.keys())
|
||||
), "bm25_corpus must contain tokens and passage_id. Please check you ingested bm25 corpus correctly."
|
||||
self.tokenizer = select_bm25_tokenizer(bm25_tokenizer)
|
||||
assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, (
|
||||
f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. "
|
||||
f"You need to ingest again. Delete bm25 pkl file and re-ingest it."
|
||||
)
|
||||
self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"])
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries = self.cast_to_run(previous_result)
|
||||
pure_params = pop_params(self._pure, kwargs)
|
||||
ids, scores = self._pure(queries, *args, **pure_params)
|
||||
contents = fetch_contents(self.corpus_df, ids)
|
||||
return contents, ids, scores
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[List[str]],
|
||||
top_k: int,
|
||||
ids: Optional[List[List[str]]] = None,
|
||||
) -> Tuple[List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
BM25 retrieval function.
|
||||
You have to load a pickle file that is already ingested.
|
||||
|
||||
:param queries: 2-d list of query strings.
|
||||
Each element of the list is the list of query strings for one row.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param ids: The optional list of ids that you want to retrieve.
|
||||
You don't need to specify this in the general use cases.
|
||||
Default is None.
|
||||
:return: A 2-d list of passage ids retrieved by bm25 and a 2-d list of their scores.
Both have the same length as queries, and each inner list has a length of top_k.
|
||||
"""
|
||||
if ids is not None:
|
||||
score_result = list(
|
||||
map(
|
||||
lambda query_list, id_list: get_bm25_scores(
|
||||
query_list,
|
||||
id_list,
|
||||
self.tokenizer,
|
||||
self.bm25_instance,
|
||||
self.bm25_corpus,
|
||||
),
|
||||
queries,
|
||||
ids,
|
||||
)
|
||||
)
|
||||
return ids, score_result
|
||||
|
||||
# run async bm25_pure function
|
||||
tasks = [
|
||||
bm25_pure(
|
||||
input_queries,
|
||||
top_k,
|
||||
self.tokenizer,
|
||||
self.bm25_instance,
|
||||
self.bm25_corpus,
|
||||
)
|
||||
for input_queries in queries
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(asyncio.gather(*tasks))
|
||||
id_result = list(map(lambda x: x[0], results))
|
||||
score_result = list(map(lambda x: x[1], results))
|
||||
return id_result, score_result
|
||||
|
||||
|
||||
async def bm25_pure(
|
||||
queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict
|
||||
) -> Tuple[List[str], List[float]]:
|
||||
"""
|
||||
Async BM25 retrieval function.
|
||||
Its usage is for async retrieval of bm25 row by row.
|
||||
|
||||
:param queries: A list of query strings.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param tokenizer: A tokenizer that will be used to tokenize queries.
|
||||
:param bm25_api: A bm25 api instance that will be used to retrieve passages.
|
||||
:param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus.
|
||||
Its data structure looks like this:
|
||||
|
||||
.. Code:: python
|
||||
|
||||
{
|
||||
"tokens": [], # 2d list of tokens
|
||||
"passage_id": [], # 2d list of passage_id. Type must be str.
|
||||
}
|
||||
:return: The tuple contains a list of passage ids that retrieved from bm25 and its scores.
|
||||
"""
|
||||
# The per-query loop is not made async because the number of queries per row is usually small, so async would only add overhead.
|
||||
tokenized_queries = tokenize(queries, tokenizer)
|
||||
id_result = []
|
||||
score_result = []
|
||||
for query in tokenized_queries:
|
||||
scores = bm25_api.get_scores(query)
|
||||
sorted_scores = sorted(scores, reverse=True)
|
||||
top_n_index = np.argsort(scores)[::-1][:top_k]
|
||||
ids = [bm25_corpus["passage_id"][i] for i in top_n_index]
|
||||
id_result.append(ids)
|
||||
score_result.append(sorted_scores[:top_k])
|
||||
|
||||
# make a total result to top_k
|
||||
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
|
||||
# sort id_result and score_result by score
|
||||
result = [
|
||||
(_id, score)
|
||||
for score, _id in sorted(
|
||||
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
|
||||
)
|
||||
]
|
||||
id_result, score_result = zip(*result)
|
||||
return list(id_result), list(score_result)
|
||||
|
||||
|
||||
def get_bm25_scores(
|
||||
queries: List[str],
|
||||
ids: List[str],
|
||||
tokenizer,
|
||||
bm25_api: BM25Okapi,
|
||||
bm25_corpus: Dict,
|
||||
) -> List[float]:
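# For every candidate id, collect its BM25 score against each query and return the
# maximum score per id, so a passage is ranked by its best-matching query.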
|
||||
if len(ids) == 0 or not bool(ids):
|
||||
return []
|
||||
tokenized_queries = tokenize(queries, tokenizer)
|
||||
result_dict = {id_: [] for id_ in ids}
|
||||
for query in tokenized_queries:
|
||||
scores = bm25_api.get_scores(query)
|
||||
for i, id_ in enumerate(ids):
|
||||
result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)])
|
||||
result_df = pd.DataFrame(result_dict)
|
||||
return result_df.max(axis=0).tolist()
|
||||
|
||||
|
||||
def tokenize(queries: List[str], tokenizer) -> List[List[Union[int, str]]]:
|
||||
if isinstance(tokenizer, PreTrainedTokenizerBase):
|
||||
tokenized_queries = tokenizer(queries).input_ids
|
||||
else:
|
||||
tokenized_queries = tokenizer(queries)
|
||||
return tokenized_queries
|
||||
|
||||
|
||||
def bm25_ingest(
|
||||
corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer"
|
||||
):
|
||||
if not corpus_path.endswith(".pkl"):
|
||||
raise ValueError(f"Corpus path {corpus_path} is not a pickle file.")
|
||||
validate_corpus_dataset(corpus_data)
|
||||
ids = corpus_data["doc_id"].tolist()
|
||||
|
||||
# Initialize bm25_corpus
|
||||
bm25_corpus = pd.DataFrame()
|
||||
|
||||
# Load the BM25 corpus if it exists and get the passage ids
|
||||
if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
|
||||
with open(corpus_path, "rb") as r:
|
||||
corpus = pickle.load(r)
|
||||
bm25_corpus = pd.DataFrame.from_dict(corpus)
|
||||
duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)]
|
||||
new_passage = corpus_data[
|
||||
~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"])
|
||||
]
|
||||
else:
|
||||
new_passage = corpus_data
|
||||
|
||||
if not new_passage.empty:
|
||||
tokenizer = select_bm25_tokenizer(bm25_tokenizer)
|
||||
if isinstance(tokenizer, PreTrainedTokenizerBase):
|
||||
tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids
|
||||
else:
|
||||
tokenized_corpus = tokenizer(new_passage["contents"].tolist())
|
||||
new_bm25_corpus = pd.DataFrame(
|
||||
{
|
||||
"tokens": tokenized_corpus,
|
||||
"passage_id": new_passage["doc_id"].tolist(),
|
||||
}
|
||||
)
|
||||
|
||||
if not bm25_corpus.empty:
|
||||
bm25_corpus_updated = pd.concat(
|
||||
[bm25_corpus, new_bm25_corpus], ignore_index=True
|
||||
)
|
||||
bm25_dict = bm25_corpus_updated.to_dict("list")
|
||||
else:
|
||||
bm25_dict = new_bm25_corpus.to_dict("list")
|
||||
|
||||
# add tokenizer name to bm25_dict
|
||||
bm25_dict["tokenizer_name"] = bm25_tokenizer
|
||||
|
||||
with open(corpus_path, "wb") as w:
|
||||
pickle.dump(bm25_dict, w)
|
||||
|
||||
|
||||
def select_bm25_tokenizer(
|
||||
bm25_tokenizer: str,
|
||||
) -> Callable[[str], List[Union[int, str]]]:
|
||||
if bm25_tokenizer in list(BM25_TOKENIZER.keys()):
|
||||
return BM25_TOKENIZER[bm25_tokenizer]
|
||||
|
||||
return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False)
|
||||
214
autorag/nodes/retrieval/hybrid_cc.py
Normal file
@@ -0,0 +1,214 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Tuple, List, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.base import HybridRetrieval
|
||||
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
|
||||
|
||||
|
||||
def normalize_mm(scores: List[float], fixed_min_value: float = 0):
|
||||
arr = np.array(scores)
|
||||
max_value = np.max(arr)
|
||||
min_value = np.min(arr)
|
||||
norm_score = (arr - min_value) / (max_value - min_value)
|
||||
return norm_score
|
||||
|
||||
|
||||
def normalize_tmm(scores: List[float], fixed_min_value: float):
|
||||
arr = np.array(scores)
|
||||
max_value = np.max(arr)
|
||||
norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value)
|
||||
return norm_score
|
||||
|
||||
|
||||
def normalize_z(scores: List[float], fixed_min_value: float = 0):
|
||||
arr = np.array(scores)
|
||||
mean_value = np.mean(arr)
|
||||
std_value = np.std(arr)
|
||||
norm_score = (arr - mean_value) / std_value
|
||||
return norm_score
|
||||
|
||||
|
||||
def normalize_dbsf(scores: List[float], fixed_min_value: float = 0):
|
||||
arr = np.array(scores)
|
||||
mean_value = np.mean(arr)
|
||||
std_value = np.std(arr)
|
||||
min_value = mean_value - 3 * std_value
|
||||
max_value = mean_value + 3 * std_value
|
||||
norm_score = (arr - min_value) / (max_value - min_value)
|
||||
return norm_score
|
||||
|
||||
|
||||
normalize_method_dict = {
|
||||
"mm": normalize_mm,
|
||||
"tmm": normalize_tmm,
|
||||
"z": normalize_z,
|
||||
"dbsf": normalize_dbsf,
|
||||
}
|
||||
|
||||
|
||||
class HybridCC(HybridRetrieval):
|
||||
def _pure(
|
||||
self,
|
||||
ids: Tuple,
|
||||
scores: Tuple,
|
||||
top_k: int,
|
||||
weight: float,
|
||||
normalize_method: str = "mm",
|
||||
semantic_theoretical_min_value: float = -1.0,
|
||||
lexical_theoretical_min_value: float = 0.0,
|
||||
):
|
||||
return hybrid_cc(
|
||||
ids,
|
||||
scores,
|
||||
top_k,
|
||||
weight,
|
||||
normalize_method,
|
||||
semantic_theoretical_min_value,
|
||||
lexical_theoretical_min_value,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def run_evaluator(
|
||||
cls,
|
||||
project_dir: Union[str, Path],
|
||||
previous_result: pd.DataFrame,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
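# Two modes: if precomputed `ids` and `scores` are passed, fuse them directly without
# instantiating any retrieval module; otherwise build the target retrieval modules and
# run the full hybrid pipeline through `pure`.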
|
||||
if "ids" in kwargs and "scores" in kwargs:
|
||||
data_dir = os.path.join(project_dir, "data")
|
||||
corpus_df = pd.read_parquet(
|
||||
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
params = pop_params(hybrid_cc, kwargs)
|
||||
assert (
|
||||
"ids" in params and "scores" in params and "top_k" in params
|
||||
), "ids, scores, and top_k must be specified."
|
||||
|
||||
@result_to_dataframe(
|
||||
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
def __cc(**cc_params):
|
||||
ids, scores = hybrid_cc(**cc_params)
|
||||
contents = fetch_contents(corpus_df, ids)
|
||||
return contents, ids, scores
|
||||
|
||||
return __cc(**params)
|
||||
else:
|
||||
assert (
|
||||
"target_modules" in kwargs and "target_module_params" in kwargs
|
||||
), "target_modules and target_module_params must be specified if there is not ids and scores."
|
||||
instance = cls(project_dir, *args, **kwargs)
|
||||
result = instance.pure(previous_result, *args, **kwargs)
|
||||
del instance
|
||||
return result
|
||||
|
||||
|
||||
def hybrid_cc(
|
||||
ids: Tuple,
|
||||
scores: Tuple,
|
||||
top_k: int,
|
||||
weight: float,
|
||||
normalize_method: str = "mm",
|
||||
semantic_theoretical_min_value: float = -1.0,
|
||||
lexical_theoretical_min_value: float = 0.0,
|
||||
) -> Tuple[List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Hybrid CC function.
|
||||
CC (convex combination) is a method to fuse lexical and semantic retrieval results.
|
||||
It is a method that first normalizes the scores of each retrieval result,
|
||||
and then combines them with the given weights.
|
||||
Unlike other retrieval modules, it does not actually execute retrieval;
it only fuses the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function.
|
||||
And collect ids and scores result from each retrieval module.
|
||||
Make it as tuple and input it to this function.
|
||||
|
||||
:param ids: The tuple of ids that you want to fuse.
|
||||
The length of this must be the same as the length of scores.
|
||||
The semantic retrieval ids must be the first index.
|
||||
:param scores: The retrieve scores that you want to fuse.
|
||||
The length of this must be the same as the length of ids.
|
||||
The semantic retrieval scores must be the first index.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param normalize_method: The normalization method to use.
|
||||
There are some normalization method that you can use at the hybrid cc method.
|
||||
AutoRAG support following.
|
||||
- `mm`: Min-max scaling
|
||||
- `tmm`: Theoretical min-max scaling
|
||||
- `z`: z-score normalization
|
||||
- `dbsf`: 3-sigma normalization
|
||||
:param weight: The weight value. If the weight is 1.0, it means the
|
||||
weight to the semantic module will be 1.0 and weight to the lexical module will be 0.0.
|
||||
:param semantic_theoretical_min_value: This value used by `tmm` normalization method. You can set the
|
||||
theoretical minimum value by yourself. Default is -1.
|
||||
:param lexical_theoretical_min_value: This value used by `tmm` normalization method. You can set the
|
||||
theoretical minimum value by yourself. Default is 0.
|
||||
:return: The tuple of ids and fused scores that are fused by CC.
|
||||
"""
|
||||
assert len(ids) == len(scores), "The length of ids and scores must be the same."
|
||||
assert len(ids) > 1, "You must input more than one retrieval result."
|
||||
assert top_k > 0, "top_k must be greater than 0."
|
||||
assert weight >= 0, "The weight must be greater than 0."
|
||||
assert weight <= 1, "The weight must be less than 1."
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"semantic_ids": ids[0],
|
||||
"lexical_ids": ids[1],
|
||||
"semantic_score": scores[0],
|
||||
"lexical_score": scores[1],
|
||||
}
|
||||
)
|
||||
|
||||
def cc_pure_apply(row):
|
||||
return fuse_per_query(
|
||||
row["semantic_ids"],
|
||||
row["lexical_ids"],
|
||||
row["semantic_score"],
|
||||
row["lexical_score"],
|
||||
normalize_method=normalize_method,
|
||||
weight=weight,
|
||||
top_k=top_k,
|
||||
semantic_theoretical_min_value=semantic_theoretical_min_value,
|
||||
lexical_theoretical_min_value=lexical_theoretical_min_value,
|
||||
)
|
||||
|
||||
# fixed weight
|
||||
df[["cc_id", "cc_score"]] = df.apply(
|
||||
lambda row: cc_pure_apply(row), axis=1, result_type="expand"
|
||||
)
|
||||
return df["cc_id"].tolist(), df["cc_score"].tolist()
|
||||
|
||||
|
||||
def fuse_per_query(
|
||||
semantic_ids: List[str],
|
||||
lexical_ids: List[str],
|
||||
semantic_scores: List[float],
|
||||
lexical_scores: List[float],
|
||||
normalize_method: str,
|
||||
weight: float,
|
||||
top_k: int,
|
||||
semantic_theoretical_min_value: float,
|
||||
lexical_theoretical_min_value: float,
|
||||
):
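# Normalize the semantic and lexical scores, align documents by id (a document missing
# from one list gets a normalized score of 0), take the convex combination
# weight * semantic + (1 - weight) * lexical, and keep the top_k documents.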
|
||||
normalize_func = normalize_method_dict[normalize_method]
|
||||
norm_semantic_scores = normalize_func(
|
||||
semantic_scores, semantic_theoretical_min_value
|
||||
)
|
||||
norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value)
|
||||
ids = [semantic_ids, lexical_ids]
|
||||
scores = [norm_semantic_scores, norm_lexical_scores]
|
||||
df = pd.concat(
|
||||
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
|
||||
)
|
||||
df.columns = ["semantic", "lexical"]
|
||||
df = df.fillna(0)
|
||||
df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1)
|
||||
df = df.sort_values(by="weighted_sum", ascending=False)
|
||||
return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()
|
||||
128
autorag/nodes/retrieval/hybrid_rrf.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.base import HybridRetrieval
|
||||
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
|
||||
|
||||
|
||||
class HybridRRF(HybridRetrieval):
|
||||
def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
|
||||
return hybrid_rrf(ids, scores, top_k, weight, rrf_k)
|
||||
|
||||
@classmethod
|
||||
def run_evaluator(
|
||||
cls,
|
||||
project_dir: Union[str, Path],
|
||||
previous_result: pd.DataFrame,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
if "ids" in kwargs and "scores" in kwargs:
|
||||
data_dir = os.path.join(project_dir, "data")
|
||||
corpus_df = pd.read_parquet(
|
||||
os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
|
||||
)
|
||||
|
||||
params = pop_params(hybrid_rrf, kwargs)
|
||||
assert (
|
||||
"ids" in params and "scores" in params and "top_k" in params
|
||||
), "ids, scores, and top_k must be specified."
|
||||
|
||||
@result_to_dataframe(
|
||||
["retrieved_contents", "retrieved_ids", "retrieve_scores"]
|
||||
)
|
||||
def __rrf(**rrf_params):
|
||||
ids, scores = hybrid_rrf(**rrf_params)
|
||||
contents = fetch_contents(corpus_df, ids)
|
||||
return contents, ids, scores
|
||||
|
||||
return __rrf(**params)
|
||||
else:
|
||||
assert (
|
||||
"target_modules" in kwargs and "target_module_params" in kwargs
|
||||
), "target_modules and target_module_params must be specified if there is not ids and scores."
|
||||
instance = cls(project_dir, *args, **kwargs)
|
||||
result = instance.pure(previous_result, *args, **kwargs)
|
||||
del instance
|
||||
return result
|
||||
|
||||
|
||||
def hybrid_rrf(
|
||||
ids: Tuple,
|
||||
scores: Tuple,
|
||||
top_k: int,
|
||||
weight: int = 60,
|
||||
rrf_k: int = -1,
|
||||
) -> Tuple[List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
Hybrid RRF function.
|
||||
RRF (Rank Reciprocal Fusion) is a method to fuse multiple retrieval results.
|
||||
It is common to fuse dense retrieval and sparse retrieval results using RRF.
|
||||
To use this function, you must input ids and scores as tuple.
|
||||
Unlike other retrieval modules, it does not actually execute retrieval but just fuses
the results of other retrieval functions.
So you have to run at least two retrieval modules before running this function.
|
||||
And collect ids and scores result from each retrieval module.
|
||||
Make it as a tuple and input it to this function.
|
||||
|
||||
:param ids: The tuple of ids that you want to fuse.
|
||||
The length of this must be the same as the length of scores.
|
||||
:param scores: The retrieve scores that you want to fuse.
|
||||
The length of this must be the same as the length of ids.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param weight: Hyperparameter for RRF.
|
||||
It was originally rrf_k value.
|
||||
Default is 60.
|
||||
For more information, please visit our documentation.
|
||||
:param rrf_k: (Deprecated) Hyperparameter for RRF.
|
||||
It was originally rrf_k value. Will remove at a further version.
|
||||
:return: The tuple of ids and fused scores that are fused by RRF.
|
||||
"""
|
||||
assert len(ids) == len(scores), "The length of ids and scores must be the same."
|
||||
assert len(ids) > 1, "You must input more than one retrieval result."
|
||||
assert top_k > 0, "top_k must be greater than 0."
|
||||
assert weight > 0, "weight (rrf_k) must be greater than 0."
|
||||
|
||||
if rrf_k != -1:
|
||||
weight = int(rrf_k)
|
||||
else:
|
||||
weight = int(weight)
|
||||
|
||||
id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
|
||||
score_df = pd.DataFrame(
|
||||
{f"score_{i}": score_list for i, score_list in enumerate(scores)}
|
||||
)
|
||||
df = pd.concat([id_df, score_df], axis=1)
|
||||
|
||||
def rrf_pure_apply(row):
|
||||
ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
|
||||
scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
|
||||
return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))
|
||||
|
||||
df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
|
||||
return df["rrf_id"].tolist(), df["rrf_score"].tolist()
|
||||
|
||||
|
||||
def rrf_pure(
|
||||
ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
|
||||
) -> Tuple[List[str], List[float]]:
|
||||
df = pd.concat(
|
||||
[pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
|
||||
)
|
||||
rank_df = df.rank(ascending=False, method="min")
|
||||
rank_df = rank_df.fillna(0)
|
||||
rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
|
||||
rank_df = rank_df.sort_values(by="rrf", ascending=False)
|
||||
return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]
|
||||
|
||||
|
||||
def rrf_calculate(row, rrf_k):
|
||||
result = 0
|
||||
for r in row:
|
||||
if r == 0:
|
||||
continue
|
||||
result += 1 / (r + rrf_k)
|
||||
return result
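# Worked example (illustrative): with rrf_k = 60, a document ranked 1st by one
# retriever and 3rd by another gets 1 / (1 + 60) + 1 / (3 + 60) ≈ 0.0164 + 0.0159
# ≈ 0.0323. A rank of 0 means the retriever did not return the document, so that
# term is skipped.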
|
||||
544
autorag/nodes/retrieval/run.py
Normal file
@@ -0,0 +1,544 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
from copy import deepcopy
|
||||
from typing import List, Callable, Dict, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.evaluation import evaluate_retrieval
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.support import get_support_modules
|
||||
from autorag.utils.util import get_best_row, to_list, apply_recursive
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
semantic_module_names = ["vectordb", "VectorDB"]
|
||||
lexical_module_names = ["bm25", "BM25"]
|
||||
hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"]
|
||||
|
||||
|
||||
def run_retrieval_node(
|
||||
modules: List,
|
||||
module_params: List[Dict],
|
||||
previous_result: pd.DataFrame,
|
||||
node_line_dir: str,
|
||||
strategies: Dict,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Run evaluation and select the best module among retrieval node results.
|
||||
|
||||
:param modules: Retrieval modules to run.
|
||||
:param module_params: Retrieval module parameters.
|
||||
:param previous_result: Previous result dataframe.
|
||||
Could be query expansion's best result or qa data.
|
||||
:param node_line_dir: This node line's directory.
|
||||
:param strategies: Strategies for retrieval node.
|
||||
:return: The best result dataframe.
|
||||
It contains previous result columns and retrieval node's result columns.
|
||||
"""
|
||||
if not os.path.exists(node_line_dir):
|
||||
os.makedirs(node_line_dir)
|
||||
project_dir = pathlib.PurePath(node_line_dir).parent.parent
|
||||
qa_df = pd.read_parquet(
|
||||
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
|
||||
)
|
||||
retrieval_gt = qa_df["retrieval_gt"].tolist()
|
||||
retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
|
||||
# make rows to metric_inputs
|
||||
metric_inputs = [
|
||||
MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
|
||||
for ret_gt, query, gen_gt in zip(
|
||||
retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
|
||||
)
|
||||
]
|
||||
|
||||
save_dir = os.path.join(node_line_dir, "retrieval") # node name
|
||||
if not os.path.exists(save_dir):
|
||||
os.makedirs(save_dir)
|
||||
|
||||
def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]:
|
||||
"""
|
||||
Run input modules and parameters.
|
||||
|
||||
:param input_modules: Input modules
|
||||
:param input_module_params: Input module parameters
|
||||
:return: First, it returns a list of result dataframes.
Second, it returns a list of average execution times.
|
||||
"""
|
||||
result, execution_times = zip(
|
||||
*map(
|
||||
lambda task: measure_speed(
|
||||
task[0].run_evaluator,
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
**task[1],
|
||||
),
|
||||
zip(input_modules, input_module_params),
|
||||
)
|
||||
)
|
||||
average_times = list(map(lambda x: x / len(result[0]), execution_times))
|
||||
|
||||
# run metrics before filtering
|
||||
if strategies.get("metrics") is None:
|
||||
raise ValueError("You must provide at least one metric for retrieval evaluation.")
|
||||
result = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
metric_inputs,
|
||||
strategies.get("metrics"),
|
||||
),
|
||||
result,
|
||||
)
|
||||
)
|
||||
|
||||
return result, average_times
|
||||
|
||||
def save_and_summary(
|
||||
input_modules,
|
||||
input_module_params,
|
||||
result_list,
|
||||
execution_time_list,
|
||||
filename_start: int,
|
||||
):
|
||||
"""
|
||||
Save the results and make a summary file.
|
||||
|
||||
:param input_modules: Input modules
|
||||
:param input_module_params: Input module parameters
|
||||
:param result_list: Result list
|
||||
:param execution_time_list: Execution times
|
||||
:param filename_start: The first filename to use
|
||||
:return: The summary dataframe for the saved results.
|
||||
"""
|
||||
|
||||
# save results to folder
|
||||
filepaths = list(
|
||||
map(
|
||||
lambda x: os.path.join(save_dir, f"{x}.parquet"),
|
||||
range(filename_start, filename_start + len(input_modules)),
|
||||
)
|
||||
)
|
||||
list(
|
||||
map(
|
||||
lambda x: x[0].to_parquet(x[1], index=False),
|
||||
zip(result_list, filepaths),
|
||||
)
|
||||
) # execute save to parquet
|
||||
filename_list = list(map(lambda x: os.path.basename(x), filepaths))
|
||||
|
||||
summary_df = pd.DataFrame(
|
||||
{
|
||||
"filename": filename_list,
|
||||
"module_name": list(map(lambda module: module.__name__, input_modules)),
|
||||
"module_params": input_module_params,
|
||||
"execution_time": execution_time_list,
|
||||
**{
|
||||
metric: list(map(lambda result: result[metric].mean(), result_list))
|
||||
for metric in strategies.get("metrics")
|
||||
},
|
||||
}
|
||||
)
|
||||
summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
return summary_df
|
||||
|
||||
def find_best(results, average_times, filenames):
|
||||
# filter by strategies
|
||||
if strategies.get("speed_threshold") is not None:
|
||||
results, filenames = filter_by_threshold(
|
||||
results, average_times, strategies["speed_threshold"], filenames
|
||||
)
|
||||
selected_result, selected_filename = select_best(
|
||||
results,
|
||||
strategies.get("metrics"),
|
||||
filenames,
|
||||
strategies.get("strategy", "mean"),
|
||||
)
|
||||
return selected_result, selected_filename
|
||||
|
||||
filename_first = 0
|
||||
# run semantic modules
|
||||
logger.info("Running retrieval node - semantic retrieval module...")
|
||||
if any([module.__name__ in semantic_module_names for module in modules]):
|
||||
semantic_modules, semantic_module_params = zip(
|
||||
*filter(
|
||||
lambda x: x[0].__name__ in semantic_module_names,
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
semantic_results, semantic_times = run(semantic_modules, semantic_module_params)
|
||||
semantic_summary_df = save_and_summary(
|
||||
semantic_modules,
|
||||
semantic_module_params,
|
||||
semantic_results,
|
||||
semantic_times,
|
||||
filename_first,
|
||||
)
|
||||
semantic_selected_result, semantic_selected_filename = find_best(
|
||||
semantic_results, semantic_times, semantic_summary_df["filename"].tolist()
|
||||
)
|
||||
semantic_summary_df["is_best"] = (
|
||||
semantic_summary_df["filename"] == semantic_selected_filename
|
||||
)
|
||||
filename_first += len(semantic_modules)
|
||||
else:
|
||||
(
|
||||
semantic_selected_filename,
|
||||
semantic_summary_df,
|
||||
semantic_results,
|
||||
semantic_times,
|
||||
) = None, pd.DataFrame(), [], []
|
||||
# run lexical modules
|
||||
logger.info("Running retrieval node - lexical retrieval module...")
|
||||
if any([module.__name__ in lexical_module_names for module in modules]):
|
||||
lexical_modules, lexical_module_params = zip(
|
||||
*filter(
|
||||
lambda x: x[0].__name__ in lexical_module_names,
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
lexical_results, lexical_times = run(lexical_modules, lexical_module_params)
|
||||
lexical_summary_df = save_and_summary(
|
||||
lexical_modules,
|
||||
lexical_module_params,
|
||||
lexical_results,
|
||||
lexical_times,
|
||||
filename_first,
|
||||
)
|
||||
lexical_selected_result, lexical_selected_filename = find_best(
|
||||
lexical_results, lexical_times, lexical_summary_df["filename"].tolist()
|
||||
)
|
||||
lexical_summary_df["is_best"] = (
|
||||
lexical_summary_df["filename"] == lexical_selected_filename
|
||||
)
|
||||
filename_first += len(lexical_modules)
|
||||
else:
|
||||
(
|
||||
lexical_selected_filename,
|
||||
lexical_summary_df,
|
||||
lexical_results,
|
||||
lexical_times,
|
||||
) = None, pd.DataFrame(), [], []
|
||||
|
||||
logger.info("Running retrieval node - hybrid retrieval module...")
|
||||
# Next, run hybrid retrieval
|
||||
if any([module.__name__ in hybrid_module_names for module in modules]):
|
||||
hybrid_modules, hybrid_module_params = zip(
|
||||
*filter(
|
||||
lambda x: x[0].__name__ in hybrid_module_names,
|
||||
zip(modules, module_params),
|
||||
)
|
||||
)
|
||||
if all(
|
||||
["target_module_params" in x for x in hybrid_module_params]
|
||||
): # for Runner.run
|
||||
# If target_module_params are already given, run hybrid retrieval directly
|
||||
hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params)
|
||||
hybrid_summary_df = save_and_summary(
|
||||
hybrid_modules,
|
||||
hybrid_module_params,
|
||||
hybrid_results,
|
||||
hybrid_times,
|
||||
filename_first,
|
||||
)
|
||||
filename_first += len(hybrid_modules)
|
||||
else: # for Evaluator
|
||||
# get id and score
|
||||
ids_scores = get_ids_and_scores(
|
||||
save_dir,
|
||||
[semantic_selected_filename, lexical_selected_filename],
|
||||
semantic_summary_df,
|
||||
lexical_summary_df,
|
||||
previous_result,
|
||||
)
|
||||
hybrid_module_params = list(
|
||||
map(lambda x: {**x, **ids_scores}, hybrid_module_params)
|
||||
)
|
||||
|
||||
# optimize each module
|
||||
real_hybrid_times = [
|
||||
get_hybrid_execution_times(semantic_summary_df, lexical_summary_df)
|
||||
] * len(hybrid_module_params)
|
||||
hybrid_times = real_hybrid_times.copy()
|
||||
hybrid_results = []
|
||||
for module, module_param in zip(hybrid_modules, hybrid_module_params):
|
||||
module_result_df, module_best_weight = optimize_hybrid(
|
||||
module,
|
||||
module_param,
|
||||
strategies,
|
||||
metric_inputs,
|
||||
project_dir,
|
||||
previous_result,
|
||||
)
|
||||
module_param["weight"] = module_best_weight
|
||||
hybrid_results.append(module_result_df)
|
||||
|
||||
hybrid_summary_df = save_and_summary(
|
||||
hybrid_modules,
|
||||
hybrid_module_params,
|
||||
hybrid_results,
|
||||
hybrid_times,
|
||||
filename_first,
|
||||
)
|
||||
filename_first += len(hybrid_modules)
|
||||
hybrid_summary_df["execution_time"] = hybrid_times
|
||||
best_semantic_summary_row = semantic_summary_df.loc[
|
||||
semantic_summary_df["is_best"]
|
||||
].iloc[0]
|
||||
best_lexical_summary_row = lexical_summary_df.loc[
|
||||
lexical_summary_df["is_best"]
|
||||
].iloc[0]
|
||||
target_modules = (
|
||||
best_semantic_summary_row["module_name"],
|
||||
best_lexical_summary_row["module_name"],
|
||||
)
|
||||
target_module_params = (
|
||||
best_semantic_summary_row["module_params"],
|
||||
best_lexical_summary_row["module_params"],
|
||||
)
|
||||
hybrid_summary_df = edit_summary_df_params(
|
||||
hybrid_summary_df, target_modules, target_module_params
|
||||
)
|
||||
else:
|
||||
if any([module.__name__ in hybrid_module_names for module in modules]):
|
||||
logger.warning(
|
||||
"You must at least one semantic module and lexical module for hybrid evaluation."
|
||||
"Passing hybrid module."
|
||||
)
|
||||
_, hybrid_summary_df, hybrid_results, hybrid_times = (
|
||||
None,
|
||||
pd.DataFrame(),
|
||||
[],
|
||||
[],
|
||||
)
|
||||
|
||||
summary = pd.concat(
|
||||
[semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True
|
||||
)
|
||||
results = semantic_results + lexical_results + hybrid_results
|
||||
average_times = semantic_times + lexical_times + hybrid_times
|
||||
filenames = summary["filename"].tolist()
|
||||
|
||||
# filter by strategies
|
||||
selected_result, selected_filename = find_best(results, average_times, filenames)
|
||||
best_result = pd.concat([previous_result, selected_result], axis=1)
|
||||
|
||||
# add summary.csv 'is_best' column
|
||||
summary["is_best"] = summary["filename"] == selected_filename
|
||||
|
||||
# save the result files
|
||||
best_result.to_parquet(
|
||||
os.path.join(
|
||||
save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
|
||||
),
|
||||
index=False,
|
||||
)
|
||||
summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
|
||||
return best_result
|
||||
|
||||
|
||||
def evaluate_retrieval_node(
|
||||
result_df: pd.DataFrame,
|
||||
metric_inputs: List[MetricInput],
|
||||
metrics: Union[List[str], List[Dict]],
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
Evaluate retrieval node from retrieval node result dataframe.
|
||||
|
||||
:param result_df: The result dataframe from a retrieval node.
|
||||
:param metric_inputs: List of metric input schema for AutoRAG.
|
||||
:param metrics: Metric list from input strategies.
|
||||
:return: Return result_df with metrics columns.
|
||||
The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
|
||||
"""
|
||||
|
||||
@evaluate_retrieval(
|
||||
metric_inputs=metric_inputs,
|
||||
metrics=metrics,
|
||||
)
|
||||
def evaluate_this_module(df: pd.DataFrame):
|
||||
return (
|
||||
df["retrieved_contents"].tolist(),
|
||||
df["retrieved_ids"].tolist(),
|
||||
df["retrieve_scores"].tolist(),
|
||||
)
|
||||
|
||||
return evaluate_this_module(result_df)
|
||||
|
||||
|
||||
def edit_summary_df_params(
|
||||
summary_df: pd.DataFrame, target_modules, target_module_params
|
||||
) -> pd.DataFrame:
|
||||
def delete_ids_scores(x):
|
||||
del x["ids"]
|
||||
del x["scores"]
|
||||
return x
|
||||
|
||||
summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
|
||||
summary_df["new_params"] = [
|
||||
{"target_modules": target_modules, "target_module_params": target_module_params}
|
||||
] * len(summary_df)
|
||||
summary_df["module_params"] = summary_df.apply(
|
||||
lambda row: {**row["module_params"], **row["new_params"]}, axis=1
|
||||
)
|
||||
summary_df = summary_df.drop(columns=["new_params"])
|
||||
return summary_df
|
||||
|
||||
|
||||
def get_ids_and_scores(
|
||||
node_dir: str,
|
||||
filenames: List[str],
|
||||
semantic_summary_df: pd.DataFrame,
|
||||
lexical_summary_df: pd.DataFrame,
|
||||
previous_result,
|
||||
) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
|
||||
project_dir = pathlib.PurePath(node_dir).parent.parent.parent
|
||||
best_results_df = list(
|
||||
map(
|
||||
lambda filename: pd.read_parquet(
|
||||
os.path.join(node_dir, filename), engine="pyarrow"
|
||||
),
|
||||
filenames,
|
||||
)
|
||||
)
|
||||
ids = tuple(
|
||||
map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
|
||||
)
|
||||
scores = tuple(
|
||||
map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
|
||||
)
|
||||
# search non-duplicate ids
|
||||
semantic_ids = deepcopy(ids[0])
|
||||
lexical_ids = deepcopy(ids[1])
|
||||
|
||||
def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
|
||||
"""
|
||||
Get the ids that appear in compare_ids but not in target_ids.
For example, to find the ids the semantic retriever found but the lexical retriever did not,
pass the lexical ids as target_ids and the semantic ids as compare_ids.
|
||||
"""
|
||||
result_ids = []
|
||||
assert len(target_ids) == len(compare_ids)
|
||||
for target_id_list, compare_id_list in zip(target_ids, compare_ids):
|
||||
query_duplicated = list(set(compare_id_list) - set(target_id_list))
|
||||
duplicate_list = query_duplicated if len(query_duplicated) != 0 else []
|
||||
result_ids.append(duplicate_list)
|
||||
return result_ids
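# Illustrative example: target_ids=[["a", "b"]], compare_ids=[["b", "c"]] returns
# [["c"]] -- the ids found only by the compared retriever, which are appended to
# the target retriever's id list below so both runs cover the same passages.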
|
||||
|
||||
lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids)
|
||||
semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids)
|
||||
|
||||
new_id_tuple = (
|
||||
[a + b for a, b in zip(semantic_ids, semantic_target_ids)],
|
||||
[a + b for a, b in zip(lexical_ids, lexical_target_ids)],
|
||||
)
|
||||
|
||||
# search non-duplicate ids' scores
|
||||
new_semantic_scores = get_scores_by_ids(
|
||||
semantic_target_ids, semantic_summary_df, project_dir, previous_result
|
||||
)
|
||||
new_lexical_scores = get_scores_by_ids(
|
||||
lexical_target_ids, lexical_summary_df, project_dir, previous_result
|
||||
)
|
||||
|
||||
new_score_tuple = (
|
||||
[a + b for a, b in zip(scores[0], new_semantic_scores)],
|
||||
[a + b for a, b in zip(scores[1], new_lexical_scores)],
|
||||
)
|
||||
return {
|
||||
"ids": new_id_tuple,
|
||||
"scores": new_score_tuple,
|
||||
}
|
||||
|
||||
|
||||
def get_scores_by_ids(
|
||||
ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result
|
||||
) -> List[List[float]]:
|
||||
module_name = get_best_row(module_summary_df)["module_name"]
|
||||
module_params = get_best_row(module_summary_df)["module_params"]
|
||||
module = get_support_modules(module_name)
|
||||
result_df = module.run_evaluator(
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
ids=ids,
|
||||
**module_params,
|
||||
)
|
||||
return to_list(result_df["retrieve_scores"].tolist())
|
||||
|
||||
|
||||
def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]:
|
||||
return list(set(list1).symmetric_difference(set(list2)))
|
||||
|
||||
|
||||
def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float:
|
||||
lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][
|
||||
"execution_time"
|
||||
]
|
||||
semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][
|
||||
"execution_time"
|
||||
]
|
||||
return lexical_execution_time + semantic_execution_time
|
||||
|
||||
|
||||
def optimize_hybrid(
|
||||
hybrid_module_func: Callable,
|
||||
hybrid_module_param: Dict,
|
||||
strategy: Dict,
|
||||
input_metrics: List[MetricInput],
|
||||
project_dir,
|
||||
previous_result,
|
||||
):
|
||||
if (
|
||||
hybrid_module_func.__name__ == "HybridRRF"
|
||||
or hybrid_module_func.__name__ == "hybrid_rrf"
|
||||
):
|
||||
weight_range = hybrid_module_param.pop("weight_range", (4, 80))
|
||||
test_weight_size = weight_range[1] - weight_range[0] + 1
|
||||
elif (
|
||||
hybrid_module_func.__name__ == "HybridCC"
|
||||
or hybrid_module_func.__name__ == "hybrid_cc"
|
||||
):
|
||||
weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0))
|
||||
test_weight_size = hybrid_module_param.pop("test_weight_size", 101)
|
||||
else:
|
||||
raise ValueError("hybrid_module_func must be a hybrid module such as HybridRRF or HybridCC.")
|
||||
|
||||
weight_candidates = np.linspace(
|
||||
weight_range[0], weight_range[1], test_weight_size
|
||||
).tolist()
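# With the defaults above, HybridRRF sweeps the 77 weights 4, 5, ..., 80, while
# HybridCC sweeps 101 evenly spaced weights 0.00, 0.01, ..., 1.00.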
|
||||
|
||||
result_list = []
|
||||
for weight_value in weight_candidates:
|
||||
result_df = hybrid_module_func.run_evaluator(
|
||||
project_dir=project_dir,
|
||||
previous_result=previous_result,
|
||||
weight=weight_value,
|
||||
**hybrid_module_param,
|
||||
)
|
||||
result_list.append(result_df)
|
||||
|
||||
# evaluate here
|
||||
if strategy.get("metrics") is None:
|
||||
raise ValueError("You must provide at least one metric for retrieval evaluation.")
|
||||
result_list = list(
|
||||
map(
|
||||
lambda x: evaluate_retrieval_node(
|
||||
x,
|
||||
input_metrics,
|
||||
strategy.get("metrics"),
|
||||
),
|
||||
result_list,
|
||||
)
|
||||
)
|
||||
|
||||
# select best result
|
||||
best_result_df, best_weight = select_best(
|
||||
result_list,
|
||||
strategy.get("metrics"),
|
||||
metadatas=weight_candidates,
|
||||
strategy_name=strategy.get("strategy", "normalize_mean"),
|
||||
)
|
||||
return best_result_df, best_weight
|
||||
303
autorag/nodes/retrieval/vectordb.py
Normal file
@@ -0,0 +1,303 @@
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from llama_index.core.embeddings import BaseEmbedding
|
||||
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||
|
||||
from autorag.evaluation.metric.util import (
|
||||
calculate_l2_distance,
|
||||
calculate_inner_product,
|
||||
calculate_cosine_similarity,
|
||||
)
|
||||
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
|
||||
from autorag.utils import (
|
||||
validate_corpus_dataset,
|
||||
cast_corpus_dataset,
|
||||
cast_qa_dataset,
|
||||
validate_qa_dataset,
|
||||
)
|
||||
from autorag.utils.util import (
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
openai_truncate_by_token,
|
||||
flatten_apply,
|
||||
result_to_dataframe,
|
||||
pop_params,
|
||||
fetch_contents,
|
||||
empty_cuda_cache,
|
||||
convert_inputs_to_list,
|
||||
make_batch,
|
||||
)
|
||||
from autorag.vectordb import load_vectordb_from_yaml
|
||||
from autorag.vectordb.base import BaseVectorStore
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class VectorDB(BaseRetrieval):
|
||||
def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
|
||||
"""
|
||||
Initialize VectorDB retrieval node.
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param vectordb: The vectordb name.
|
||||
You must configure the vectordb name in the config.yaml file.
|
||||
If you don't configure it, the default vectordb is used.
:param kwargs: The optional arguments.
They are not used in the init method.
|
||||
"""
|
||||
super().__init__(project_dir)
|
||||
|
||||
vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
|
||||
self.vector_store = load_vectordb_from_yaml(
|
||||
vectordb_config_path, vectordb, project_dir
|
||||
)
|
||||
|
||||
self.embedding_model = self.vector_store.embedding
|
||||
|
||||
def __del__(self):
|
||||
del self.vector_store
|
||||
del self.embedding_model
|
||||
empty_cuda_cache()
|
||||
super().__del__()
|
||||
|
||||
@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
|
||||
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
|
||||
queries = self.cast_to_run(previous_result)
|
||||
pure_params = pop_params(self._pure, kwargs)
|
||||
ids, scores = self._pure(queries, **pure_params)
|
||||
contents = fetch_contents(self.corpus_df, ids)
|
||||
return contents, ids, scores
|
||||
|
||||
def _pure(
|
||||
self,
|
||||
queries: List[List[str]],
|
||||
top_k: int,
|
||||
embedding_batch: int = 128,
|
||||
ids: Optional[List[List[str]]] = None,
|
||||
) -> Tuple[List[List[str]], List[List[float]]]:
|
||||
"""
|
||||
VectorDB retrieval function.
|
||||
You need a vector store collection that has already been ingested,
together with the same embedding model that was used for the ingestion.
|
||||
|
||||
:param queries: 2-d list of query strings.
|
||||
Each element of the list is the list of query strings for a single row.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param embedding_batch: The number of queries to be processed in parallel.
|
||||
This is used to prevent API errors during query embedding.
|
||||
Default is 128.
|
||||
:param ids: The optional list of ids that you want to retrieve.
|
||||
You don't need to specify this in the general use cases.
|
||||
Default is None.
|
||||
|
||||
:return: The 2-d list of passage ids retrieved from the vectordb, and the 2-d list of their scores.
Both have the same length as queries, and each element has a length of top_k.
|
||||
"""
|
||||
# if ids are specified, fetch the scores of those ids from the vector store
|
||||
if ids is not None:
|
||||
return self.__get_ids_scores(queries, ids, embedding_batch)
|
||||
|
||||
# run async vector_db_pure function
|
||||
tasks = [
|
||||
vectordb_pure(query_list, top_k, self.vector_store)
|
||||
for query_list in queries
|
||||
]
|
||||
loop = get_event_loop()
|
||||
results = loop.run_until_complete(
|
||||
process_batch(tasks, batch_size=embedding_batch)
|
||||
)
|
||||
id_result = list(map(lambda x: x[0], results))
|
||||
score_result = list(map(lambda x: x[1], results))
|
||||
return id_result, score_result
|
||||
|
||||
def __get_ids_scores(self, queries, ids, embedding_batch: int):
|
||||
# truncate queries and run the query embeddings here.
|
||||
openai_embedding_limit = 8000
|
||||
if isinstance(self.embedding_model, OpenAIEmbedding):
|
||||
queries = list(
|
||||
map(
|
||||
lambda query_list: openai_truncate_by_token(
|
||||
query_list,
|
||||
openai_embedding_limit,
|
||||
self.embedding_model.model_name,
|
||||
),
|
||||
queries,
|
||||
)
|
||||
)
|
||||
|
||||
query_embeddings = flatten_apply(
|
||||
run_query_embedding_batch,
|
||||
queries,
|
||||
embedding_model=self.embedding_model,
|
||||
batch_size=embedding_batch,
|
||||
)
|
||||
|
||||
loop = get_event_loop()
|
||||
|
||||
async def run_fetch(ids):
|
||||
final_result = []
|
||||
for id_list in ids:
|
||||
if len(id_list) == 0:
|
||||
final_result.append([])
|
||||
else:
|
||||
result = await self.vector_store.fetch(id_list)
|
||||
final_result.append(result)
|
||||
return final_result
|
||||
|
||||
content_embeddings = loop.run_until_complete(run_fetch(ids))
|
||||
|
||||
score_result = list(
|
||||
map(
|
||||
lambda query_embedding_list, content_embedding_list: get_id_scores(
|
||||
query_embedding_list,
|
||||
content_embedding_list,
|
||||
similarity_metric=self.vector_store.similarity_metric,
|
||||
),
|
||||
query_embeddings,
|
||||
content_embeddings,
|
||||
)
|
||||
)
|
||||
return ids, score_result
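# Minimal usage sketch (illustrative; assumes an AutoRAG project directory with an
# already-ingested vector store, and a previous_result DataFrame holding the queries
# produced by the previous node):
#
#   result_df = VectorDB.run_evaluator(
#       project_dir="./my_project",      # hypothetical path
#       previous_result=previous_result,
#       vectordb="default",
#       top_k=5,
#   )
#   # result_df contains "retrieved_contents", "retrieved_ids" and "retrieve_scores".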
|
||||
|
||||
|
||||
async def vectordb_pure(
|
||||
queries: List[str], top_k: int, vectordb: BaseVectorStore
|
||||
) -> Tuple[List[str], List[float]]:
|
||||
"""
|
||||
Async VectorDB retrieval function.
|
||||
Its usage is for async retrieval of vector_db row by row.
|
||||
|
||||
:param queries: A list of query strings for a single row.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param vectordb: The vector store instance.
|
||||
:return: The tuple contains a list of passage ids retrieved from the vectordb and a list of their scores.
|
||||
"""
|
||||
id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)
|
||||
|
||||
# Distribute passages evenly
|
||||
id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
|
||||
# sort id_result and score_result by score
|
||||
result = [
|
||||
(_id, score)
|
||||
for score, _id in sorted(
|
||||
zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
|
||||
)
|
||||
]
|
||||
id_result, score_result = zip(*result)
|
||||
return list(id_result), list(score_result)
|
||||
|
||||
|
||||
async def filter_exist_ids(
|
||||
vectordb: BaseVectorStore,
|
||||
corpus_data: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
corpus_data = cast_corpus_dataset(corpus_data)
|
||||
validate_corpus_dataset(corpus_data)
|
||||
ids = corpus_data["doc_id"].tolist()
|
||||
|
||||
# Query the collection to check if IDs already exist
|
||||
existed_bool_list = await vectordb.is_exist(ids=ids)
|
||||
# existed_bool_list[i] is True when ids[i] already exists in the collection
|
||||
new_passage = corpus_data[~pd.Series(existed_bool_list)]
|
||||
return new_passage
|
||||
|
||||
|
||||
async def filter_exist_ids_from_retrieval_gt(
|
||||
vectordb: BaseVectorStore,
|
||||
qa_data: pd.DataFrame,
|
||||
corpus_data: pd.DataFrame,
|
||||
) -> pd.DataFrame:
|
||||
qa_data = cast_qa_dataset(qa_data)
|
||||
validate_qa_dataset(qa_data)
|
||||
corpus_data = cast_corpus_dataset(corpus_data)
|
||||
validate_corpus_dataset(corpus_data)
|
||||
retrieval_gt = (
|
||||
qa_data["retrieval_gt"]
|
||||
.apply(lambda x: list(itertools.chain.from_iterable(x)))
|
||||
.tolist()
|
||||
)
|
||||
retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
|
||||
retrieval_gt = list(set(retrieval_gt))
|
||||
|
||||
existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
|
||||
add_ids = []
|
||||
for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
|
||||
if not is_exist:
|
||||
add_ids.append(ret_gt)
|
||||
new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
|
||||
return new_passage
|
||||
|
||||
|
||||
async def vectordb_ingest(
|
||||
vectordb: BaseVectorStore,
|
||||
corpus_data: pd.DataFrame,
|
||||
):
|
||||
"""
|
||||
Ingest given corpus data to the vectordb.
|
||||
When the embedding model is OpenAIEmbedding, the corpus contents are truncated to 8,000 tokens.
Corpus contents that are empty (whitespace only) are ignored,
and document ids that already exist in the collection are ignored as well.
|
||||
|
||||
:param vectordb: A vector store instance that you want to ingest into.
|
||||
:param corpus_data: The corpus data that contains doc_id and contents columns.
|
||||
"""
|
||||
embedding_batch = vectordb.embedding_batch
|
||||
if not corpus_data.empty:
|
||||
new_contents = corpus_data["contents"].tolist()
|
||||
new_ids = corpus_data["doc_id"].tolist()
|
||||
content_batches = make_batch(new_contents, embedding_batch)
|
||||
id_batches = make_batch(new_ids, embedding_batch)
|
||||
for content_batch, id_batch in zip(content_batches, id_batches):
|
||||
await vectordb.add(ids=id_batch, texts=content_batch)
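# Minimal usage sketch (illustrative; assumes a loaded vector store instance and a
# corpus DataFrame with "doc_id" and "contents" columns):
#
#   import asyncio
#   asyncio.run(vectordb_ingest(vectordb, corpus_df))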
|
||||
|
||||
|
||||
def run_query_embedding_batch(
|
||||
queries: List[str], embedding_model: BaseEmbedding, batch_size: int
|
||||
) -> List[List[float]]:
|
||||
result = []
|
||||
for i in range(0, len(queries), batch_size):
|
||||
batch = queries[i : i + batch_size]
|
||||
embeddings = embedding_model.get_text_embedding_batch(batch)
|
||||
result.extend(embeddings)
|
||||
return result
|
||||
|
||||
|
||||
@convert_inputs_to_list
|
||||
def get_id_scores(  # Finds the scores that were not computed when fusing results for hybrid retrieval
query_embeddings: List[List[float]],  # The embeddings of a single user query (and its expanded queries)
content_embeddings: List[List[float]],
similarity_metric: str,
) -> List[float]:  # The highest score per content; the result length equals the number of contents
|
||||
"""
|
||||
Calculate the highest similarity scores between query embeddings and content embeddings.
|
||||
|
||||
:param query_embeddings: A list of lists containing query embeddings.
|
||||
:param content_embeddings: A list of lists containing content embeddings.
|
||||
:param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
|
||||
:return: A list of the highest similarity scores for each content embedding.
|
||||
"""
|
||||
metric_func_dict = {
|
||||
"l2": lambda x, y: 1 - calculate_l2_distance(x, y),
|
||||
"ip": calculate_inner_product,
|
||||
"cosine": calculate_cosine_similarity,
|
||||
}
|
||||
metric_func = metric_func_dict[similarity_metric]
|
||||
|
||||
result = []
|
||||
for content_embedding in content_embeddings:
|
||||
scores = []
|
||||
for query_embedding in query_embeddings:
|
||||
scores.append(
|
||||
metric_func(np.array(query_embedding), np.array(content_embedding))
|
||||
)
|
||||
result.append(max(scores))
|
||||
return result
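# Illustrative sketch with toy 2-d embeddings: the single query embedding below
# matches the second content vector exactly, so its cosine score is the highest.
#
#   scores = get_id_scores(
#       query_embeddings=[[1.0, 0.0]],
#       content_embeddings=[[0.0, 1.0], [1.0, 0.0]],
#       similarity_metric="cosine",
#   )
#   # scores ≈ [0.0, 1.0]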
|
||||
16
autorag/nodes/util.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from typing import Optional, Dict
|
||||
|
||||
from autorag.support import get_support_modules
|
||||
|
||||
|
||||
def make_generator_callable_param(generator_dict: Optional[Dict]):
|
||||
if "generator_module_type" not in generator_dict.keys():
|
||||
generator_dict = {
|
||||
"generator_module_type": "llama_index_llm",
|
||||
"llm": "openai",
|
||||
"model": "gpt-4o-mini",
|
||||
}
|
||||
module_str = generator_dict.pop("generator_module_type")
|
||||
module_class = get_support_modules(module_str)
|
||||
module_param = generator_dict
|
||||
return module_class, module_param
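# Illustrative usage: with {"generator_module_type": "llama_index_llm",
# "llm": "openai", "model": "gpt-4o-mini"} this returns the llama_index_llm module
# class together with the remaining params {"llm": "openai", "model": "gpt-4o-mini"}.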
|
||||