Fix Dockerfile build issue
autorag/nodes/generator/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from .llama_index_llm import LlamaIndexLLM
from .openai_llm import OpenAILLM
from .vllm import Vllm
from .vllm_api import VllmAPI
autorag/nodes/generator/base.py (new file, 103 lines)
@@ -0,0 +1,103 @@
import abc
import functools
import logging
from pathlib import Path
from typing import Union, Tuple, List

import pandas as pd
from llama_index.core.output_parsers import PydanticOutputParser

from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe

logger = logging.getLogger("AutoRAG")


class BaseGenerator(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, llm: str, *args, **kwargs):
        logger.info(f"Initialize generator node - {self.__class__.__name__}")
        self.llm = llm

    def __del__(self):
        logger.info(f"Deleting generator module - {self.__class__.__name__}")

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(f"Running generator node - {self.__class__.__name__} module...")
        assert (
            "prompts" in previous_result.columns
        ), "previous_result must contain prompts column."
        prompts = previous_result["prompts"].tolist()
        return prompts

    def structured_output(self, prompts: List[str], output_cls):
        response, _, _ = self._pure(prompts)
        parser = PydanticOutputParser(output_cls)
        result = []
        for res in response:
            try:
                result.append(parser.parse(res))
            except Exception as e:
                logger.warning(
                    f"Error parsing response: {e}\nReturning None instead for this response."
                )
                result.append(None)
        return result

    @abc.abstractmethod
    async def astream(self, prompt: str, **kwargs):
        pass

    @abc.abstractmethod
    def stream(self, prompt: str, **kwargs):
        pass


def generator_node(func):
    @functools.wraps(func)
    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def wrapper(
        project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        This decorator makes a generator module into a node.
        It automatically extracts prompts from previous_result and runs the generator function.
        Plus, it retrieves the llm instance from autorag.generator_models.

        :param project_dir: The project directory.
        :param previous_result: The previous result that contains prompts.
        :param llm: The llm name that you want to use.
        :param kwargs: The extra parameters for initializing the llm instance.
        :return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs.
            Each column is "generated_texts", "generated_tokens", and "generated_log_probs".
        """
        logger.info(f"Running generator node - {func.__name__} module...")
        assert (
            "prompts" in previous_result.columns
        ), "previous_result must contain prompts column."
        prompts = previous_result["prompts"].tolist()
        if func.__name__ == "llama_index_llm":
            if llm not in generator_models:
                raise ValueError(
                    f"{llm} is not a valid llm name. Please check the llm name. "
                    "You can check valid llm names from autorag.generator_models."
                )
            batch = kwargs.pop("batch", 16)
            if llm == "huggingfacellm":
                model_name = kwargs.pop("model", None)
                if model_name is not None:
                    kwargs["model_name"] = model_name
                else:
                    if "model_name" not in kwargs.keys():
                        raise ValueError(
                            "`model` or `model_name` parameter must be provided for using huggingfacellm."
                        )
                kwargs["tokenizer_name"] = kwargs["model_name"]
            llm_instance = generator_models[llm](**kwargs)
            result = func(prompts=prompts, llm=llm_instance, batch=batch)
            del llm_instance
            return result
        else:
            return func(prompts=prompts, llm=llm, **kwargs)

    return wrapper
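Note: for orientation, a minimal sketch of how a function-style generator plugs into `generator_node`. The `echo_generator` module and its constant token ids and log probs are hypothetical; the point is only the (texts, token ids, log probs) contract that the decorator turns into a dataframe.

from typing import List, Tuple

import pandas as pd

from autorag.nodes.generator.base import generator_node


@generator_node
def echo_generator(
    prompts: List[str], llm, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
    # Hypothetical module: echoes each prompt back and fakes token ids / log probs,
    # matching the three-list return shape that the decorator expects.
    texts = [f"echo: {p}" for p in prompts]
    tokens = [list(range(len(t))) for t in texts]
    log_probs = [[0.0] * len(t) for t in tokens]
    return texts, tokens, log_probs


# previous_result must carry a "prompts" column, as asserted in the wrapper above.
df = pd.DataFrame({"prompts": ["hello", "world"]})
result = echo_generator(project_dir=".", previous_result=df, llm="mock")
print(result["generated_texts"].tolist())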
autorag/nodes/generator/llama_index_llm.py (new file, 97 lines)
@@ -0,0 +1,97 @@
from typing import List, Tuple

import pandas as pd
from llama_index.core.base.llms.base import BaseLLM
from transformers import AutoTokenizer

from autorag import generator_models
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    result_to_dataframe,
    pop_params,
)


class LlamaIndexLLM(BaseGenerator):
    def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs):
        """
        Initialize the Llama Index LLM module.

        :param project_dir: The project directory.
        :param llm: A llama index LLM name registered in autorag.generator_models.
        :param batch: The batch size for the llm.
            Set it low if you face errors.
            Default is 16.
        :param kwargs: The extra parameters for initializing the llm instance.
        """
        super().__init__(project_dir=project_dir, llm=llm)
        if self.llm not in generator_models.keys():
            raise ValueError(
                f"{self.llm} is not a valid llm name. Please check the llm name. "
                "You can check valid llm names from autorag.generator_models."
            )
        self.batch = batch
        llm_class = generator_models[self.llm]

        if llm_class.class_name() in [
            "HuggingFace_LLM",
            "HuggingFaceInferenceAPI",
            "TextGenerationInference",
        ]:
            model_name = kwargs.pop("model", None)
            if model_name is not None:
                kwargs["model_name"] = model_name
            else:
                if "model_name" not in kwargs.keys():
                    raise ValueError(
                        "`model` or `model_name` parameter must be provided for using huggingfacellm."
                    )
            kwargs["tokenizer_name"] = kwargs["model_name"]
        self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs))

    def __del__(self):
        super().__del__()
        del self.llm_instance

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result=previous_result)
        return self._pure(prompts)

    def _pure(
        self,
        prompts: List[str],
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Llama Index LLM module.
        It gets the LLM instance from llama index and returns generated text for the input prompts.
        It does not produce real log probs; it returns pseudo log probs,
        which are not meant to be used by other modules.

        :param prompts: A list of prompts.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of generated texts' token ids (tokenized with GPT2Tokenizer).
            The third element is a list of generated texts' pseudo log probs.
        """
        tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
        loop = get_event_loop()
        results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))

        generated_texts = list(map(lambda x: x.text, results))
        tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
        tokenized_ids = tokenizer(generated_texts).data["input_ids"]
        pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
        return generated_texts, tokenized_ids, pseudo_log_probs

    async def astream(self, prompt: str, **kwargs):
        async for completion_response in await self.llm_instance.astream_complete(
            prompt
        ):
            yield completion_response.text

    def stream(self, prompt: str, **kwargs):
        for completion_response in self.llm_instance.stream_complete(prompt):
            yield completion_response.text
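Note: a minimal usage sketch, assuming "openai" is a registered key in autorag.generator_models, the OPENAI_API_KEY environment variable is set, and "gpt-4o-mini" is an accepted `model` kwarg for that llama index class.

import pandas as pd

from autorag.nodes.generator import LlamaIndexLLM

# previous_result would normally come from the prompt maker node.
previous_result = pd.DataFrame({"prompts": ["Say hello in one word."]})

generator = LlamaIndexLLM(
    project_dir=".",
    llm="openai",         # must be a key of autorag.generator_models
    batch=4,
    model="gpt-4o-mini",  # extra kwargs are forwarded to the llama index LLM class
)
result_df = generator.pure(previous_result)
print(result_df["generated_texts"].tolist())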
autorag/nodes/generator/openai_llm.py (new file, 296 lines)
@@ -0,0 +1,296 @@
import logging
from typing import List, Tuple

import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from tiktoken import Encoding

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    pop_params,
    result_to_dataframe,
)

logger = logging.getLogger("AutoRAG")

MAX_TOKEN_DICT = {  # model name : token limit
    "gpt-4.5-preview": 128_000,
    "gpt-4.5-preview-2025-02-27": 128_000,
    "o1": 200_000,
    "o1-preview": 128_000,
    "o1-preview-2024-09-12": 128_000,
    "o1-mini": 128_000,
    "o1-mini-2024-09-12": 128_000,
    "o3-mini": 200_000,
    "gpt-4o-mini": 128_000,
    "gpt-4o-mini-2024-07-18": 128_000,
    "gpt-4o": 128_000,
    "gpt-4o-2024-08-06": 128_000,
    "gpt-4o-2024-05-13": 128_000,
    "chatgpt-4o-latest": 128_000,
    "gpt-4-turbo": 128_000,
    "gpt-4-turbo-2024-04-09": 128_000,
    "gpt-4-turbo-preview": 128_000,
    "gpt-4-0125-preview": 128_000,
    "gpt-4-1106-preview": 128_000,
    "gpt-4-vision-preview": 128_000,
    "gpt-4-1106-vision-preview": 128_000,
    "gpt-4": 8_192,
    "gpt-4-0613": 8_192,
    "gpt-4-32k": 32_768,
    "gpt-4-32k-0613": 32_768,
    "gpt-3.5-turbo-0125": 16_385,
    "gpt-3.5-turbo": 16_385,
    "gpt-3.5-turbo-1106": 16_385,
    "gpt-3.5-turbo-instruct": 4_096,
    "gpt-3.5-turbo-16k": 16_385,
    "gpt-3.5-turbo-0613": 4_096,
    "gpt-3.5-turbo-16k-0613": 16_385,
}


class OpenAILLM(BaseGenerator):
    def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
        super().__init__(project_dir, llm, *args, **kwargs)
        assert batch > 0, "batch size must be greater than 0."
        self.batch = batch

        client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
        self.client = AsyncOpenAI(**client_init_params)

        if self.llm.startswith("gpt-4.5"):
            self.tokenizer = tiktoken.get_encoding("o200k_base")
        else:
            self.tokenizer = tiktoken.encoding_for_model(self.llm)

        max_token_size = MAX_TOKEN_DICT.get(self.llm)
        if max_token_size is None:
            raise ValueError(
                f"Model {self.llm} is not supported. "
                f"Please select a model from {list(MAX_TOKEN_DICT.keys())}."
            )
        self.max_token_size = max_token_size - 7  # because of chat token usage

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self,
        prompts: List[str],
        truncate: bool = True,
        **kwargs,
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        OpenAI generator module.
        Uses the official openai library for generating an answer from the given prompt.
        It returns real token ids and log probs, so use this module when you need token ids and log probs.

        :param prompts: A list of prompts.
        :param llm: A model name for openai.
            Default is gpt-3.5-turbo.
        :param batch: Batch size for the openai api call.
            If you get API limit errors, lower the batch size.
            Default is 16.
        :param truncate: Whether to truncate the input prompt.
            Default is True.
        :param api_key: OpenAI API key. You can also set this via the env variable `OPENAI_API_KEY`.
        :param kwargs: The optional parameters for the openai chat completion api call.
            See https://platform.openai.com/docs/api-reference/chat/create for more details.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of generated texts' token ids.
            The third element is a list of generated texts' log probs.
        """
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The logprobs parameter has no effect. It is always set to True."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The n parameter has no effect. It is always set to 1.")

        # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
        if truncate:
            prompts = list(
                map(
                    lambda prompt: truncate_by_token(
                        prompt, self.tokenizer, self.max_token_size
                    ),
                    prompts,
                )
            )

        openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
        loop = get_event_loop()
        if self.llm.startswith("o1") or self.llm.startswith("o3"):
            tasks = [
                self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts
            ]
        else:
            tasks = [
                self.get_result(prompt, **openai_chat_params) for prompt in prompts
            ]
        result = loop.run_until_complete(process_batch(tasks, self.batch))
        answer_result = list(map(lambda x: x[0], result))
        token_result = list(map(lambda x: x[1], result))
        logprob_result = list(map(lambda x: x[2], result))
        return answer_result, token_result, logprob_result

    def structured_output(self, prompts: List[str], output_cls, **kwargs):
        supported_models = [
            "gpt-4o-mini-2024-07-18",
            "gpt-4o-2024-08-06",
        ]
        if self.llm not in supported_models:
            raise ValueError(
                f"{self.llm} is not a valid model name for structured output. "
                f"Please select a model from {supported_models}."
            )

        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The logprobs parameter has no effect. It is always set to False."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The n parameter has no effect. It is always set to 1.")

        # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not supported yet.
        prompts = list(
            map(
                lambda prompt: truncate_by_token(
                    prompt, self.tokenizer, self.max_token_size
                ),
                prompts,
            )
        )

        openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs)
        loop = get_event_loop()
        tasks = [
            self.get_structured_result(prompt, output_cls, **openai_chat_params)
            for prompt in prompts
        ]
        result = loop.run_until_complete(process_batch(tasks, self.batch))
        return result

    async def astream(self, prompt: str, **kwargs):
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The logprobs parameter has no effect. It is always set to False."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The n parameter has no effect. It is always set to 1.")

        prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size)

        openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)

        stream = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=False,
            n=1,
            stream=True,
            **openai_chat_params,
        )
        result = ""
        async for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                result += chunk.choices[0].delta.content
                yield result

    def stream(self, prompt: str, **kwargs):
        raise NotImplementedError("stream method is not implemented yet.")

    async def get_structured_result(self, prompt: str, output_cls, **kwargs):
        logprobs = True
        if self.llm.startswith("gpt-4.5"):
            logprobs = False
        response = await self.client.beta.chat.completions.parse(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            response_format=output_cls,
            logprobs=logprobs,
            n=1,
            **kwargs,
        )
        return response.choices[0].message.parsed

    async def get_result(self, prompt: str, **kwargs):
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        logprobs = True
        if self.llm.startswith("gpt-4.5"):
            logprobs = False
        response = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=logprobs,
            n=1,
            **kwargs,
        )
        choice = response.choices[0]
        answer = choice.message.content
        # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
        if self.llm.startswith("gpt-4.5"):
            tokens = self.tokenizer.encode(answer, allowed_special="all")
            logprobs = [0.5] * len(tokens)
            logger.warning("gpt-4.5-preview does not support logprobs yet.")
        else:
            logprobs = list(map(lambda x: x.logprob, choice.logprobs.content))
            tokens = list(
                map(
                    lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0],
                    choice.logprobs.content,
                )
            )
        assert len(tokens) == len(
            logprobs
        ), "tokens and logprobs size is different."
        return answer, tokens, logprobs

    async def get_result_o1(self, prompt: str, **kwargs):
        assert self.llm.startswith("o1") or self.llm.startswith(
            "o3"
        ), "This function only supports o1 or o3 models."
        # The default temperature for the o1 models is 1, and only 1 is supported.
        # See https://platform.openai.com/docs/guides/reasoning about beta limitations of o1 models.
        kwargs["temperature"] = 1
        kwargs["top_p"] = 1
        kwargs["presence_penalty"] = 0
        kwargs["frequency_penalty"] = 0
        response = await self.client.chat.completions.create(
            model=self.llm,
            messages=[
                {"role": "user", "content": prompt},
            ],
            logprobs=False,
            n=1,
            **kwargs,
        )
        answer = response.choices[0].message.content
        tokens = self.tokenizer.encode(answer, allowed_special="all")
        pseudo_log_probs = [0.5] * len(tokens)
        return answer, tokens, pseudo_log_probs


def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int):
    tokens = tokenizer.encode(prompt, allowed_special="all")
    return tokenizer.decode(tokens[:max_token_size])
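Note: the prompt truncation above is plain tiktoken round-tripping. A standalone sketch of the same idea, using the generic cl100k_base encoding instead of a model-specific mapping:

import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")


def truncate_by_token(prompt: str, max_token_size: int) -> str:
    # Encode, cut the token list at the limit, and decode back to text.
    tokens = tokenizer.encode(prompt, allowed_special="all")
    return tokenizer.decode(tokens[:max_token_size])


long_prompt = "lorem ipsum " * 10_000
short_prompt = truncate_by_token(long_prompt, max_token_size=128)
print(len(tokenizer.encode(short_prompt)))  # at most 128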
autorag/nodes/generator/run.py (new file, 144 lines)
@@ -0,0 +1,144 @@
import os
import pathlib
from typing import List, Dict, Union

import pandas as pd

from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list


def run_generator_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    """
    Run evaluation and select the best module among generator node results.
    Also save the results and summary to the generator node directory.

    :param modules: Generator modules to run.
    :param module_params: Generator module parameters.
        Including node parameters, which are used for every module in this node.
    :param previous_result: Previous result dataframe.
        Could be the prompt maker node's result.
    :param node_line_dir: This node line's directory.
    :param strategies: Strategies for the generator node.
    :return: The best result dataframe.
        It contains the previous result columns and the generator node's result columns.
    """
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent
    node_dir = os.path.join(node_line_dir, "generator")  # node name
    if not os.path.exists(node_dir):
        os.makedirs(node_dir)
    qa_data = pd.read_parquet(
        os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
    )
    if "generation_gt" not in qa_data.columns:
        raise ValueError("You must have 'generation_gt' column in qa.parquet.")

    results, execution_times = zip(
        *map(
            lambda x: measure_speed(
                x[0].run_evaluator,
                project_dir=project_dir,
                previous_result=previous_result,
                **x[1],
            ),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    # get average token usage
    token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results))

    # make rows to metric_inputs
    generation_gt = to_list(qa_data["generation_gt"].tolist())

    metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]

    metric_names, metric_params = cast_metrics(strategies.get("metrics"))
    if metric_names is None or len(metric_names) <= 0:
        raise ValueError("You must specify at least one metric for generator evaluation.")
    results = list(
        map(
            lambda result: evaluate_generator_node(
                result, metric_inputs, strategies.get("metrics")
            ),
            results,
        )
    )

    # save results to folder
    filepaths = list(
        map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(
        map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
    )  # execute save to parquet
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
            "average_output_token": token_usages,
            **{
                metric: list(map(lambda x: x[metric].mean(), results))
                for metric in metric_names
            },
        }
    )

    # filter by strategies
    if strategies.get("speed_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, average_times, strategies["speed_threshold"], filenames
        )
    if strategies.get("token_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, token_usages, strategies["token_threshold"], filenames
        )
    selected_result, selected_filename = select_best(
        results, metric_names, filenames, strategies.get("strategy", "mean")
    )
    best_result = pd.concat([previous_result, selected_result], axis=1)

    # add 'is_best' column at summary file
    summary_df["is_best"] = summary_df["filename"] == selected_filename

    # save files
    summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
    best_result.to_parquet(
        os.path.join(
            node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
        ),
        index=False,
    )
    return best_result


def evaluate_generator_node(
    result_df: pd.DataFrame,
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
):
    @evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
    def evaluate_generation_module(df: pd.DataFrame):
        return (
            df["generated_texts"].tolist(),
            df["generated_tokens"].tolist(),
            df["generated_log_probs"].tolist(),
        )

    return evaluate_generation_module(result_df)
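Note: a sketch of the kind of strategies dict run_generator_node consumes; the metric names and threshold values below are illustrative, not prescriptive.

strategies = {
    # metrics can be plain names or dicts with extra params; cast_metrics handles both forms
    "metrics": ["bleu", "meteor", "rouge"],
    # drop modules whose average latency per row exceeds this many seconds
    "speed_threshold": 10,
    # drop modules whose average output token count exceeds this limit
    "token_threshold": 1024,
    # how to aggregate metric columns when selecting the best module
    "strategy": "mean",
}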
autorag/nodes/generator/vllm.py (new file, 121 lines)
@@ -0,0 +1,121 @@
import gc
from copy import deepcopy
from typing import List, Tuple

import pandas as pd

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils import result_to_dataframe
from autorag.utils.util import pop_params, to_list


class Vllm(BaseGenerator):
    def __init__(self, project_dir: str, llm: str, **kwargs):
        super().__init__(project_dir, llm, **kwargs)
        try:
            from vllm import SamplingParams, LLM
        except ImportError:
            raise ImportError(
                "Please install the vllm library. You can install it by running `pip install vllm`."
            )

        model_from_kwargs = kwargs.pop("model", None)
        model = llm if model_from_kwargs is None else model_from_kwargs

        input_kwargs = deepcopy(kwargs)
        sampling_params_init_params = pop_params(
            SamplingParams.from_optional, input_kwargs
        )
        self.vllm_model = LLM(model, **input_kwargs)

        # delete keys that are not sampling params from kwargs
        kwargs_keys = list(kwargs.keys())
        for key in kwargs_keys:
            if key not in sampling_params_init_params:
                kwargs.pop(key)

    def __del__(self):
        try:
            import torch
            import contextlib

            if torch.cuda.is_available():
                from vllm.distributed.parallel_state import (
                    destroy_model_parallel,
                    destroy_distributed_environment,
                )

                destroy_model_parallel()
                destroy_distributed_environment()
                del self.vllm_model.llm_engine.model_executor
                del self.vllm_model
                with contextlib.suppress(AssertionError):
                    torch.distributed.destroy_process_group()
                gc.collect()
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        except ImportError:
            del self.vllm_model

        super().__del__()

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self, prompts: List[str], **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Vllm module.
        It gets the vLLM instance and returns generated texts for the input prompts.
        You can set logprobs to get the log probs of the generated text.
        The default logprobs is 1.

        :param prompts: A list of prompts.
        :param kwargs: The extra parameters for generating the text.
        :return: A tuple of three elements.
            The first element is a list of generated texts.
            The second element is a list of generated texts' token ids.
            The third element is a list of generated texts' log probs.
        """
        try:
            from vllm.outputs import RequestOutput
            from vllm.sequence import SampleLogprobs
            from vllm import SamplingParams
        except ImportError:
            raise ImportError(
                "Please install the vllm library. You can install it by running `pip install vllm`."
            )

        if "logprobs" not in kwargs:
            kwargs["logprobs"] = 1

        sampling_params = pop_params(SamplingParams.from_optional, kwargs)
        generate_params = SamplingParams(**sampling_params)
        results: List[RequestOutput] = self.vllm_model.generate(
            prompts, generate_params
        )
        generated_texts = list(map(lambda x: x.outputs[0].text, results))
        generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results))
        log_probs: List[SampleLogprobs] = list(
            map(lambda x: x.outputs[0].logprobs, results)
        )
        generated_log_probs = list(
            map(
                lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))),
                zip(log_probs, generated_token_ids),
            )
        )
        return (
            to_list(generated_texts),
            to_list(generated_token_ids),
            to_list(generated_log_probs),
        )

    async def astream(self, prompt: str, **kwargs):
        raise NotImplementedError

    def stream(self, prompt: str, **kwargs):
        raise NotImplementedError
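Note: the nested map/zip that extracts log probs above is dense. A small sketch with plain dictionaries (standing in for vLLM's Logprob objects, whose `.logprob` attribute the real code reads) shows what it computes for a single output:

# One output: token ids chosen by the sampler, and per-position logprob dicts
# keyed by candidate token id (only the chosen token's entry matters here).
token_ids = [11, 42, 7]
logprob_dicts = [
    {11: -0.1},
    {42: -0.8},
    {7: -0.3},
]

# For each position, look up the chosen token id in that position's dict.
chosen_logprobs = [per_pos[tok_id] for per_pos, tok_id in zip(logprob_dicts, token_ids)]
print(chosen_logprobs)  # [-0.1, -0.8, -0.3]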
autorag/nodes/generator/vllm_api.py (new file, 176 lines)
@@ -0,0 +1,176 @@
import logging
from typing import List, Tuple
import time

import pandas as pd
import requests
from asyncio import to_thread

from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe

logger = logging.getLogger("AutoRAG")

DEFAULT_MAX_TOKENS = 4096  # Default token limit


class VllmAPI(BaseGenerator):
    def __init__(
        self,
        project_dir,
        llm: str,
        uri: str,
        max_tokens: int = None,
        batch: int = 16,
        *args,
        **kwargs,
    ):
        """
        vLLM API wrapper for the OpenAI-compatible chat/completions format.

        :param project_dir: Project directory.
        :param llm: Model name (e.g., a LLaMA model).
        :param uri: vLLM API server URI.
        :param max_tokens: Maximum token limit.
            Default is 4096.
        :param batch: Request batch size.
            Default is 16.
        """
        super().__init__(project_dir, llm, *args, **kwargs)
        assert batch > 0, "Batch size must be greater than 0."
        self.uri = uri.rstrip("/")  # Set API URI
        self.batch = batch
        # Use the provided max_tokens if available, otherwise use the default
        self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
        self.max_model_len = self.get_max_model_length()
        logger.info(f"{llm} max model length: {self.max_model_len}")

    @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        prompts = self.cast_to_run(previous_result)
        return self._pure(prompts, **kwargs)

    def _pure(
        self, prompts: List[str], truncate: bool = True, **kwargs
    ) -> Tuple[List[str], List[List[int]], List[List[float]]]:
        """
        Call the vLLM API to generate text.

        :param prompts: List of input prompts.
        :param truncate: Whether to truncate input prompts to fit within the token limit.
        :param kwargs: Additional options (e.g., temperature, top_p).
        :return: Generated texts, token lists, and log probability lists.
        """
        if kwargs.get("logprobs") is not None:
            kwargs.pop("logprobs")
            logger.warning(
                "The logprobs parameter has no effect. It is always set to True."
            )
        if kwargs.get("n") is not None:
            kwargs.pop("n")
            logger.warning("The n parameter has no effect. It is always set to 1.")

        if truncate:
            prompts = list(map(lambda p: self.truncate_by_token(p), prompts))
        loop = get_event_loop()
        tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts]
        results = loop.run_until_complete(process_batch(tasks, self.batch))

        answer_result = list(map(lambda x: x[0], results))
        token_result = list(map(lambda x: x[1], results))
        logprob_result = list(map(lambda x: x[2], results))
        return answer_result, token_result, logprob_result

    def truncate_by_token(self, prompt: str) -> str:
        """
        Truncate a prompt so it fits within the maximum token limit.
        """
        tokens = self.encoding_for_model(prompt)["tokens"]  # Simple tokenization
        return self.decoding_for_model(tokens[: self.max_model_len])["prompt"]

    def call_vllm_api(self, prompt: str, **kwargs) -> dict:
        """
        Calls the vLLM API to get chat/completions responses.

        :param prompt: Input prompt.
        :param kwargs: Additional API options (e.g., temperature, max_tokens).
        :return: API response.
        """
        payload = {
            "model": self.llm,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": kwargs.get("temperature", 0.4),
            "max_tokens": min(
                kwargs.get("max_tokens", self.max_token_size), self.max_token_size
            ),
            "logprobs": True,
            "n": 1,
        }
        start_time = time.time()  # Record request start time
        response = requests.post(f"{self.uri}/v1/chat/completions", json=payload)
        end_time = time.time()  # Record request end time

        response.raise_for_status()
        elapsed_time = end_time - start_time  # Calculate elapsed time
        logger.info(
            f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds"
        )
        return response.json()

    # Additional methods: abstract method implementations
    async def astream(self, prompt: str, **kwargs):
        """
        Asynchronous streaming method, not implemented yet.
        """
        raise NotImplementedError("astream method is not implemented for VLLM API yet.")

    def stream(self, prompt: str, **kwargs):
        """
        Synchronous streaming method, not implemented yet.
        """
        raise NotImplementedError("stream method is not implemented for VLLM API yet.")

    def get_result(self, prompt: str, **kwargs):
        response = self.call_vllm_api(prompt, **kwargs)
        choice = response["choices"][0]
        answer = choice["message"]["content"]

        # Handle cases where logprobs is None
        if choice.get("logprobs") and "content" in choice["logprobs"]:
            logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"]))
            tokens = list(
                map(
                    lambda x: self.encoding_for_model(x["token"])["tokens"],
                    choice["logprobs"]["content"],
                )
            )
        else:
            logprobs = []
            tokens = []

        return answer, tokens, logprobs

    def encoding_for_model(self, answer_piece: str):
        payload = {
            "model": self.llm,
            "prompt": answer_piece,
            "add_special_tokens": True,
        }
        response = requests.post(f"{self.uri}/tokenize", json=payload)
        response.raise_for_status()
        return response.json()

    def decoding_for_model(self, tokens: list[int]):
        payload = {
            "model": self.llm,
            "tokens": tokens,
        }
        response = requests.post(f"{self.uri}/detokenize", json=payload)
        response.raise_for_status()
        return response.json()

    def get_max_model_length(self):
        response = requests.get(f"{self.uri}/v1/models")
        response.raise_for_status()
        json_data = response.json()
        return json_data["data"][0]["max_model_len"]
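Note: a minimal usage sketch, assuming a vLLM server with the OpenAI-compatible entrypoint is already running at the given URI and serving the named model; the model name and URI below are placeholders.

import pandas as pd

from autorag.nodes.generator import VllmAPI

# Assumes a server started with vLLM's OpenAI-compatible entrypoint, e.g.
#   python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct
previous_result = pd.DataFrame({"prompts": ["Summarize AutoRAG in one sentence."]})

generator = VllmAPI(
    project_dir=".",
    llm="meta-llama/Llama-3.1-8B-Instruct",  # must match the model served at the URI
    uri="http://localhost:8000",
    max_tokens=512,
    batch=8,
)
result_df = generator.pure(previous_result)
print(result_df["generated_texts"].tolist())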