Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

View File

@@ -0,0 +1,4 @@
from .llama_index_llm import LlamaIndexLLM
from .openai_llm import OpenAILLM
from .vllm import Vllm
from .vllm_api import VllmAPI

View File

@@ -0,0 +1,103 @@
import abc
import functools
import logging
from pathlib import Path
from typing import Union, Tuple, List
import pandas as pd
from llama_index.core.output_parsers import PydanticOutputParser
from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe
logger = logging.getLogger("AutoRAG")
class BaseGenerator(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, llm: str, *args, **kwargs):
logger.info(f"Initialize generator node - {self.__class__.__name__}")
self.llm = llm
def __del__(self):
logger.info(f"Deleting generator module - {self.__class__.__name__}")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running generator node - {self.__class__.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
return prompts
def structured_output(self, prompts: List[str], output_cls):
response, _, _ = self._pure(prompts)
parser = PydanticOutputParser(output_cls)
result = []
for res in response:
try:
result.append(parser.parse(res))
except Exception as e:
logger.warning(
f"Error parsing response: {e}\nReturning None for this response instead."
)
result.append(None)
return result
@abc.abstractmethod
async def astream(self, prompt: str, **kwargs):
pass
@abc.abstractmethod
def stream(self, prompt: str, **kwargs):
pass
def generator_node(func):
@functools.wraps(func)
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def wrapper(
project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
This decorator turns a generator function into a generator node.
It automatically extracts prompts from previous_result and runs the generator function.
It also retrieves the llm instance from autorag.generator_models.
:param project_dir: The project directory.
:param previous_result: The previous result that contains prompts.
:param llm: The llm name that you want to use.
:param kwargs: The extra parameters for initializing the llm instance.
:return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs.
Each column is "generated_texts", "generated_tokens", and "generated_log_probs".
"""
logger.info(f"Running generator node - {func.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
if func.__name__ == "llama_index_llm":
if llm not in generator_models:
raise ValueError(
f"{llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names from autorag.generator_models."
)
batch = kwargs.pop("batch", 16)
if llm == "huggingfacellm":
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"A `model` or `model_name` parameter must be provided when using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
llm_instance = generator_models[llm](**kwargs)
result = func(prompts=prompts, llm=llm_instance, batch=batch)
del llm_instance
return result
else:
return func(prompts=prompts, llm=llm, **kwargs)
return wrapper
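
To make the decorator above concrete, here is a minimal usage sketch of how a function-style generator module would be wrapped by generator_node. The echo_generator function, its dummy outputs, and the project path are illustrative, not part of this commit.

from typing import List, Tuple

import pandas as pd

from autorag.nodes.generator.base import generator_node


@generator_node
def echo_generator(prompts: List[str], llm, **kwargs) -> Tuple[List[str], List[List[int]], List[List[float]]]:
    # A toy module: every function-style generator returns (texts, token ids, log probs).
    texts = [f"echo: {p}" for p in prompts]
    tokens = [[0] * len(t.split()) for t in texts]
    log_probs = [[0.0] * len(tok) for tok in tokens]
    return texts, tokens, log_probs


# The wrapper pulls the "prompts" column from previous_result and returns a dataframe
# with generated_texts / generated_tokens / generated_log_probs columns.
df = echo_generator(
    project_dir="./project",  # hypothetical path
    previous_result=pd.DataFrame({"prompts": ["hello"]}),
    llm="dummy",  # only resolved via generator_models when func.__name__ == "llama_index_llm"
)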

View File

@@ -0,0 +1,97 @@
from typing import List, Tuple
import pandas as pd
from llama_index.core.base.llms.base import BaseLLM
from transformers import AutoTokenizer
from autorag import generator_models
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
result_to_dataframe,
pop_params,
)
class LlamaIndexLLM(BaseGenerator):
def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs):
"""
Initialize the Llama Index LLM module.
:param project_dir: The project directory.
:param llm: The name of a llama index LLM registered in autorag.generator_models.
:param batch: The batch size for llm.
Set it lower if you face errors.
Default is 16.
:param kwargs: The extra parameters for initializing the llm instance.
"""
super().__init__(project_dir=project_dir, llm=llm)
if self.llm not in generator_models.keys():
raise ValueError(
f"{self.llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names from autorag.generator_models."
)
self.batch = batch
llm_class = generator_models[self.llm]
if llm_class.class_name() in [
"HuggingFace_LLM",
"HuggingFaceInferenceAPI",
"TextGenerationInference",
]:
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"A `model` or `model_name` parameter must be provided when using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs))
def __del__(self):
super().__del__()
del self.llm_instance
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result=previous_result)
return self._pure(prompts)
def _pure(
self,
prompts: List[str],
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Llama Index LLM module.
It gets the LLM instance from llama index and returns text generated from the input prompts.
It does not compute real log probs; it returns pseudo log probs,
which are not meant to be used by other modules.
:param prompts: A list of prompts.
:return: A tuple of three elements.
The first element is a list of generated texts.
The second element is a list of the generated texts' token ids, tokenized with the GPT-2 tokenizer.
The third element is a list of the generated texts' pseudo log probs.
"""
tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))
generated_texts = list(map(lambda x: x.text, results))
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
tokenized_ids = tokenizer(generated_texts).data["input_ids"]
pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
return generated_texts, tokenized_ids, pseudo_log_probs
async def astream(self, prompt: str, **kwargs):
async for completion_response in await self.llm_instance.astream_complete(
prompt
):
yield completion_response.text
def stream(self, prompt: str, **kwargs):
for completion_response in self.llm_instance.stream_complete(prompt):
yield completion_response.text
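
A hedged usage sketch of the module above. It assumes "openai" is a registered key in autorag.generator_models and that OPENAI_API_KEY is set in the environment; check generator_models for the actual valid names, and treat the project path as a placeholder.

import pandas as pd

from autorag.nodes.generator import LlamaIndexLLM

previous_result = pd.DataFrame({"prompts": ["Name one benefit of RAG."]})
generator = LlamaIndexLLM(
    project_dir="./project",  # hypothetical path
    llm="openai",             # assumed key in autorag.generator_models
    model="gpt-4o-mini",      # extra kwargs are forwarded to the llama index LLM class
    batch=4,
)
result_df = generator.pure(previous_result)
# result_df columns: generated_texts, generated_tokens (GPT-2 ids), generated_log_probs (pseudo)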

View File

@@ -0,0 +1,296 @@
import logging
from typing import List, Tuple
import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from tiktoken import Encoding
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
pop_params,
result_to_dataframe,
)
logger = logging.getLogger("AutoRAG")
MAX_TOKEN_DICT = { # model name : token limit
"gpt-4.5-preview": 128_000,
"gpt-4.5-preview-2025-02-27": 128_000,
"o1": 200_000,
"o1-preview": 128_000,
"o1-preview-2024-09-12": 128_000,
"o1-mini": 128_000,
"o1-mini-2024-09-12": 128_000,
"o3-mini": 200_000,
"gpt-4o-mini": 128_000,
"gpt-4o-mini-2024-07-18": 128_000,
"gpt-4o": 128_000,
"gpt-4o-2024-08-06": 128_000,
"gpt-4o-2024-05-13": 128_000,
"chatgpt-4o-latest": 128_000,
"gpt-4-turbo": 128_000,
"gpt-4-turbo-2024-04-09": 128_000,
"gpt-4-turbo-preview": 128_000,
"gpt-4-0125-preview": 128_000,
"gpt-4-1106-preview": 128_000,
"gpt-4-vision-preview": 128_000,
"gpt-4-1106-vision-preview": 128_000,
"gpt-4": 8_192,
"gpt-4-0613": 8_192,
"gpt-4-32k": 32_768,
"gpt-4-32k-0613": 32_768,
"gpt-3.5-turbo-0125": 16_385,
"gpt-3.5-turbo": 16_385,
"gpt-3.5-turbo-1106": 16_385,
"gpt-3.5-turbo-instruct": 4_096,
"gpt-3.5-turbo-16k": 16_385,
"gpt-3.5-turbo-0613": 4_096,
"gpt-3.5-turbo-16k-0613": 16_385,
}
class OpenAILLM(BaseGenerator):
def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "batch size must be greater than 0."
self.batch = batch
client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
self.client = AsyncOpenAI(**client_init_params)
if self.llm.startswith("gpt-4.5"):
self.tokenizer = tiktoken.get_encoding("o200k_base")
else:
self.tokenizer = tiktoken.encoding_for_model(self.llm)
max_token_size = MAX_TOKEN_DICT.get(self.llm)
if max_token_size is None:
raise ValueError(
f"Model {self.llm} is not supported. "
f"Please select a model from {list(MAX_TOKEN_DICT.keys())}."
)
self.max_token_size = max_token_size - 7  # reserve a few tokens for chat message overhead
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self,
prompts: List[str],
truncate: bool = True,
**kwargs,
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
OpenAI generator module.
Uses the official openai library to generate answers from the given prompts.
It returns real token ids and log probs, so use this module when you need token ids and log probs.
The model name, batch size, and API key are set when the module is initialized;
the API key can also be provided via the `OPENAI_API_KEY` environment variable.
If you hit API rate limits, lower the batch size.
:param prompts: A list of prompts.
:param truncate: Whether to truncate the input prompts to the model's token limit.
Default is True.
:param kwargs: Optional parameters for the openai chat completions call.
See https://platform.openai.com/docs/api-reference/chat/create for more details.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
# TODO: fix this once tiktoken supports the gpt-4.5 model; it is not supported yet.
if truncate:
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
loop = get_event_loop()
if self.llm.startswith("o1") or self.llm.startswith("o3"):
tasks = [
self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts
]
else:
tasks = [
self.get_result(prompt, **openai_chat_params) for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], result))
token_result = list(map(lambda x: x[1], result))
logprob_result = list(map(lambda x: x[2], result))
return answer_result, token_result, logprob_result
def structured_output(self, prompts: List[str], output_cls, **kwargs):
supported_models = [
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
]
if self.llm not in supported_models:
raise ValueError(
f"{self.llm} is not a valid model name for structured output. "
f"Please select a model from {supported_models}."
)
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
# TODO: fix this once tiktoken supports the gpt-4.5 model; it is not supported yet.
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs)
loop = get_event_loop()
tasks = [
self.get_structured_result(prompt, output_cls, **openai_chat_params)
for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
return result
async def astream(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
stream = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
stream=True,
**openai_chat_params,
)
result = ""
async for chunk in stream:
if chunk.choices[0].delta.content is not None:
result += chunk.choices[0].delta.content
yield result
def stream(self, prompt: str, **kwargs):
raise NotImplementedError("stream method is not implemented yet.")
async def get_structured_result(self, prompt: str, output_cls, **kwargs):
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.beta.chat.completions.parse(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
response_format=output_cls,
logprobs=logprobs,
n=1,
**kwargs,
)
return response.choices[0].message.parsed
async def get_result(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=logprobs,
n=1,
**kwargs,
)
choice = response.choices[0]
answer = choice.message.content
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if self.llm.startswith("gpt-4.5"):
tokens = self.tokenizer.encode(answer, allowed_special="all")
logprobs = [0.5] * len(tokens)
logger.warning("gpt-4.5-preview does not support logprobs yet.")
else:
logprobs = list(map(lambda x: x.logprob, choice.logprobs.content))
tokens = list(
map(
lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0],
choice.logprobs.content,
)
)
assert len(tokens) == len(
logprobs
), "tokens and logprobs size is different."
return answer, tokens, logprobs
async def get_result_o1(self, prompt: str, **kwargs):
assert self.llm.startswith("o1") or self.llm.startswith(
"o3"
), "This method only supports o1 or o3 models."
# The o1 and o3 models only support a temperature of 1 (the default).
# See https://platform.openai.com/docs/guides/reasoning for the beta limitations of these models.
kwargs["temperature"] = 1
kwargs["top_p"] = 1
kwargs["presence_penalty"] = 0
kwargs["frequency_penalty"] = 0
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
**kwargs,
)
answer = response.choices[0].message.content
tokens = self.tokenizer.encode(answer, allowed_special="all")
pseudo_log_probs = [0.5] * len(tokens)
return answer, tokens, pseudo_log_probs
def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int):
tokens = tokenizer.encode(prompt, allowed_special="all")
return tokenizer.decode(tokens[:max_token_size])
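
A hedged usage sketch of the OpenAI module above, including structured output. It assumes OPENAI_API_KEY is set and a reasonably recent tiktoken; the project path, prompt text, and Answer model are illustrative.

import pandas as pd
from pydantic import BaseModel

from autorag.nodes.generator import OpenAILLM

previous_result = pd.DataFrame({"prompts": ["Answer in one sentence: what is RAG?"]})
generator = OpenAILLM(
    project_dir="./project",       # hypothetical path
    llm="gpt-4o-mini-2024-07-18",  # must be a key of MAX_TOKEN_DICT
    batch=8,
)
result_df = generator.pure(previous_result, temperature=0.2)

# structured_output parses each response into a Pydantic model; it is restricted
# to the models listed inside OpenAILLM.structured_output().
class Answer(BaseModel):
    answer: str

parsed = generator.structured_output(
    ["Return a JSON object with an 'answer' field: what is RAG?"], Answer
)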

View File

@@ -0,0 +1,144 @@
import os
import pathlib
from typing import List, Dict, Union
import pandas as pd
from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list
def run_generator_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among the generator node results,
then save the results and summary to the generator node directory.
:param modules: Generator modules to run.
:param module_params: Generator module parameters.
Including node parameters, which are used for every module in this node.
:param previous_result: Previous result dataframe.
Could be prompt maker node's result.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for generator node.
:return: The best result dataframe.
It contains previous result columns and generator node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
node_dir = os.path.join(node_line_dir, "generator") # node name
if not os.path.exists(node_dir):
os.makedirs(node_dir)
qa_data = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
if "generation_gt" not in qa_data.columns:
raise ValueError("You must have 'generation_gt' column in qa.parquet.")
results, execution_times = zip(
*map(
lambda x: measure_speed(
x[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**x[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# get average token usage
token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results))
# make rows to metric_inputs
generation_gt = to_list(qa_data["generation_gt"].tolist())
metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]
metric_names, metric_params = cast_metrics(strategies.get("metrics"))
if metric_names is None or len(metric_names) <= 0:
raise ValueError("You must specify at least one metric for generator evaluation.")
results = list(
map(
lambda result: evaluate_generator_node(
result, metric_inputs, strategies.get("metrics")
),
results,
)
)
# save results to folder
filepaths = list(
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
"average_output_token": token_usages,
**{
metric: list(map(lambda x: x[metric].mean(), results))
for metric in metric_names
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
if strategies.get("token_threshold") is not None:
results, filenames = filter_by_threshold(
results, token_usages, strategies["token_threshold"], filenames
)
selected_result, selected_filename = select_best(
results, metric_names, filenames, strategies.get("strategy", "mean")
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column at summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result
def evaluate_generator_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
):
@evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
def evaluate_generation_module(df: pd.DataFrame):
return (
df["generated_texts"].tolist(),
df["generated_tokens"].tolist(),
df["generated_log_probs"].tolist(),
)
return evaluate_generation_module(result_df)
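
For reference, a hedged sketch of the strategies dict this runner consumes. The metric names are illustrative; check AutoRAG's generation metrics for the exact supported names.

strategies = {
    "metrics": ["bleu", "meteor", "rouge"],  # assumed metric names
    "speed_threshold": 10,    # filter out modules whose average seconds per row exceed this
    "token_threshold": 2048,  # filter out modules whose average output token count exceeds this
    "strategy": "mean",       # how select_best aggregates the metric columns
}
# run_generator_node(modules, module_params, previous_result, node_line_dir, strategies)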

View File

@@ -0,0 +1,121 @@
import gc
from copy import deepcopy
from typing import List, Tuple
import pandas as pd
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils import result_to_dataframe
from autorag.utils.util import pop_params, to_list
class Vllm(BaseGenerator):
def __init__(self, project_dir: str, llm: str, **kwargs):
super().__init__(project_dir, llm, **kwargs)
try:
from vllm import SamplingParams, LLM
except ImportError:
raise ImportError(
"Please install the vllm library. You can install it by running `pip install vllm`."
)
model_from_kwargs = kwargs.pop("model", None)
model = llm if model_from_kwargs is None else model_from_kwargs
input_kwargs = deepcopy(kwargs)
sampling_params_init_params = pop_params(
SamplingParams.from_optional, input_kwargs
)
self.vllm_model = LLM(model, **input_kwargs)
# remove keys from kwargs that are not SamplingParams fields
kwargs_keys = list(kwargs.keys())
for key in kwargs_keys:
if key not in sampling_params_init_params:
kwargs.pop(key)
def __del__(self):
try:
import torch
import contextlib
if torch.cuda.is_available():
from vllm.distributed.parallel_state import (
destroy_model_parallel,
destroy_distributed_environment,
)
destroy_model_parallel()
destroy_distributed_environment()
del self.vllm_model.llm_engine.model_executor
del self.vllm_model
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
except ImportError:
del self.vllm_model
super().__del__()
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Vllm module.
It uses the vLLM instance to generate texts from the input prompts.
You can set the logprobs parameter to control the log probs returned for the generated text.
The default logprobs is 1.
:param prompts: A list of prompts.
:param kwargs: The extra parameters for generating the text.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
try:
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm import SamplingParams
except ImportError:
raise ImportError(
"Please install the vllm library. You can install it by running `pip install vllm`."
)
if "logprobs" not in kwargs:
kwargs["logprobs"] = 1
sampling_params = pop_params(SamplingParams.from_optional, kwargs)
generate_params = SamplingParams(**sampling_params)
results: List[RequestOutput] = self.vllm_model.generate(
prompts, generate_params
)
generated_texts = list(map(lambda x: x.outputs[0].text, results))
generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results))
log_probs: List[SampleLogprobs] = list(
map(lambda x: x.outputs[0].logprobs, results)
)
generated_log_probs = list(
map(
lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))),
zip(log_probs, generated_token_ids),
)
)
return (
to_list(generated_texts),
to_list(generated_token_ids),
to_list(generated_log_probs),
)
async def astream(self, prompt: str, **kwargs):
raise NotImplementedError
def stream(self, prompt: str, **kwargs):
raise NotImplementedError
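
A hedged usage sketch of the local vLLM module. It assumes a GPU machine with vllm installed; the model id and project path are placeholders. Sampling parameters such as temperature and max_tokens are passed at call time and filtered through SamplingParams.

import pandas as pd

from autorag.nodes.generator import Vllm

previous_result = pd.DataFrame({"prompts": ["Summarize RAG in one sentence."]})
generator = Vllm(
    project_dir="./project",                   # hypothetical path
    llm="mistralai/Mistral-7B-Instruct-v0.2",  # placeholder HF model id
)
result_df = generator.pure(previous_result, temperature=0.7, max_tokens=128)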

View File

@@ -0,0 +1,176 @@
import logging
from typing import List, Tuple
import time
import pandas as pd
import requests
from asyncio import to_thread
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
logger = logging.getLogger("AutoRAG")
DEFAULT_MAX_TOKENS = 4096 # Default token limit
class VllmAPI(BaseGenerator):
def __init__(
self,
project_dir,
llm: str,
uri: str,
max_tokens: int = None,
batch: int = 16,
*args,
**kwargs,
):
"""
VLLM API Wrapper for OpenAI-compatible chat/completions format.
:param project_dir: Project directory.
:param llm: Model name (e.g., LLaMA model).
:param uri: VLLM API server URI.
:param max_tokens: Maximum token limit.
Default is 4096.
:param batch: Request batch size.
Default is 16.
"""
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "Batch size must be greater than 0."
self.uri = uri.rstrip("/") # Set API URI
self.batch = batch
# Use the provided max_tokens if available, otherwise use the default
self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
self.max_model_len = self.get_max_model_length()
logger.info(f"{llm} max model length: {self.max_model_len}")
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], truncate: bool = True, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Method to call the VLLM API to generate text.
:param prompts: List of input prompts.
:param truncate: Whether to truncate input prompts to fit within the token limit.
:param kwargs: Additional options (e.g., temperature, top_p).
:return: Generated text, token lists, and log probability lists.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
if truncate:
prompts = list(map(lambda p: self.truncate_by_token(p), prompts))
loop = get_event_loop()
tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts]
results = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], results))
token_result = list(map(lambda x: x[1], results))
logprob_result = list(map(lambda x: x[2], results))
return answer_result, token_result, logprob_result
def truncate_by_token(self, prompt: str) -> str:
"""
Function to truncate prompts to fit within the maximum token limit.
"""
tokens = self.encoding_for_model(prompt)["tokens"]  # tokenize via the vLLM server's /tokenize endpoint
return self.decoding_for_model(tokens[: self.max_model_len])["prompt"]
def call_vllm_api(self, prompt: str, **kwargs) -> dict:
"""
Calls the VLLM API to get chat/completions responses.
:param prompt: Input prompt.
:param kwargs: Additional API options (e.g., temperature, max_tokens).
:return: API response.
"""
payload = {
"model": self.llm,
"messages": [{"role": "user", "content": prompt}],
"temperature": kwargs.get("temperature", 0.4),
"max_tokens": min(
kwargs.get("max_tokens", self.max_token_size), self.max_token_size
),
"logprobs": True,
"n": 1,
}
start_time = time.time() # Record request start time
response = requests.post(f"{self.uri}/v1/chat/completions", json=payload)
end_time = time.time() # Record request end time
response.raise_for_status()
elapsed_time = end_time - start_time # Calculate elapsed time
logger.info(
f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds"
)
return response.json()
# Implementations of the abstract streaming methods
async def astream(self, prompt: str, **kwargs):
"""
Asynchronous streaming method not implemented.
"""
raise NotImplementedError("astream method is not implemented for VLLM API yet.")
def stream(self, prompt: str, **kwargs):
"""
Synchronous streaming method not implemented.
"""
raise NotImplementedError("stream method is not implemented for VLLM API yet.")
def get_result(self, prompt: str, **kwargs):
response = self.call_vllm_api(prompt, **kwargs)
choice = response["choices"][0]
answer = choice["message"]["content"]
# Handle cases where logprobs is None
if choice.get("logprobs") and "content" in choice["logprobs"]:
logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"]))
tokens = list(
map(
lambda x: self.encoding_for_model(x["token"])["tokens"],
choice["logprobs"]["content"],
)
)
else:
logprobs = []
tokens = []
return answer, tokens, logprobs
def encoding_for_model(self, answer_piece: str):
payload = {
"model": self.llm,
"prompt": answer_piece,
"add_special_tokens": True,
}
response = requests.post(f"{self.uri}/tokenize", json=payload)
response.raise_for_status()
return response.json()
def decoding_for_model(self, tokens: list[int]):
payload = {
"model": self.llm,
"tokens": tokens,
}
response = requests.post(f"{self.uri}/detokenize", json=payload)
response.raise_for_status()
return response.json()
def get_max_model_length(self):
response = requests.get(f"{self.uri}/v1/models")
response.raise_for_status()
json_data = response.json()
return json_data["data"][0]["max_model_len"]
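
Finally, a hedged sketch of pointing the API wrapper at a running vLLM OpenAI-compatible server. The URI and model name are placeholders; the server is assumed to expose /v1/chat/completions, /tokenize, /detokenize, and /v1/models as the class relies on.

import pandas as pd

from autorag.nodes.generator import VllmAPI

previous_result = pd.DataFrame({"prompts": ["What is retrieval-augmented generation?"]})
generator = VllmAPI(
    project_dir="./project",                 # hypothetical path
    llm="meta-llama/Llama-3.1-8B-Instruct",  # placeholder: must match the served model
    uri="http://localhost:8000",             # placeholder server address
    max_tokens=1024,
    batch=8,
)
result_df = generator.pure(previous_result, temperature=0.4)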