Fix Dockerfile build issue

2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

View File

@@ -0,0 +1,4 @@
from .llama_index_llm import LlamaIndexLLM
from .openai_llm import OpenAILLM
from .vllm import Vllm
from .vllm_api import VllmAPI

View File

@@ -0,0 +1,103 @@
import abc
import functools
import logging
from pathlib import Path
from typing import Union, Tuple, List
import pandas as pd
from llama_index.core.output_parsers import PydanticOutputParser
from autorag import generator_models
from autorag.schema import BaseModule
from autorag.utils import result_to_dataframe
logger = logging.getLogger("AutoRAG")
class BaseGenerator(BaseModule, metaclass=abc.ABCMeta):
def __init__(self, project_dir: str, llm: str, *args, **kwargs):
logger.info(f"Initialize generator node - {self.__class__.__name__}")
self.llm = llm
def __del__(self):
logger.info(f"Deleting generator module - {self.__class__.__name__}")
def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
logger.info(f"Running generator node - {self.__class__.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
return prompts
def structured_output(self, prompts: List[str], output_cls):
response, _, _ = self._pure(prompts)
parser = PydanticOutputParser(output_cls)
result = []
for res in response:
try:
result.append(parser.parse(res))
except Exception as e:
logger.warning(
f"Error parsing response: {e}\nReturning None for this response instead."
)
result.append(None)
return result
@abc.abstractmethod
async def astream(self, prompt: str, **kwargs):
pass
@abc.abstractmethod
def stream(self, prompt: str, **kwargs):
pass
def generator_node(func):
@functools.wraps(func)
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def wrapper(
project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
This decorator turns a generator function into a generator node.
It automatically extracts prompts from previous_result and runs the generator function.
It also retrieves the llm instance from autorag.generator_models.
:param project_dir: The project directory.
:param previous_result: The previous result that contains prompts.
:param llm: The llm name that you want to use.
:param kwargs: The extra parameters for initializing the llm instance.
:return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs.
Each column is "generated_texts", "generated_tokens", and "generated_log_probs".
"""
logger.info(f"Running generator node - {func.__name__} module...")
assert (
"prompts" in previous_result.columns
), "previous_result must contain prompts column."
prompts = previous_result["prompts"].tolist()
if func.__name__ == "llama_index_llm":
if llm not in generator_models:
raise ValueError(
f"{llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names from autorag.generator_models."
)
batch = kwargs.pop("batch", 16)
if llm == "huggingfacellm":
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"A `model` or `model_name` parameter must be provided when using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
llm_instance = generator_models[llm](**kwargs)
result = func(prompts=prompts, llm=llm_instance, batch=batch)
del llm_instance
return result
else:
return func(prompts=prompts, llm=llm, **kwargs)
return wrapper
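
To make the decorator above concrete, here is a minimal usage sketch of how a function-style generator module would be wrapped by generator_node. The echo_generator function, its dummy outputs, and the project path are illustrative, not part of this commit.

from typing import List, Tuple

import pandas as pd

from autorag.nodes.generator.base import generator_node


@generator_node
def echo_generator(prompts: List[str], llm, **kwargs) -> Tuple[List[str], List[List[int]], List[List[float]]]:
    # A toy module: every function-style generator returns (texts, token ids, log probs).
    texts = [f"echo: {p}" for p in prompts]
    tokens = [[0] * len(t.split()) for t in texts]
    log_probs = [[0.0] * len(tok) for tok in tokens]
    return texts, tokens, log_probs


# The wrapper pulls the "prompts" column from previous_result and returns a dataframe
# with generated_texts / generated_tokens / generated_log_probs columns.
df = echo_generator(
    project_dir="./project",  # hypothetical path
    previous_result=pd.DataFrame({"prompts": ["hello"]}),
    llm="dummy",  # only resolved via generator_models when func.__name__ == "llama_index_llm"
)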

View File

@@ -0,0 +1,97 @@
from typing import List, Tuple
import pandas as pd
from llama_index.core.base.llms.base import BaseLLM
from transformers import AutoTokenizer
from autorag import generator_models
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
result_to_dataframe,
pop_params,
)
class LlamaIndexLLM(BaseGenerator):
def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs):
"""
Initialize the Llama Index LLM module.
:param project_dir: The project directory.
:param llm: The name of a llama index LLM registered in autorag.generator_models.
:param batch: The batch size for llm.
Set it lower if you face errors.
Default is 16.
:param kwargs: The extra parameters for initializing the llm instance.
"""
super().__init__(project_dir=project_dir, llm=llm)
if self.llm not in generator_models.keys():
raise ValueError(
f"{self.llm} is not a valid llm name. Please check the llm name. "
"You can check valid llm names from autorag.generator_models."
)
self.batch = batch
llm_class = generator_models[self.llm]
if llm_class.class_name() in [
"HuggingFace_LLM",
"HuggingFaceInferenceAPI",
"TextGenerationInference",
]:
model_name = kwargs.pop("model", None)
if model_name is not None:
kwargs["model_name"] = model_name
else:
if "model_name" not in kwargs.keys():
raise ValueError(
"A `model` or `model_name` parameter must be provided when using huggingfacellm."
)
kwargs["tokenizer_name"] = kwargs["model_name"]
self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs))
def __del__(self):
super().__del__()
del self.llm_instance
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result=previous_result)
return self._pure(prompts)
def _pure(
self,
prompts: List[str],
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Llama Index LLM module.
It gets the LLM instance from llama index and returns text generated from the input prompts.
It does not compute real log probs; it returns pseudo log probs,
which are not meant to be used by other modules.
:param prompts: A list of prompts.
:return: A tuple of three elements.
The first element is a list of generated texts.
The second element is a list of the generated texts' token ids, tokenized with the GPT-2 tokenizer.
The third element is a list of the generated texts' pseudo log probs.
"""
tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
loop = get_event_loop()
results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))
generated_texts = list(map(lambda x: x.text, results))
tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
tokenized_ids = tokenizer(generated_texts).data["input_ids"]
pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
return generated_texts, tokenized_ids, pseudo_log_probs
async def astream(self, prompt: str, **kwargs):
async for completion_response in await self.llm_instance.astream_complete(
prompt
):
yield completion_response.text
def stream(self, prompt: str, **kwargs):
for completion_response in self.llm_instance.stream_complete(prompt):
yield completion_response.text
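
A hedged usage sketch of the module above. It assumes "openai" is a registered key in autorag.generator_models and that OPENAI_API_KEY is set in the environment; check generator_models for the actual valid names, and treat the project path as a placeholder.

import pandas as pd

from autorag.nodes.generator import LlamaIndexLLM

previous_result = pd.DataFrame({"prompts": ["Name one benefit of RAG."]})
generator = LlamaIndexLLM(
    project_dir="./project",  # hypothetical path
    llm="openai",             # assumed key in autorag.generator_models
    model="gpt-4o-mini",      # extra kwargs are forwarded to the llama index LLM class
    batch=4,
)
result_df = generator.pure(previous_result)
# result_df columns: generated_texts, generated_tokens (GPT-2 ids), generated_log_probs (pseudo)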

View File

@@ -0,0 +1,296 @@
import logging
from typing import List, Tuple
import pandas as pd
import tiktoken
from openai import AsyncOpenAI
from tiktoken import Encoding
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import (
get_event_loop,
process_batch,
pop_params,
result_to_dataframe,
)
logger = logging.getLogger("AutoRAG")
MAX_TOKEN_DICT = { # model name : token limit
"gpt-4.5-preview": 128_000,
"gpt-4.5-preview-2025-02-27": 128_000,
"o1": 200_000,
"o1-preview": 128_000,
"o1-preview-2024-09-12": 128_000,
"o1-mini": 128_000,
"o1-mini-2024-09-12": 128_000,
"o3-mini": 200_000,
"gpt-4o-mini": 128_000,
"gpt-4o-mini-2024-07-18": 128_000,
"gpt-4o": 128_000,
"gpt-4o-2024-08-06": 128_000,
"gpt-4o-2024-05-13": 128_000,
"chatgpt-4o-latest": 128_000,
"gpt-4-turbo": 128_000,
"gpt-4-turbo-2024-04-09": 128_000,
"gpt-4-turbo-preview": 128_000,
"gpt-4-0125-preview": 128_000,
"gpt-4-1106-preview": 128_000,
"gpt-4-vision-preview": 128_000,
"gpt-4-1106-vision-preview": 128_000,
"gpt-4": 8_192,
"gpt-4-0613": 8_192,
"gpt-4-32k": 32_768,
"gpt-4-32k-0613": 32_768,
"gpt-3.5-turbo-0125": 16_385,
"gpt-3.5-turbo": 16_385,
"gpt-3.5-turbo-1106": 16_385,
"gpt-3.5-turbo-instruct": 4_096,
"gpt-3.5-turbo-16k": 16_385,
"gpt-3.5-turbo-0613": 4_096,
"gpt-3.5-turbo-16k-0613": 16_385,
}
class OpenAILLM(BaseGenerator):
def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "batch size must be greater than 0."
self.batch = batch
client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
self.client = AsyncOpenAI(**client_init_params)
if self.llm.startswith("gpt-4.5"):
self.tokenizer = tiktoken.get_encoding("o200k_base")
else:
self.tokenizer = tiktoken.encoding_for_model(self.llm)
max_token_size = MAX_TOKEN_DICT.get(self.llm)
if max_token_size is None:
raise ValueError(
f"Model {self.llm} is not supported. "
f"Please select a model from {list(MAX_TOKEN_DICT.keys())}."
)
self.max_token_size = max_token_size - 7  # reserve a few tokens for chat message overhead
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self,
prompts: List[str],
truncate: bool = True,
**kwargs,
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
OpenAI generator module.
Uses the official openai library to generate answers from the given prompts.
It returns real token ids and log probs, so use this module when you need token ids and log probs.
The model name, batch size, and API key are set when the module is initialized;
the API key can also be provided via the `OPENAI_API_KEY` environment variable.
If you hit API rate limits, lower the batch size.
:param prompts: A list of prompts.
:param truncate: Whether to truncate the input prompts to the model's token limit.
Default is True.
:param kwargs: Optional parameters for the openai chat completions call.
See https://platform.openai.com/docs/api-reference/chat/create for more details.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
# TODO: fix this once tiktoken supports the gpt-4.5 model; it is not supported yet.
if truncate:
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
loop = get_event_loop()
if self.llm.startswith("o1") or self.llm.startswith("o3"):
tasks = [
self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts
]
else:
tasks = [
self.get_result(prompt, **openai_chat_params) for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], result))
token_result = list(map(lambda x: x[1], result))
logprob_result = list(map(lambda x: x[2], result))
return answer_result, token_result, logprob_result
def structured_output(self, prompts: List[str], output_cls, **kwargs):
supported_models = [
"gpt-4o-mini-2024-07-18",
"gpt-4o-2024-08-06",
]
if self.llm not in supported_models:
raise ValueError(
f"{self.llm} is not a valid model name for structured output. "
f"Please select a model from {supported_models}."
)
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
# TODO: fix this once tiktoken supports the gpt-4.5 model; it is not supported yet.
prompts = list(
map(
lambda prompt: truncate_by_token(
prompt, self.tokenizer, self.max_token_size
),
prompts,
)
)
openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs)
loop = get_event_loop()
tasks = [
self.get_structured_result(prompt, output_cls, **openai_chat_params)
for prompt in prompts
]
result = loop.run_until_complete(process_batch(tasks, self.batch))
return result
async def astream(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to False."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size)
openai_chat_params = pop_params(self.client.chat.completions.create, kwargs)
stream = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
stream=True,
**openai_chat_params,
)
result = ""
async for chunk in stream:
if chunk.choices[0].delta.content is not None:
result += chunk.choices[0].delta.content
yield result
def stream(self, prompt: str, **kwargs):
raise NotImplementedError("stream method is not implemented yet.")
async def get_structured_result(self, prompt: str, output_cls, **kwargs):
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.beta.chat.completions.parse(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
response_format=output_cls,
logprobs=logprobs,
n=1,
**kwargs,
)
return response.choices[0].message.parsed
async def get_result(self, prompt: str, **kwargs):
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
logprobs = True
if self.llm.startswith("gpt-4.5"):
logprobs = False
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=logprobs,
n=1,
**kwargs,
)
choice = response.choices[0]
answer = choice.message.content
# TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update.
if self.llm.startswith("gpt-4.5"):
tokens = self.tokenizer.encode(answer, allowed_special="all")
logprobs = [0.5] * len(tokens)
logger.warning("gpt-4.5-preview does not support logprobs yet.")
else:
logprobs = list(map(lambda x: x.logprob, choice.logprobs.content))
tokens = list(
map(
lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0],
choice.logprobs.content,
)
)
assert len(tokens) == len(
logprobs
), "tokens and logprobs size is different."
return answer, tokens, logprobs
async def get_result_o1(self, prompt: str, **kwargs):
assert self.llm.startswith("o1") or self.llm.startswith(
"o3"
), "This method only supports o1 or o3 models."
# The o1 and o3 models only support a temperature of 1 (the default).
# See https://platform.openai.com/docs/guides/reasoning for the beta limitations of these models.
kwargs["temperature"] = 1
kwargs["top_p"] = 1
kwargs["presence_penalty"] = 0
kwargs["frequency_penalty"] = 0
response = await self.client.chat.completions.create(
model=self.llm,
messages=[
{"role": "user", "content": prompt},
],
logprobs=False,
n=1,
**kwargs,
)
answer = response.choices[0].message.content
tokens = self.tokenizer.encode(answer, allowed_special="all")
pseudo_log_probs = [0.5] * len(tokens)
return answer, tokens, pseudo_log_probs
def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int):
tokens = tokenizer.encode(prompt, allowed_special="all")
return tokenizer.decode(tokens[:max_token_size])
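
A hedged usage sketch of the OpenAI module above, including structured output. It assumes OPENAI_API_KEY is set and a reasonably recent tiktoken; the project path, prompt text, and Answer model are illustrative.

import pandas as pd
from pydantic import BaseModel

from autorag.nodes.generator import OpenAILLM

previous_result = pd.DataFrame({"prompts": ["Answer in one sentence: what is RAG?"]})
generator = OpenAILLM(
    project_dir="./project",       # hypothetical path
    llm="gpt-4o-mini-2024-07-18",  # must be a key of MAX_TOKEN_DICT
    batch=8,
)
result_df = generator.pure(previous_result, temperature=0.2)

# structured_output parses each response into a Pydantic model; it is restricted
# to the models listed inside OpenAILLM.structured_output().
class Answer(BaseModel):
    answer: str

parsed = generator.structured_output(
    ["Return a JSON object with an 'answer' field: what is RAG?"], Answer
)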

View File

@@ -0,0 +1,144 @@
import os
import pathlib
from typing import List, Dict, Union
import pandas as pd
from autorag.evaluation import evaluate_generation
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import to_list
def run_generator_node(
modules: List,
module_params: List[Dict],
previous_result: pd.DataFrame,
node_line_dir: str,
strategies: Dict,
) -> pd.DataFrame:
"""
Run evaluation and select the best module among the generator node results,
then save the results and summary to the generator node directory.
:param modules: Generator modules to run.
:param module_params: Generator module parameters.
Including node parameters, which are used for every module in this node.
:param previous_result: Previous result dataframe.
Could be prompt maker node's result.
:param node_line_dir: This node line's directory.
:param strategies: Strategies for generator node.
:return: The best result dataframe.
It contains previous result columns and generator node's result columns.
"""
if not os.path.exists(node_line_dir):
os.makedirs(node_line_dir)
project_dir = pathlib.PurePath(node_line_dir).parent.parent
node_dir = os.path.join(node_line_dir, "generator") # node name
if not os.path.exists(node_dir):
os.makedirs(node_dir)
qa_data = pd.read_parquet(
os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
)
if "generation_gt" not in qa_data.columns:
raise ValueError("You must have 'generation_gt' column in qa.parquet.")
results, execution_times = zip(
*map(
lambda x: measure_speed(
x[0].run_evaluator,
project_dir=project_dir,
previous_result=previous_result,
**x[1],
),
zip(modules, module_params),
)
)
average_times = list(map(lambda x: x / len(results[0]), execution_times))
# get average token usage
token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results))
# make rows to metric_inputs
generation_gt = to_list(qa_data["generation_gt"].tolist())
metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt]
metric_names, metric_params = cast_metrics(strategies.get("metrics"))
if metric_names is None or len(metric_names) <= 0:
raise ValueError("You must specify at least one metric for generator evaluation.")
results = list(
map(
lambda result: evaluate_generator_node(
result, metric_inputs, strategies.get("metrics")
),
results,
)
)
# save results to folder
filepaths = list(
map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules)))
)
list(
map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
) # execute save to parquet
filenames = list(map(lambda x: os.path.basename(x), filepaths))
summary_df = pd.DataFrame(
{
"filename": filenames,
"module_name": list(map(lambda module: module.__name__, modules)),
"module_params": module_params,
"execution_time": average_times,
"average_output_token": token_usages,
**{
metric: list(map(lambda x: x[metric].mean(), results))
for metric in metric_names
},
}
)
# filter by strategies
if strategies.get("speed_threshold") is not None:
results, filenames = filter_by_threshold(
results, average_times, strategies["speed_threshold"], filenames
)
if strategies.get("token_threshold") is not None:
results, filenames = filter_by_threshold(
results, token_usages, strategies["token_threshold"], filenames
)
selected_result, selected_filename = select_best(
results, metric_names, filenames, strategies.get("strategy", "mean")
)
best_result = pd.concat([previous_result, selected_result], axis=1)
# add 'is_best' column at summary file
summary_df["is_best"] = summary_df["filename"] == selected_filename
# save files
summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False)
best_result.to_parquet(
os.path.join(
node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
),
index=False,
)
return best_result
def evaluate_generator_node(
result_df: pd.DataFrame,
metric_inputs: List[MetricInput],
metrics: Union[List[str], List[Dict]],
):
@evaluate_generation(metric_inputs=metric_inputs, metrics=metrics)
def evaluate_generation_module(df: pd.DataFrame):
return (
df["generated_texts"].tolist(),
df["generated_tokens"].tolist(),
df["generated_log_probs"].tolist(),
)
return evaluate_generation_module(result_df)
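
For reference, a hedged sketch of the strategies dict this runner consumes. The metric names are illustrative; check AutoRAG's generation metrics for the exact supported names.

strategies = {
    "metrics": ["bleu", "meteor", "rouge"],  # assumed metric names
    "speed_threshold": 10,    # filter out modules whose average seconds per row exceed this
    "token_threshold": 2048,  # filter out modules whose average output token count exceeds this
    "strategy": "mean",       # how select_best aggregates the metric columns
}
# run_generator_node(modules, module_params, previous_result, node_line_dir, strategies)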

View File

@@ -0,0 +1,121 @@
import gc
from copy import deepcopy
from typing import List, Tuple
import pandas as pd
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils import result_to_dataframe
from autorag.utils.util import pop_params, to_list
class Vllm(BaseGenerator):
def __init__(self, project_dir: str, llm: str, **kwargs):
super().__init__(project_dir, llm, **kwargs)
try:
from vllm import SamplingParams, LLM
except ImportError:
raise ImportError(
"Please install the vllm library. You can install it by running `pip install vllm`."
)
model_from_kwargs = kwargs.pop("model", None)
model = llm if model_from_kwargs is None else model_from_kwargs
input_kwargs = deepcopy(kwargs)
sampling_params_init_params = pop_params(
SamplingParams.from_optional, input_kwargs
)
self.vllm_model = LLM(model, **input_kwargs)
# remove keys from kwargs that are not SamplingParams fields
kwargs_keys = list(kwargs.keys())
for key in kwargs_keys:
if key not in sampling_params_init_params:
kwargs.pop(key)
def __del__(self):
try:
import torch
import contextlib
if torch.cuda.is_available():
from vllm.distributed.parallel_state import (
destroy_model_parallel,
destroy_distributed_environment,
)
destroy_model_parallel()
destroy_distributed_environment()
del self.vllm_model.llm_engine.model_executor
del self.vllm_model
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
except ImportError:
del self.vllm_model
super().__del__()
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Vllm module.
It uses the vLLM instance to generate texts from the input prompts.
You can set the logprobs parameter to control the log probs returned for the generated text.
The default logprobs is 1.
:param prompts: A list of prompts.
:param kwargs: The extra parameters for generating the text.
:return: A tuple of three elements.
The first element is a list of generated text.
The second element is a list of generated text's token ids.
The third element is a list of generated text's log probs.
"""
try:
from vllm.outputs import RequestOutput
from vllm.sequence import SampleLogprobs
from vllm import SamplingParams
except ImportError:
raise ImportError(
"Please install the vllm library. You can install it by running `pip install vllm`."
)
if "logprobs" not in kwargs:
kwargs["logprobs"] = 1
sampling_params = pop_params(SamplingParams.from_optional, kwargs)
generate_params = SamplingParams(**sampling_params)
results: List[RequestOutput] = self.vllm_model.generate(
prompts, generate_params
)
generated_texts = list(map(lambda x: x.outputs[0].text, results))
generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results))
log_probs: List[SampleLogprobs] = list(
map(lambda x: x.outputs[0].logprobs, results)
)
generated_log_probs = list(
map(
lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))),
zip(log_probs, generated_token_ids),
)
)
return (
to_list(generated_texts),
to_list(generated_token_ids),
to_list(generated_log_probs),
)
async def astream(self, prompt: str, **kwargs):
raise NotImplementedError
def stream(self, prompt: str, **kwargs):
raise NotImplementedError
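
A hedged usage sketch of the local vLLM module. It assumes a GPU machine with vllm installed; the model id and project path are placeholders. Sampling parameters such as temperature and max_tokens are passed at call time and filtered through SamplingParams.

import pandas as pd

from autorag.nodes.generator import Vllm

previous_result = pd.DataFrame({"prompts": ["Summarize RAG in one sentence."]})
generator = Vllm(
    project_dir="./project",                   # hypothetical path
    llm="mistralai/Mistral-7B-Instruct-v0.2",  # placeholder HF model id
)
result_df = generator.pure(previous_result, temperature=0.7, max_tokens=128)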

View File

@@ -0,0 +1,176 @@
import logging
from typing import List, Tuple
import time
import pandas as pd
import requests
from asyncio import to_thread
from autorag.nodes.generator.base import BaseGenerator
from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe
logger = logging.getLogger("AutoRAG")
DEFAULT_MAX_TOKENS = 4096 # Default token limit
class VllmAPI(BaseGenerator):
def __init__(
self,
project_dir,
llm: str,
uri: str,
max_tokens: int = None,
batch: int = 16,
*args,
**kwargs,
):
"""
VLLM API Wrapper for OpenAI-compatible chat/completions format.
:param project_dir: Project directory.
:param llm: Model name (e.g., LLaMA model).
:param uri: VLLM API server URI.
:param max_tokens: Maximum token limit.
Default is 4096.
:param batch: Request batch size.
Default is 16.
"""
super().__init__(project_dir, llm, *args, **kwargs)
assert batch > 0, "Batch size must be greater than 0."
self.uri = uri.rstrip("/") # Set API URI
self.batch = batch
# Use the provided max_tokens if available, otherwise use the default
self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
self.max_model_len = self.get_max_model_length()
logger.info(f"{llm} max model length: {self.max_model_len}")
@result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"])
def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
prompts = self.cast_to_run(previous_result)
return self._pure(prompts, **kwargs)
def _pure(
self, prompts: List[str], truncate: bool = True, **kwargs
) -> Tuple[List[str], List[List[int]], List[List[float]]]:
"""
Method to call the VLLM API to generate text.
:param prompts: List of input prompts.
:param truncate: Whether to truncate input prompts to fit within the token limit.
:param kwargs: Additional options (e.g., temperature, top_p).
:return: Generated text, token lists, and log probability lists.
"""
if kwargs.get("logprobs") is not None:
kwargs.pop("logprobs")
logger.warning(
"The logprobs parameter has no effect; it is always set to True."
)
if kwargs.get("n") is not None:
kwargs.pop("n")
logger.warning("The n parameter has no effect; it is always set to 1.")
if truncate:
prompts = list(map(lambda p: self.truncate_by_token(p), prompts))
loop = get_event_loop()
tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts]
results = loop.run_until_complete(process_batch(tasks, self.batch))
answer_result = list(map(lambda x: x[0], results))
token_result = list(map(lambda x: x[1], results))
logprob_result = list(map(lambda x: x[2], results))
return answer_result, token_result, logprob_result
def truncate_by_token(self, prompt: str) -> str:
"""
Function to truncate prompts to fit within the maximum token limit.
"""
tokens = self.encoding_for_model(prompt)["tokens"]  # tokenize via the vLLM server's /tokenize endpoint
return self.decoding_for_model(tokens[: self.max_model_len])["prompt"]
def call_vllm_api(self, prompt: str, **kwargs) -> dict:
"""
Calls the VLLM API to get chat/completions responses.
:param prompt: Input prompt.
:param kwargs: Additional API options (e.g., temperature, max_tokens).
:return: API response.
"""
payload = {
"model": self.llm,
"messages": [{"role": "user", "content": prompt}],
"temperature": kwargs.get("temperature", 0.4),
"max_tokens": min(
kwargs.get("max_tokens", self.max_token_size), self.max_token_size
),
"logprobs": True,
"n": 1,
}
start_time = time.time() # Record request start time
response = requests.post(f"{self.uri}/v1/chat/completions", json=payload)
end_time = time.time() # Record request end time
response.raise_for_status()
elapsed_time = end_time - start_time # Calculate elapsed time
logger.info(
f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds"
)
return response.json()
# Implementations of the abstract streaming methods
async def astream(self, prompt: str, **kwargs):
"""
Asynchronous streaming method not implemented.
"""
raise NotImplementedError("astream method is not implemented for VLLM API yet.")
def stream(self, prompt: str, **kwargs):
"""
Synchronous streaming method not implemented.
"""
raise NotImplementedError("stream method is not implemented for VLLM API yet.")
def get_result(self, prompt: str, **kwargs):
response = self.call_vllm_api(prompt, **kwargs)
choice = response["choices"][0]
answer = choice["message"]["content"]
# Handle cases where logprobs is None
if choice.get("logprobs") and "content" in choice["logprobs"]:
logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"]))
tokens = list(
map(
lambda x: self.encoding_for_model(x["token"])["tokens"],
choice["logprobs"]["content"],
)
)
else:
logprobs = []
tokens = []
return answer, tokens, logprobs
def encoding_for_model(self, answer_piece: str):
payload = {
"model": self.llm,
"prompt": answer_piece,
"add_special_tokens": True,
}
response = requests.post(f"{self.uri}/tokenize", json=payload)
response.raise_for_status()
return response.json()
def decoding_for_model(self, tokens: list[int]):
payload = {
"model": self.llm,
"tokens": tokens,
}
response = requests.post(f"{self.uri}/detokenize", json=payload)
response.raise_for_status()
return response.json()
def get_max_model_length(self):
response = requests.get(f"{self.uri}/v1/models")
response.raise_for_status()
json_data = response.json()
return json_data["data"][0]["max_model_len"]
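
Finally, a hedged sketch of pointing the API wrapper at a running vLLM OpenAI-compatible server. The URI and model name are placeholders; the server is assumed to expose /v1/chat/completions, /tokenize, /detokenize, and /v1/models as the class relies on.

import pandas as pd

from autorag.nodes.generator import VllmAPI

previous_result = pd.DataFrame({"prompts": ["What is retrieval-augmented generation?"]})
generator = VllmAPI(
    project_dir="./project",                 # hypothetical path
    llm="meta-llama/Llama-3.1-8B-Instruct",  # placeholder: must match the served model
    uri="http://localhost:8000",             # placeholder server address
    max_tokens=1024,
    batch=8,
)
result_df = generator.pure(previous_result, temperature=0.4)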