Fix Dockerfile build issue

kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions


@@ -0,0 +1,6 @@
from .base import make_single_content_qa, make_qa_with_existing_qa
from .llama_index import (
generate_qa_llama_index,
generate_answers,
generate_qa_llama_index_by_ratio,
)


@@ -0,0 +1,239 @@
import logging
import uuid
from typing import Callable, Optional, List
import chromadb
import numpy as np
import pandas as pd
from tqdm import tqdm
import autorag
from autorag.nodes.retrieval.vectordb import vectordb_ingest, vectordb_pure
from autorag.utils.util import (
save_parquet_safe,
fetch_contents,
get_event_loop,
process_batch,
)
logger = logging.getLogger("AutoRAG")
def make_single_content_qa(
corpus_df: pd.DataFrame,
content_size: int,
qa_creation_func: Callable,
output_filepath: Optional[str] = None,
upsert: bool = False,
random_state: int = 42,
cache_batch: int = 32,
**kwargs,
) -> pd.DataFrame:
"""
Make a single-content (single-hop, single-document) QA dataset using the given qa_creation_func.
It generates a single-content QA dataset, which means each question has exactly one retrieval ground truth passage.
It is the most basic form of QA dataset.
:param corpus_df: The corpus dataframe to make QA dataset from.
:param content_size: This function will generate QA dataset for the given number of contents.
:param qa_creation_func: The function to create QA pairs.
You can use functions like `generate_qa_llama_index` or `generate_qa_llama_index_by_ratio`.
The function must take a `contents` parameter that receives the list of content strings.
:param output_filepath: Optional filepath to save the parquet file.
If None, the function returns the processed data as a pd.DataFrame but does not save it as a parquet file.
File directory must exist. File extension must be .parquet
:param upsert: If true, the function will overwrite the existing file if it exists.
Default is False.
:param random_state: The random state for sampling corpus from the given corpus_df.
:param cache_batch: The batch size to use when generating and caching the QA dataset.
Each time cache_batch rows are generated, the dataset is saved to the designated output_filepath.
If cache_batch is too small, the process will take longer.
:param kwargs: The keyword arguments for qa_creation_func.
:return: QA dataset dataframe.
You can save this as a parquet file to use with AutoRAG.
"""
assert content_size > 0, "content_size must be greater than 0."
if content_size > len(corpus_df):
logger.warning(
f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
"Setting content_size to the corpus size."
)
content_size = len(corpus_df)
sampled_corpus = corpus_df.sample(n=content_size, random_state=random_state)
sampled_corpus = sampled_corpus.reset_index(drop=True)
def make_query_generation_gt(row):
return row["qa"]["query"], row["qa"]["generation_gt"]
qa_data = pd.DataFrame()
for idx, i in tqdm(enumerate(range(0, len(sampled_corpus), cache_batch))):
qa = qa_creation_func(
contents=sampled_corpus["contents"].tolist()[i : i + cache_batch], **kwargs
)
temp_qa_data = pd.DataFrame(
{
"qa": qa,
"retrieval_gt": sampled_corpus["doc_id"].tolist()[i : i + cache_batch],
}
)
temp_qa_data = temp_qa_data.explode("qa", ignore_index=True)
temp_qa_data["qid"] = [str(uuid.uuid4()) for _ in range(len(temp_qa_data))]
temp_qa_data[["query", "generation_gt"]] = temp_qa_data.apply(
make_query_generation_gt, axis=1, result_type="expand"
)
temp_qa_data = temp_qa_data.drop(columns=["qa"])
temp_qa_data["retrieval_gt"] = temp_qa_data["retrieval_gt"].apply(
lambda x: [[x]]
)
temp_qa_data["generation_gt"] = temp_qa_data["generation_gt"].apply(
lambda x: [x]
)
if idx == 0:
qa_data = temp_qa_data
else:
qa_data = pd.concat([qa_data, temp_qa_data], ignore_index=True)
if output_filepath is not None:
save_parquet_safe(qa_data, output_filepath, upsert=upsert)
return qa_data
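# A minimal usage sketch of make_single_content_qa (assumes the package is importable as
# autorag.data.qacreation, an OpenAI LLM from llama_index, and hypothetical file paths;
# extra keyword arguments are forwarded to qa_creation_func):
import pandas as pd
from llama_index.llms.openai import OpenAI  # assumed llama_index OpenAI LLM
from autorag.data.qacreation import make_single_content_qa, generate_qa_llama_index  # assumed package path

corpus_df = pd.read_parquet("corpus.parquet")  # hypothetical corpus path
llm = OpenAI(model="gpt-4o-mini", temperature=1.0)  # hypothetical model choice
qa_df = make_single_content_qa(
    corpus_df,
    content_size=50,
    qa_creation_func=generate_qa_llama_index,
    output_filepath="qa.parquet",  # hypothetical output path
    llm=llm,  # forwarded to generate_qa_llama_index through **kwargs
    question_num_per_content=1,
)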
def make_qa_with_existing_qa(
corpus_df: pd.DataFrame,
existing_query_df: pd.DataFrame,
content_size: int,
answer_creation_func: Optional[Callable] = None,
exist_gen_gt: Optional[bool] = False,
output_filepath: Optional[str] = None,
embedding_model: str = "openai_embed_3_large",
collection: Optional[chromadb.Collection] = None,
upsert: bool = False,
random_state: int = 42,
cache_batch: int = 32,
top_k: int = 3,
**kwargs,
) -> pd.DataFrame:
"""
Make a single-hop QA dataset from existing queries using the given answer_creation_func.
:param corpus_df: The corpus dataframe to make QA dataset from.
:param existing_query_df: Dataframe containing existing queries to use for QA pair creation.
:param content_size: This function will generate QA dataset for the given number of contents.
:param answer_creation_func: Optional function to create answer with input query.
If exist_gen_gt is False, this function must be given.
:param exist_gen_gt: Optional boolean to use existing generation_gt.
If True, the existing_query_df must have 'generation_gt' column.
If False, the answer_creation_func must be given.
:param output_filepath: Optional filepath to save the parquet file.
:param embedding_model: The embedding model to use for vectorization.
You can add your own embedding model in the autorag.embedding_models.
Please refer to how to add an embedding model in this doc: https://docs.auto-rag.com/local_model.html
The default is 'openai_embed_3_large'.
:param collection: The chromadb collection to use for vector DB.
You can make any chromadb collection and use it here.
If you already ingested the corpus_df to the collection, the embedding process will not be repeated.
The default is None. If None, it makes a temporary collection.
:param upsert: If true, the function will overwrite the existing file if it exists.
:param random_state: The random state for sampling corpus from the given corpus_df.
:param cache_batch: The batch size to use when generating and caching the QA dataset.
:param top_k: The number of passages to retrieve for each query; the retrieved passages become
the retrieval ground truth and the evidence for answer generation. Default is 3.
:param kwargs: The keyword arguments for answer_creation_func.
:return: QA dataset dataframe.
"""
raise DeprecationWarning("This function is deprecated.")
assert (
"query" in existing_query_df.columns
), "existing_query_df must have 'query' column."
if exist_gen_gt:
assert (
"generation_gt" in existing_query_df.columns
), "existing_query_df must have 'generation_gt' column."
else:
assert (
answer_creation_func is not None
), "answer_creation_func must be given when exist_gen_gt is False."
assert content_size > 0, "content_size must be greater than 0."
if content_size > len(corpus_df):
logger.warning(
f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
"Setting content_size to the corpus size."
)
content_size = len(corpus_df)
logger.info("Loading local embedding model...")
embeddings = autorag.embedding_models[embedding_model]()
# Vector DB creation
if collection is None:
chroma_client = chromadb.Client()
collection_name = "auto-rag"
collection = chroma_client.get_or_create_collection(collection_name)
# embed corpus_df
vectordb_ingest(collection, corpus_df, embeddings)
query_embeddings = embeddings.get_text_embedding_batch(
existing_query_df["query"].tolist()
)
loop = get_event_loop()
tasks = [
vectordb_pure([query_embedding], top_k, collection)
for query_embedding in query_embeddings
]
results = loop.run_until_complete(process_batch(tasks, batch_size=cache_batch))
retrieved_ids = list(map(lambda x: x[0], results))
retrieved_contents: List[List[str]] = fetch_contents(corpus_df, retrieved_ids)
input_passage_strs: List[str] = list(
map(
lambda x: "\n".join(
[f"Document {i + 1}\n{content}" for i, content in enumerate(x)]
),
retrieved_contents,
)
)
retrieved_qa_df = pd.DataFrame(
{
"qid": [str(uuid.uuid4()) for _ in range(len(existing_query_df))],
"query": existing_query_df["query"].tolist(),
"retrieval_gt": list(map(lambda x: [x], retrieved_ids)),
"input_passage_str": input_passage_strs,
}
)
if exist_gen_gt:
generation_gt = existing_query_df["generation_gt"].tolist()
if isinstance(generation_gt[0], np.ndarray):
retrieved_qa_df["generation_gt"] = generation_gt
else:
raise ValueError(
"In existing_query_df, generation_gt (per query) must be in the form of List[str]."
)
sample_qa_df = retrieved_qa_df.sample(
n=min(content_size, len(retrieved_qa_df)), random_state=random_state
)
qa_df = sample_qa_df.copy(deep=True)
qa_df.drop(columns=["input_passage_str"], inplace=True)
if not exist_gen_gt:
generation_gt = answer_creation_func(
contents=sample_qa_df["input_passage_str"].tolist(),
queries=sample_qa_df["query"].tolist(),
batch=cache_batch,
**kwargs,
)
qa_df["generation_gt"] = generation_gt
if output_filepath is not None:
save_parquet_safe(qa_df, output_filepath, upsert=upsert)
return qa_df


@@ -0,0 +1,253 @@
import os.path
import random
from typing import Optional, List, Dict, Any
import pandas as pd
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core.llms import LLM
from autorag.utils.util import process_batch, get_event_loop
package_dir = os.path.dirname(os.path.realpath(__file__))
def generate_qa_llama_index(
llm: LLM,
contents: List[str],
prompt: Optional[str] = None,
question_num_per_content: int = 1,
max_retries: int = 3,
batch: int = 4,
) -> List[List[Dict]]:
"""
Generate a qa set from the list of contents.
It uses a single prompt for all contents.
If you want to use more than one prompt for generating qa,
you can consider using generate_qa_llama_index_by_ratio.
:param llm: Llama index model
:param contents: List of content strings.
:param prompt: The prompt to use for the qa generation.
The prompt must include the following placeholders:
- {{text}}: The content string
- {{num_questions}}: The number of questions to generate
By default, the built-in default prompt is used.
:param question_num_per_content: Number of questions to generate for each content.
Default is 1.
:param max_retries: The maximum number of retries when the number of generated questions does not match the target number.
Default is 3.
:param batch: The batch size to process asynchronously.
Default is 4.
:return: 2-d list of dictionaries containing the query and generation_gt.
"""
# load default prompt
if prompt is None:
prompt = open(
os.path.join(package_dir, "llama_index_default_prompt.txt"), "r"
).read()
tasks = [
async_qa_gen_llama_index(
content, llm, prompt, question_num_per_content, max_retries
)
for content in contents
]
loops = get_event_loop()
results = loops.run_until_complete(process_batch(tasks, batch))
return results
def generate_answers(
llm: LLM,
contents: List[str],
queries: List[str],
batch: int = 4,
) -> List[str]:
"""
Generate qa sets from the list of contents using existing queries.
:param llm: Llama index model
:param contents: List of content strings.
:param queries: List of existing queries.
:param batch: The batch size to process asynchronously.
:return: List of generated answer strings, one for each query.
"""
tasks = [
generate_basic_answer(llm, content, query)
for content, query in zip(contents, queries)
]
loops = get_event_loop()
results = loops.run_until_complete(process_batch(tasks, batch))
return results
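# A short sketch of calling generate_answers directly (assumes an OpenAI LLM from llama_index;
# passages and queries are paired one-to-one and answers come back in the same order):
from llama_index.llms.openai import OpenAI  # assumed llama_index OpenAI LLM

llm = OpenAI(model="gpt-4o-mini")  # hypothetical model choice
passages = ["Seoul is the capital of South Korea."]
queries = ["What is the capital of South Korea?"]
answers = generate_answers(llm, contents=passages, queries=queries, batch=4)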
def generate_qa_llama_index_by_ratio(
llm: LLM,
contents: List[str],
prompts_ratio: Dict,
question_num_per_content: int = 1,
max_retries: int = 3,
random_state: int = 42,
batch: int = 4,
) -> List[List[Dict]]:
"""
Generate a qa set from the list of contents.
You can set the ratio of prompts that you want to use for generating qa.
It randomly distributes the contents across the prompts according to the given ratio.
:param llm: Llama index model
:param contents: List of content strings.
:param prompts_ratio: Dictionary of prompt paths and their ratios.
Example: {"prompt/prompt1.txt": 0.5, "prompt/prompt2.txt": 0.5}
The values do not have to sum to 1.
Each path must be an absolute path to an existing text file containing a proper prompt.
Each prompt must contain the following placeholders:
- {{text}}: The content string
- {{num_questions}}: The number of questions to generate
:param question_num_per_content: Number of questions to generate for each content.
Default is 1.
:param max_retries: The maximum number of retries when the number of generated questions does not match the target number.
Default is 3.
:param random_state: Random seed
Default is 42.
:param batch: The batch size to process asynchronously.
Default is 4.
:return: 2-d list of dictionaries containing the query and generation_gt.
"""
prompts = list(map(lambda path: open(path, "r").read(), prompts_ratio.keys()))
assert all([validate_llama_index_prompt(prompt) for prompt in prompts])
content_indices = list(range(len(contents)))
random.seed(random_state)
random.shuffle(content_indices)
slice_content_indices: List[List[int]] = distribute_list_by_ratio(
content_indices, list(prompts_ratio.values())
)
temp_df = pd.DataFrame({"idx": slice_content_indices, "prompt": prompts})
temp_df = temp_df.explode("idx", ignore_index=True)
temp_df = temp_df.sort_values(by="idx", ascending=True)
final_df = pd.DataFrame({"content": contents, "prompt": temp_df["prompt"].tolist()})
tasks = [
async_qa_gen_llama_index(
content, llm, prompt, question_num_per_content, max_retries
)
for content, prompt in zip(
final_df["content"].tolist(), final_df["prompt"].tolist()
)
]
loops = get_event_loop()
results = loops.run_until_complete(process_batch(tasks, batch))
return results
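# A usage sketch of generate_qa_llama_index_by_ratio (the prompt file paths are hypothetical;
# each must be an absolute path to a file containing the {{text}} and {{num_questions}} placeholders):
from llama_index.llms.openai import OpenAI  # assumed llama_index OpenAI LLM

llm = OpenAI(model="gpt-4o-mini")  # hypothetical model choice
contents = ["First passage text ...", "Second passage text ..."]
prompts_ratio = {
    "/abs/path/factoid_prompt.txt": 0.7,    # hypothetical prompt file
    "/abs/path/reasoning_prompt.txt": 0.3,  # hypothetical prompt file
}
qa = generate_qa_llama_index_by_ratio(
    llm, contents, prompts_ratio, question_num_per_content=2, random_state=42, batch=4
)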
async def async_qa_gen_llama_index(
content: str,
llm: LLM,
prompt: str,
question_num: int = 1,
max_retries: int = 3,
):
"""
Generate a QA set from the given content using the llama index model.
:param content: Content string
:param llm: Llama index model
:param prompt: The prompt to use for the qa generation.
The prompt must include the following placeholders:
- {{text}}: The content string
- {{num_questions}}: The number of questions to generate
:param question_num: The number of questions to generate
:param max_retries: Maximum number of retries when the number of generated questions does not match the target number
:return: List of dictionaries containing the query and generation_gt
"""
validate_llama_index_prompt(prompt)
async def generate(content: str, llm: LLM):
for _ in range(max_retries):
output = await llm.acomplete(
prompt.replace("{{text}}", content).replace(
"{{num_questions}}", str(question_num)
)
)
result = parse_output(output.text)
if len(result) == question_num:
return result
raise InterruptedError(
f"Failed to generate output of length {question_num} after {max_retries} retries."
)
return await generate(content, llm)
async def generate_basic_answer(llm: LLM, passage_str: str, query: str) -> str:
basic_answer_system_prompt = """You are an AI assistant that answers the given question using the provided evidence text.
You have to find the evidence about the question in the given text and write a proper answer to the question.
You have to preserve the question's language in the answer.
For example, if the input question is Korean, the output answer must be in Korean.
"""
user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"
response = await llm.achat(
messages=[
ChatMessage(role=MessageRole.SYSTEM, content=basic_answer_system_prompt),
ChatMessage(role=MessageRole.USER, content=user_prompt),
],
temperature=1.0,
)
return response.message.content
def validate_llama_index_prompt(prompt: str) -> bool:
"""
Validate the prompt for the llama index model.
The prompt must include the following placeholders:
- {{text}}: The content string
- {{num_questions}}: The number of questions to generate
"""
if "{{text}}" not in prompt:
raise ValueError("The prompt must include the placeholder {{text}}.")
if "{{num_questions}}" not in prompt:
raise ValueError("The prompt must include the placeholder {{num_questions}}.")
return True
def parse_output(result: str) -> List[Dict]:
result = result.strip()
result = result.split("[Q]:")
final_result = list()
for res in result:
res = res.strip()
if res and "\n[A]:" in res:
qa = res.split("\n[A]:")
final_result.append(
{"query": qa[0].strip(), "generation_gt": qa[1].strip()}
)
return final_result
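# A small worked example of the [Q]:/[A]: format that parse_output expects:
raw = """[Q]: What is the capital of France?
[A]: The capital of France is Paris.
[Q]: Which river runs through Paris?
[A]: The Seine runs through Paris."""
parsed = parse_output(raw)
# parsed == [
#     {"query": "What is the capital of France?", "generation_gt": "The capital of France is Paris."},
#     {"query": "Which river runs through Paris?", "generation_gt": "The Seine runs through Paris."},
# ]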
def distribute_list_by_ratio(input_list, ratio) -> List[List[Any]]:
total_ratio = sum(ratio)
total_length = len(input_list)
# Calculate the length of each slice
slice_lengths = [int((r / total_ratio) * total_length) for r in ratio]
# Adjust the last slice in case of rounding issues
slice_lengths[-1] = total_length - sum(slice_lengths[:-1])
slices = []
start = 0
for length in slice_lengths:
end = start + length
slices.append(input_list[start:end])
start = end
return slices
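# A quick check of how distribute_list_by_ratio splits a shuffled index list:
indices = list(range(10))
distribute_list_by_ratio(indices, [0.5, 0.5])
# -> [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
distribute_list_by_ratio(indices, [3, 1])
# int((3 / 4) * 10) == 7 items go to the first slice; the last slice absorbs the remaining 3:
# -> [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]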


@@ -0,0 +1,54 @@
You're an AI tasked to convert Text into a question and answer set.
Cover as many details from Text as possible in the QnA set.
Instructions:
1. Both Questions and Answers MUST BE extracted from the given Text
2. Answers must be full sentences
3. Questions should be as detailed as possible from Text
4. Output must always have the provided number of QnAs
5. Create questions that ask about information from the Text
6. Questions MUST include specific keywords from the Text.
7. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.
Question examples:
1. How do owen and riggs know each other?
2. What does the word "fore" mean in golf?
3. What makes charging bull in nyc popular to tourists?
4. What kind of pistol does the army use?
5. Who was the greatest violin virtuoso in the romantic period?
<|separator|>
Text:
<|text_start|>
Mark Hamill as Luke Skywalker : One of the last living Jedi , trained by Obi - Wan and Yoda , who is also a skilled X-wing fighter pilot allied with the Rebellion .
Harrison Ford as Han Solo : A rogue smuggler , who aids the Rebellion against the Empire . Han is Luke and Leia 's friend , as well as Leia 's love interest .
Carrie Fisher as Leia Organa : The former Princess of the destroyed planet Alderaan , who joins the Rebellion ; Luke 's twin sister , and Han 's love interest .
Billy Dee Williams as Lando Calrissian : The former Baron Administrator of Cloud City and one of Han 's friends who aids the Rebellion .
Anthony Daniels as C - 3PO : A humanoid protocol droid , who sides with the Rebellion .
Peter Mayhew as Chewbacca : A Wookiee who is Han 's longtime friend , who takes part in the Rebellion .
Kenny Baker as R2 - D2 : An astromech droid , bought by Luke ; and long - time friend to C - 3PO . He also portrays a GONK power droid in the background .
Ian McDiarmid as the Emperor : The evil founding supreme ruler of the Galactic Empire , and Vader 's Sith Master .
Frank Oz as Yoda : The wise , centuries - old Grand Master of the Jedi , who is Luke 's self - exiled Jedi Master living on Dagobah . After dying , he reappears to Luke as a Force - ghost . Yoda 's Puppetry was assisted by Mike Quinn .
David Prowse as Darth Vader / Anakin Skywalker : A powerful Sith lord and the second in command of the Galactic Empire ; Luke and Leia 's father .
<|text_end|>
Output with 4 QnAs:
<|separator|>
[Q]: who played luke father in return of the jedi
[A]: David Prowse played Darth Vader, a.k.a. Anakin Skywalker, who is Luke and Leia's father.
[Q]: Who is Han Solo's best friend? And what species is he?
[A]: Han Solo's best friend is Chewbacca, who is a Wookiee.
[Q]: Who played luke's teacher in the return of the jedi
[A]: Yoda, the wise, centuries-old Grand Master of the Jedi, who is Luke's self-exiled Jedi Master living on Dagobah, was played by Frank Oz.
Also, there is a mention of Obi-Wan Kenobi, who trained Luke Skywalker.
But I can't find who played Obi-Wan Kenobi in the given text.
[Q]: Where Yoda lives in the return of the jedi?
[A]: Yoda, the Jedi Master, lives on Dagobah.
<|separator|>
Text:
<|text_start|>
{{text}}
<|text_end|>
Output with {{num_questions}} QnAs:
<|separator|>


@@ -0,0 +1,75 @@
import uuid
from typing import Optional
import pandas as pd
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from autorag.data.utils.util import corpus_df_to_langchain_documents
from autorag.utils import cast_qa_dataset
def generate_qa_ragas(
corpus_df: pd.DataFrame,
test_size: int,
distributions: Optional[dict] = None,
generator_llm: Optional[BaseChatModel] = None,
critic_llm: Optional[BaseChatModel] = None,
embedding_model: Optional[Embeddings] = None,
**kwargs,
) -> pd.DataFrame:
"""
QA dataset generation using RAGAS.
Returns qa dataset dataframe.
:param corpus_df: Corpus dataframe.
:param test_size: Number of queries to generate.
:param distributions: Distributions of different types of questions.
Default is "simple is 0.5, multi_context is 0.4, and reasoning is 0.1."
Each type of questions refers to Ragas evolution types.
:param generator_llm: Generator language model from Langchain.
:param critic_llm: Critic language model from Langchain.
:param embedding_model: Embedding model from Langchain.
:param kwargs: Additional options to pass to the 'generate_with_langchain_docs' method.
You can input 'with_debugging_logs', 'is_async', 'raise_exceptions', and 'run_config'.
:return: QA dataset dataframe.
"""
from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
if generator_llm is None:
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
if critic_llm is None:
critic_llm = ChatOpenAI(model="gpt-4-turbo")
if embedding_model is None:
embedding_model = OpenAIEmbeddings()
if distributions is None:
distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}
assert sum(list(distributions.values())) == 1.0, "Sum of distributions must be 1.0"
generator = TestsetGenerator.from_langchain(
generator_llm, critic_llm, embedding_model
)
langchain_docs = corpus_df_to_langchain_documents(corpus_df)
test_df = generator.generate_with_langchain_docs(
langchain_docs, test_size, distributions=distributions, **kwargs
).to_pandas()
result_df = pd.DataFrame(
{
"qid": [str(uuid.uuid4()) for _ in range(len(test_df))],
"query": test_df["question"].tolist(),
"generation_gt": list(map(lambda x: x, test_df["ground_truth"].tolist())),
}
)
result_df["retrieval_gt"] = test_df["metadata"].apply(
lambda x: list(map(lambda y: y["filename"], x))
)
result_df = cast_qa_dataset(result_df)
return result_df
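# A minimal usage sketch of generate_qa_ragas (assumes the module is importable as shown,
# OPENAI_API_KEY is set for the default LLMs and embeddings, and the corpus path is hypothetical):
import pandas as pd
from autorag.data.qacreation.ragas import generate_qa_ragas  # assumed module path

corpus_df = pd.read_parquet("corpus.parquet")  # hypothetical corpus path
qa_df = generate_qa_ragas(corpus_df, test_size=30)  # default LLMs, embeddings, and distributions
qa_df.to_parquet("qa_ragas.parquet", index=False)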


@@ -0,0 +1,99 @@
import os
import pathlib
import uuid
from typing import Callable
import pandas as pd
def generate_qa_row(llm, corpus_data_row):
"""
Sample code that generates one RAG dataset row using a guidance chat model.
:param llm: A guidance model.
:param corpus_data_row: A corpus row; it must have a "contents" column.
:return: A dict that contains at least the "query" and "generation_gt" keys.
"""
from guidance import gen
import guidance
temp_llm = llm
with guidance.user():
temp_llm += f"""
You have to find a passage to solve "the problem".
You need to build a clean and clear set of (problem, passage, answer) in json format
so that you don't have to ask about "the problem" again.
The problem needs to end with a question mark ("?").
The process of approaching the answer based on the information of the given passage
must be clearly and neatly displayed in the answer.\n
\n
Here is set of (problem, passage, answer) in JSON format:\n
{{\n
"passage": {corpus_data_row["contents"]}\n
"problem":
"""
with guidance.assistant():
temp_llm += gen("query", stop="?")
with guidance.user():
temp_llm += """
"answer":
"""
with guidance.assistant():
temp_llm += gen("generation_gt")
corpus_data_row["metadata"]["qa_generation"] = "simple"
response = {"query": temp_llm["query"], "generation_gt": temp_llm["generation_gt"]}
return response
def generate_simple_qa_dataset(
llm,
corpus_data: pd.DataFrame,
output_filepath: str,
generate_row_function: Callable,
**kwargs,
):
"""
Convert corpus_data into a QA dataset.
The QA dataset will be saved to the given filepath (file_dir/filename).
:param llm: guidance.models.Model
:param corpus_data: pd.DataFrame following the basic corpus structure.
:param output_filepath: The file_dir must exist and the filepath must not exist yet. The file extension must be .parquet.
:param generate_row_function: A function that takes (llm, corpus_data_row, **kwargs) and returns a dict containing at least "query" and "generation_gt".
:param kwargs: Extra keyword arguments passed to generate_row_function if it requires them.
:return: The QA dataset as a pd.DataFrame.
"""
output_file_dir = pathlib.PurePath(output_filepath).parent
if not os.path.isdir(output_file_dir):
raise NotADirectoryError(f"directory {output_file_dir} not found.")
if not output_filepath.endswith("parquet"):
raise NameError(
f'file path: {output_filepath} filename extension needs to be ".parquet"'
)
if os.path.exists(output_filepath):
raise FileExistsError(
f"{output_filepath.split('/')[-1]} already exists in {output_file_dir}."
)
qa_data_lst = []
for _, corpus_data_row in corpus_data.iterrows():
response = generate_row_function(
llm=llm, corpus_data_row=corpus_data_row, **kwargs
)
qa_data_lst.append(
{
"qid": str(uuid.uuid4()),
"query": response["query"],
"retrieval_gt": [[corpus_data_row["doc_id"]]],
"generation_gt": [response["generation_gt"]],
"metadata": corpus_data_row["metadata"],
}
)
qa_dataset = pd.DataFrame(qa_data_lst)
qa_dataset.to_parquet(output_filepath, index=False)
return qa_dataset
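# A sketch of wiring the two functions above together with a guidance chat model;
# the model name and file paths are assumptions:
import guidance
import pandas as pd

corpus_data = pd.read_parquet("corpus.parquet")  # hypothetical corpus path
llm = guidance.models.OpenAI("gpt-3.5-turbo")  # assumed guidance chat model constructor
qa_dataset = generate_simple_qa_dataset(
    llm=llm,
    corpus_data=corpus_data,
    output_filepath="simple_qa.parquet",  # must not exist yet
    generate_row_function=generate_qa_row,
)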