Fix Dockerfile build issue
This commit is contained in:
99
autorag/data/legacy/qacreation/simple.py
Normal file
99
autorag/data/legacy/qacreation/simple.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import os
|
||||
import pathlib
|
||||
import uuid
|
||||
from typing import Callable
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def generate_qa_row(llm, corpus_data_row):
    """
    Sample code to generate a RAG QA row using a guidance-wrapped chat model.

    :param llm: guidance model (guidance.models.Model); a fresh prompt chain is
        built on top of it, the passed-in model object itself is not mutated
    :param corpus_data_row: row that needs a "contents" column; its "metadata"
        dict is tagged in place with ``qa_generation = "simple"``
    :return: dict which has "query" and "generation_gt" keys at least
    """
    # Imported lazily so the module can be loaded without guidance installed.
    from guidance import gen
    import guidance

    temp_llm = llm
    with guidance.user():
        # NOTE: fixed prompt typo ("found a passge" -> "find a passage").
        temp_llm += f"""
        You have to find a passage to solve "the problem".
        You need to build a clean and clear set of (problem, passage, answer) in json format
        so that you don't have to ask about "the problem" again.
        problem need to end with question mark("?").
        The process of approaching the answer based on the information of the given passage
        must be clearly and neatly displayed in the answer.\n
        \n
        Here is set of (problem, passage, answer) in JSON format:\n
        {{\n
        "passage": {corpus_data_row["contents"]}\n
        "problem":
        """

    with guidance.assistant():
        # Stop at "?" so the generated problem ends exactly at the question mark.
        temp_llm += gen("query", stop="?")
    with guidance.user():
        temp_llm += """
        "answer":
        """
    with guidance.assistant():
        temp_llm += gen("generation_gt")

    # Tag the source row so downstream consumers know how this QA pair was made.
    corpus_data_row["metadata"]["qa_generation"] = "simple"

    response = {"query": temp_llm["query"], "generation_gt": temp_llm["generation_gt"]}
    return response
|
||||
|
||||
|
||||
def generate_simple_qa_dataset(
    llm,
    corpus_data: pd.DataFrame,
    output_filepath: str,
    generate_row_function: Callable,
    **kwargs,
):
    """
    Turn corpus_data into a QA dataset and save it to output_filepath.

    The resulting qa_dataset is written as parquet to
    ``output_filepath`` (file_dir/filename).

    :param llm: guidance.models.Model, forwarded to generate_row_function
    :param corpus_data: pd.DataFrame following the basic corpus structure;
        must provide "doc_id" and "metadata" columns plus whatever columns
        generate_row_function reads (e.g. "contents")
    :param output_filepath: parent directory must exist, the file itself must
        not; the filename extension must be ".parquet"
    :param generate_row_function: callable(llm=..., corpus_data_row=..., **kwargs)
        returning a dict that contains at least "query" and "generation_gt"
    :param kwargs: extra keyword arguments forwarded to generate_row_function
    :return: qa_dataset as pd.DataFrame
    :raises NotADirectoryError: if the parent directory does not exist
    :raises NameError: if the filename extension is not ".parquet"
    :raises FileExistsError: if output_filepath already exists
    """
    output_path = pathlib.PurePath(output_filepath)
    output_file_dir = output_path.parent
    if not os.path.isdir(output_file_dir):
        raise NotADirectoryError(f"directory {output_file_dir} not found.")
    # Check the real suffix: the previous endswith("parquet") test wrongly
    # accepted filenames like "myparquet" that have no ".parquet" extension.
    if output_path.suffix != ".parquet":
        raise NameError(
            f'file path: {output_filepath} filename extension need to be ".parquet"'
        )
    if os.path.exists(output_filepath):
        # PurePath.name is portable, unlike splitting on "/".
        raise FileExistsError(
            f"{output_path.name} already exists in {output_file_dir}."
        )

    qa_data_lst = []
    for _, corpus_data_row in corpus_data.iterrows():
        response = generate_row_function(
            llm=llm, corpus_data_row=corpus_data_row, **kwargs
        )
        qa_data_lst.append(
            {
                "qid": str(uuid.uuid4()),
                "query": response["query"],
                # retrieval_gt is a list of lists: one answer group per query.
                "retrieval_gt": [[corpus_data_row["doc_id"]]],
                "generation_gt": [response["generation_gt"]],
                "metadata": corpus_data_row["metadata"],
            }
        )

    qa_dataset = pd.DataFrame(qa_data_lst)
    qa_dataset.to_parquet(output_filepath, index=False)

    return qa_dataset
|
||||
Reference in New Issue
Block a user