Fix Dockerfile build issue
This commit is contained in:
239
autorag/data/legacy/qacreation/base.py
Normal file
239
autorag/data/legacy/qacreation/base.py
Normal file
@@ -0,0 +1,239 @@
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Callable, Optional, List
|
||||
|
||||
import chromadb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
import autorag
|
||||
from autorag.nodes.retrieval.vectordb import vectordb_ingest, vectordb_pure
|
||||
from autorag.utils.util import (
|
||||
save_parquet_safe,
|
||||
fetch_contents,
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
)
|
||||
|
||||
# Shared module logger; "AutoRAG" is the package-wide logger name used across autorag.
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def make_single_content_qa(
    corpus_df: pd.DataFrame,
    content_size: int,
    qa_creation_func: Callable,
    output_filepath: Optional[str] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    **kwargs,
) -> pd.DataFrame:
    """
    Make single content (single-hop, single-document) QA dataset using given qa_creation_func.
    It generates a single content QA dataset, which means its retrieval ground truth will be only one.
    It is the most basic form of QA dataset.

    :param corpus_df: The corpus dataframe to make QA dataset from.
        Must contain 'contents' and 'doc_id' columns.
    :param content_size: This function will generate QA dataset for the given number of contents.
        If it exceeds the corpus size, it is clamped to the corpus size (with a warning).
    :param qa_creation_func: The function to create QA pairs.
        You can use like `generate_qa_llama_index` or `generate_qa_llama_index_by_ratio`.
        The input func must have `contents` parameter for the list of content string.
        It must return, per content, a list of dicts with 'query' and 'generation_gt' keys.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet.
        File directory must exist. File extension must be .parquet
    :param upsert: If true, the function will overwrite the existing file if it exists.
        Default is False.
    :param random_state: The random state for sampling corpus from the given corpus_df.
    :param cache_batch: The number of batches to use for caching the generated QA dataset.
        When the cache_batch size data is generated, the dataset will save to the designated output_filepath.
        If the cache_batch size is too small, the process time will be longer.
    :param kwargs: The keyword arguments for qa_creation_func.
    :return: QA dataset dataframe with 'qid', 'query', 'retrieval_gt', 'generation_gt' columns.
        You can save this as parquet file to use at AutoRAG.
    """
    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)
    sampled_corpus = corpus_df.sample(n=content_size, random_state=random_state)
    sampled_corpus = sampled_corpus.reset_index(drop=True)

    def make_query_generation_gt(row):
        # Each exploded 'qa' cell is a dict with 'query' and 'generation_gt' keys.
        return row["qa"]["query"], row["qa"]["generation_gt"]

    qa_data = pd.DataFrame()
    # Fix: compute the batch starts once and pass total= so tqdm can show a
    # meaningful progress fraction (a bare enumerate() generator has no length).
    batch_starts = range(0, len(sampled_corpus), cache_batch)
    for idx, i in tqdm(enumerate(batch_starts), total=len(batch_starts)):
        qa = qa_creation_func(
            contents=sampled_corpus["contents"].tolist()[i : i + cache_batch], **kwargs
        )

        temp_qa_data = pd.DataFrame(
            {
                "qa": qa,
                "retrieval_gt": sampled_corpus["doc_id"].tolist()[i : i + cache_batch],
            }
        )
        # One row per generated QA pair; retrieval_gt is duplicated across a
        # content's pairs by explode.
        temp_qa_data = temp_qa_data.explode("qa", ignore_index=True)
        temp_qa_data["qid"] = [str(uuid.uuid4()) for _ in range(len(temp_qa_data))]
        temp_qa_data[["query", "generation_gt"]] = temp_qa_data.apply(
            make_query_generation_gt, axis=1, result_type="expand"
        )
        temp_qa_data = temp_qa_data.drop(columns=["qa"])

        # AutoRAG schema: retrieval_gt is List[List[doc_id]], generation_gt is List[str].
        temp_qa_data["retrieval_gt"] = temp_qa_data["retrieval_gt"].apply(
            lambda x: [[x]]
        )
        temp_qa_data["generation_gt"] = temp_qa_data["generation_gt"].apply(
            lambda x: [x]
        )

        # Avoid concatenating with the initial empty frame (pandas warns on that).
        if idx == 0:
            qa_data = temp_qa_data
        else:
            qa_data = pd.concat([qa_data, temp_qa_data], ignore_index=True)
        # Cache the cumulative dataset after every batch so a crash loses at
        # most one batch of generated QA pairs.
        if output_filepath is not None:
            save_parquet_safe(qa_data, output_filepath, upsert=upsert)

    return qa_data
|
||||
|
||||
|
||||
def make_qa_with_existing_qa(
    corpus_df: pd.DataFrame,
    existing_query_df: pd.DataFrame,
    content_size: int,
    answer_creation_func: Optional[Callable] = None,
    exist_gen_gt: Optional[bool] = False,
    output_filepath: Optional[str] = None,
    embedding_model: str = "openai_embed_3_large",
    collection: Optional[chromadb.Collection] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    top_k: int = 3,
    **kwargs,
) -> pd.DataFrame:
    """
    Make single-hop QA dataset using given qa_creation_func and existing queries.

    .. deprecated::
        This function raises ``DeprecationWarning`` unconditionally on entry,
        so it cannot be used; the body below the raise is unreachable.

    :param corpus_df: The corpus dataframe to make QA dataset from.
    :param existing_query_df: Dataframe containing existing queries to use for QA pair creation.
    :param content_size: This function will generate QA dataset for the given number of contents.
    :param answer_creation_func: Optional function to create answer with input query.
        If exist_gen_gt is False, this function must be given.
    :param exist_gen_gt: Optional boolean to use existing generation_gt.
        If True, the existing_query_df must have 'generation_gt' column.
        If False, the answer_creation_func must be given.
    :param output_filepath: Optional filepath to save the parquet file.
    :param embedding_model: The embedding model to use for vectorization.
        You can add your own embedding model in the autorag.embedding_models.
        Please refer to how to add an embedding model in this doc: https://docs.auto-rag.com/local_model.html
        The default is 'openai_embed_3_large'.
    :param collection: The chromadb collection to use for vector DB.
        You can make any chromadb collection and use it here.
        If you already ingested the corpus_df to the collection, the embedding process will not be repeated.
        The default is None. If None, it makes a temporary collection.
    :param upsert: If true, the function will overwrite the existing file if it exists.
    :param random_state: The random state for sampling corpus from the given corpus_df.
    :param cache_batch: The number of batches to use for caching the generated QA dataset.
    :param top_k: The number of sources to refer by model.
        Default is 3.
    :param kwargs: The keyword arguments for qa_creation_func.
    :return: QA dataset dataframe.
    """
    # NOTE(review): raising (rather than warnings.warn-ing) a DeprecationWarning
    # makes this function permanently unusable — every statement below is dead
    # code, kept for reference only.
    raise DeprecationWarning("This function is deprecated.")
    assert (
        "query" in existing_query_df.columns
    ), "existing_query_df must have 'query' column."

    # Validate inputs: either reuse provided ground truths, or require a
    # function that can generate them.
    if exist_gen_gt:
        assert (
            "generation_gt" in existing_query_df.columns
        ), "existing_query_df must have 'generation_gt' column."
    else:
        assert (
            answer_creation_func is not None
        ), "answer_creation_func must be given when exist_gen_gt is False."

    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)

    logger.info("Loading local embedding model...")
    embeddings = autorag.embedding_models[embedding_model]()

    # Vector DB creation — a throwaway in-memory collection unless the caller
    # supplies one that is already populated.
    if collection is None:
        chroma_client = chromadb.Client()
        collection_name = "auto-rag"
        collection = chroma_client.get_or_create_collection(collection_name)

    # embed corpus_df
    vectordb_ingest(collection, corpus_df, embeddings)
    query_embeddings = embeddings.get_text_embedding_batch(
        existing_query_df["query"].tolist()
    )

    # Retrieve top_k doc ids per query; vectordb_pure is a coroutine, so run
    # all lookups through the event loop in cache_batch-sized batches.
    loop = get_event_loop()
    tasks = [
        vectordb_pure([query_embedding], top_k, collection)
        for query_embedding in query_embeddings
    ]
    results = loop.run_until_complete(process_batch(tasks, batch_size=cache_batch))
    # presumably each result is (ids, scores); keep only the ids — TODO confirm
    # against vectordb_pure's return contract.
    retrieved_ids = list(map(lambda x: x[0], results))

    # Join retrieved doc ids back to their text and flatten each query's
    # documents into one "Document N\n<content>" passage string for the LLM.
    retrieved_contents: List[List[str]] = fetch_contents(corpus_df, retrieved_ids)
    input_passage_strs: List[str] = list(
        map(
            lambda x: "\n".join(
                [f"Document {i + 1}\n{content}" for i, content in enumerate(x)]
            ),
            retrieved_contents,
        )
    )

    retrieved_qa_df = pd.DataFrame(
        {
            "qid": [str(uuid.uuid4()) for _ in range(len(existing_query_df))],
            "query": existing_query_df["query"].tolist(),
            "retrieval_gt": list(map(lambda x: [x], retrieved_ids)),
            "input_passage_str": input_passage_strs,
        }
    )

    if exist_gen_gt:
        generation_gt = existing_query_df["generation_gt"].tolist()
        # NOTE(review): only np.ndarray cells are accepted here even though the
        # error message says List[str]; a plain Python list would be rejected.
        if isinstance(generation_gt[0], np.ndarray):
            retrieved_qa_df["generation_gt"] = generation_gt
        else:
            raise ValueError(
                "In existing_query_df, generation_gt (per query) must be in the form of List[str]."
            )

    sample_qa_df = retrieved_qa_df.sample(
        n=min(content_size, len(retrieved_qa_df)), random_state=random_state
    )

    # input_passage_str is only an intermediate prompt input, not part of the
    # final QA schema.
    qa_df = sample_qa_df.copy(deep=True)
    qa_df.drop(columns=["input_passage_str"], inplace=True)

    if not exist_gen_gt:
        generation_gt = answer_creation_func(
            contents=sample_qa_df["input_passage_str"].tolist(),
            queries=sample_qa_df["query"].tolist(),
            batch=cache_batch,
            **kwargs,
        )
        qa_df["generation_gt"] = generation_gt

    if output_filepath is not None:
        save_parquet_safe(qa_df, output_filepath, upsert=upsert)

    return qa_df
|
||||
Reference in New Issue
Block a user