Fix Dockerfile build issue
This commit is contained in:
239
autorag/data/legacy/qacreation/base.py
Normal file
239
autorag/data/legacy/qacreation/base.py
Normal file
@@ -0,0 +1,239 @@
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Callable, Optional, List
|
||||
|
||||
import chromadb
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
import autorag
|
||||
from autorag.nodes.retrieval.vectordb import vectordb_ingest, vectordb_pure
|
||||
from autorag.utils.util import (
|
||||
save_parquet_safe,
|
||||
fetch_contents,
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
)
|
||||
|
||||
# Shared module logger; "AutoRAG" is the package-wide logger name used across autorag.
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def make_single_content_qa(
    corpus_df: pd.DataFrame,
    content_size: int,
    qa_creation_func: Callable,
    output_filepath: Optional[str] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    **kwargs,
) -> pd.DataFrame:
    """
    Make single content (single-hop, single-document) QA dataset using given qa_creation_func.
    It generates a single content QA dataset, which means its retrieval ground truth will be only one.
    It is the most basic form of QA dataset.

    :param corpus_df: The corpus dataframe to make QA dataset from.
        Must contain 'contents' and 'doc_id' columns.
    :param content_size: This function will generate QA dataset for the given number of contents.
        If it exceeds the corpus size, it is clamped to the corpus size (with a warning).
    :param qa_creation_func: The function to create QA pairs.
        You can use like `generate_qa_llama_index` or `generate_qa_llama_index_by_ratio`.
        The input func must have `contents` parameter for the list of content string.
        It must return, per content, a list of dicts with 'query' and 'generation_gt' keys.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet.
        File directory must exist. File extension must be .parquet
    :param upsert: If true, the function will overwrite the existing file if it exists.
        Default is False.
    :param random_state: The random state for sampling corpus from the given corpus_df.
    :param cache_batch: The number of batches to use for caching the generated QA dataset.
        When the cache_batch size data is generated, the dataset will save to the designated output_filepath.
        If the cache_batch size is too small, the process time will be longer.
    :param kwargs: The keyword arguments for qa_creation_func.
    :return: QA dataset dataframe with 'qid', 'query', 'retrieval_gt', 'generation_gt' columns.
        You can save this as parquet file to use at AutoRAG.
    """
    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)
    sampled_corpus = corpus_df.sample(n=content_size, random_state=random_state)
    sampled_corpus = sampled_corpus.reset_index(drop=True)

    def make_query_generation_gt(row):
        # Each exploded 'qa' cell is a dict with 'query' and 'generation_gt' keys.
        return row["qa"]["query"], row["qa"]["generation_gt"]

    qa_data = pd.DataFrame()
    # Fix: compute the batch starts once and pass total= so tqdm can show a
    # meaningful progress fraction (a bare enumerate() generator has no length).
    batch_starts = range(0, len(sampled_corpus), cache_batch)
    for idx, i in tqdm(enumerate(batch_starts), total=len(batch_starts)):
        qa = qa_creation_func(
            contents=sampled_corpus["contents"].tolist()[i : i + cache_batch], **kwargs
        )

        temp_qa_data = pd.DataFrame(
            {
                "qa": qa,
                "retrieval_gt": sampled_corpus["doc_id"].tolist()[i : i + cache_batch],
            }
        )
        # One row per generated QA pair; retrieval_gt is duplicated across a
        # content's pairs by explode.
        temp_qa_data = temp_qa_data.explode("qa", ignore_index=True)
        temp_qa_data["qid"] = [str(uuid.uuid4()) for _ in range(len(temp_qa_data))]
        temp_qa_data[["query", "generation_gt"]] = temp_qa_data.apply(
            make_query_generation_gt, axis=1, result_type="expand"
        )
        temp_qa_data = temp_qa_data.drop(columns=["qa"])

        # AutoRAG schema: retrieval_gt is List[List[doc_id]], generation_gt is List[str].
        temp_qa_data["retrieval_gt"] = temp_qa_data["retrieval_gt"].apply(
            lambda x: [[x]]
        )
        temp_qa_data["generation_gt"] = temp_qa_data["generation_gt"].apply(
            lambda x: [x]
        )

        # Avoid concatenating with the initial empty frame (pandas warns on that).
        if idx == 0:
            qa_data = temp_qa_data
        else:
            qa_data = pd.concat([qa_data, temp_qa_data], ignore_index=True)
        # Cache the cumulative dataset after every batch so a crash loses at
        # most one batch of generated QA pairs.
        if output_filepath is not None:
            save_parquet_safe(qa_data, output_filepath, upsert=upsert)

    return qa_data
|
||||
|
||||
|
||||
def make_qa_with_existing_qa(
    corpus_df: pd.DataFrame,
    existing_query_df: pd.DataFrame,
    content_size: int,
    answer_creation_func: Optional[Callable] = None,
    exist_gen_gt: Optional[bool] = False,
    output_filepath: Optional[str] = None,
    embedding_model: str = "openai_embed_3_large",
    collection: Optional[chromadb.Collection] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    top_k: int = 3,
    **kwargs,
) -> pd.DataFrame:
    """
    Make single-hop QA dataset using given qa_creation_func and existing queries.

    .. deprecated::
        This function raises ``DeprecationWarning`` unconditionally on entry,
        so it cannot be used; the body below the raise is unreachable.

    :param corpus_df: The corpus dataframe to make QA dataset from.
    :param existing_query_df: Dataframe containing existing queries to use for QA pair creation.
    :param content_size: This function will generate QA dataset for the given number of contents.
    :param answer_creation_func: Optional function to create answer with input query.
        If exist_gen_gt is False, this function must be given.
    :param exist_gen_gt: Optional boolean to use existing generation_gt.
        If True, the existing_query_df must have 'generation_gt' column.
        If False, the answer_creation_func must be given.
    :param output_filepath: Optional filepath to save the parquet file.
    :param embedding_model: The embedding model to use for vectorization.
        You can add your own embedding model in the autorag.embedding_models.
        Please refer to how to add an embedding model in this doc: https://docs.auto-rag.com/local_model.html
        The default is 'openai_embed_3_large'.
    :param collection: The chromadb collection to use for vector DB.
        You can make any chromadb collection and use it here.
        If you already ingested the corpus_df to the collection, the embedding process will not be repeated.
        The default is None. If None, it makes a temporary collection.
    :param upsert: If true, the function will overwrite the existing file if it exists.
    :param random_state: The random state for sampling corpus from the given corpus_df.
    :param cache_batch: The number of batches to use for caching the generated QA dataset.
    :param top_k: The number of sources to refer by model.
        Default is 3.
    :param kwargs: The keyword arguments for qa_creation_func.
    :return: QA dataset dataframe.
    """
    # NOTE(review): raising (rather than warnings.warn-ing) a DeprecationWarning
    # makes this function permanently unusable — every statement below is dead
    # code, kept for reference only.
    raise DeprecationWarning("This function is deprecated.")
    assert (
        "query" in existing_query_df.columns
    ), "existing_query_df must have 'query' column."

    # Validate inputs: either reuse provided ground truths, or require a
    # function that can generate them.
    if exist_gen_gt:
        assert (
            "generation_gt" in existing_query_df.columns
        ), "existing_query_df must have 'generation_gt' column."
    else:
        assert (
            answer_creation_func is not None
        ), "answer_creation_func must be given when exist_gen_gt is False."

    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)

    logger.info("Loading local embedding model...")
    embeddings = autorag.embedding_models[embedding_model]()

    # Vector DB creation — a throwaway in-memory collection unless the caller
    # supplies one that is already populated.
    if collection is None:
        chroma_client = chromadb.Client()
        collection_name = "auto-rag"
        collection = chroma_client.get_or_create_collection(collection_name)

    # embed corpus_df
    vectordb_ingest(collection, corpus_df, embeddings)
    query_embeddings = embeddings.get_text_embedding_batch(
        existing_query_df["query"].tolist()
    )

    # Retrieve top_k doc ids per query; vectordb_pure is a coroutine, so run
    # all lookups through the event loop in cache_batch-sized batches.
    loop = get_event_loop()
    tasks = [
        vectordb_pure([query_embedding], top_k, collection)
        for query_embedding in query_embeddings
    ]
    results = loop.run_until_complete(process_batch(tasks, batch_size=cache_batch))
    # presumably each result is (ids, scores); keep only the ids — TODO confirm
    # against vectordb_pure's return contract.
    retrieved_ids = list(map(lambda x: x[0], results))

    # Join retrieved doc ids back to their text and flatten each query's
    # documents into one "Document N\n<content>" passage string for the LLM.
    retrieved_contents: List[List[str]] = fetch_contents(corpus_df, retrieved_ids)
    input_passage_strs: List[str] = list(
        map(
            lambda x: "\n".join(
                [f"Document {i + 1}\n{content}" for i, content in enumerate(x)]
            ),
            retrieved_contents,
        )
    )

    retrieved_qa_df = pd.DataFrame(
        {
            "qid": [str(uuid.uuid4()) for _ in range(len(existing_query_df))],
            "query": existing_query_df["query"].tolist(),
            "retrieval_gt": list(map(lambda x: [x], retrieved_ids)),
            "input_passage_str": input_passage_strs,
        }
    )

    if exist_gen_gt:
        generation_gt = existing_query_df["generation_gt"].tolist()
        # NOTE(review): only np.ndarray cells are accepted here even though the
        # error message says List[str]; a plain Python list would be rejected.
        if isinstance(generation_gt[0], np.ndarray):
            retrieved_qa_df["generation_gt"] = generation_gt
        else:
            raise ValueError(
                "In existing_query_df, generation_gt (per query) must be in the form of List[str]."
            )

    sample_qa_df = retrieved_qa_df.sample(
        n=min(content_size, len(retrieved_qa_df)), random_state=random_state
    )

    # input_passage_str is only an intermediate prompt input, not part of the
    # final QA schema.
    qa_df = sample_qa_df.copy(deep=True)
    qa_df.drop(columns=["input_passage_str"], inplace=True)

    if not exist_gen_gt:
        generation_gt = answer_creation_func(
            contents=sample_qa_df["input_passage_str"].tolist(),
            queries=sample_qa_df["query"].tolist(),
            batch=cache_batch,
            **kwargs,
        )
        qa_df["generation_gt"] = generation_gt

    if output_filepath is not None:
        save_parquet_safe(qa_df, output_filepath, upsert=upsert)

    return qa_df
|
||||
Reference in New Issue
Block a user