Fix Dockerfile build issue
0  autorag/data/legacy/__init__.py  Normal file
2  autorag/data/legacy/corpus/__init__.py  Normal file
@@ -0,0 +1,2 @@
from .langchain import langchain_documents_to_parquet
from .llama_index import llama_documents_to_parquet, llama_text_node_to_parquet
47  autorag/data/legacy/corpus/langchain.py  Normal file
@@ -0,0 +1,47 @@
import uuid
from typing import List, Optional

import pandas as pd
from langchain_core.documents import Document

from autorag.data.utils.util import add_essential_metadata
from autorag.utils.util import save_parquet_safe


def langchain_documents_to_parquet(
    langchain_documents: List[Document],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
    """
    Convert Langchain documents to a corpus dataframe.
    The corpus dataframe will be saved to the given filepath (file_dir/filename) if provided.
    The corpus dataframe is returned whether or not the filepath is given.
    You can use this method to create corpus.parquet after loading and chunking with Langchain.

    :param langchain_documents: List of Langchain documents.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function returns the processed data as a pd.DataFrame without saving it as parquet.
        The file directory must exist, and the file extension must be .parquet.
    :param upsert: If True, the function overwrites the existing file if it exists.
        Default is False.
    :return: Corpus data as pd.DataFrame
    """
    corpus_df = pd.DataFrame(
        list(
            map(
                lambda doc: {
                    "doc_id": str(uuid.uuid4()),
                    "contents": doc.page_content,
                    "metadata": add_essential_metadata(doc.metadata),
                },
                langchain_documents,
            )
        )
    )

    if output_filepath is not None:
        save_parquet_safe(corpus_df, output_filepath, upsert=upsert)

    return corpus_df
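A minimal usage sketch for langchain_documents_to_parquet (the loader, file paths, and flags below are illustrative assumptions, not part of this commit):

# Hypothetical usage sketch; PyPDFLoader and the paths are assumptions.
from langchain_community.document_loaders import PyPDFLoader

from autorag.data.legacy.corpus import langchain_documents_to_parquet

documents = PyPDFLoader("raw_docs/sample.pdf").load()  # load Langchain documents
corpus_df = langchain_documents_to_parquet(
    documents, output_filepath="data/corpus.parquet", upsert=True
)
print(corpus_df.columns.tolist())  # ['doc_id', 'contents', 'metadata']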
93  autorag/data/legacy/corpus/llama_index.py  Normal file
@@ -0,0 +1,93 @@
import uuid
from typing import List, Optional

import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import TextNode

from autorag.data.utils.util import (
    add_essential_metadata,
    add_essential_metadata_llama_text_node,
)
from autorag.utils.util import save_parquet_safe


def llama_documents_to_parquet(
    llama_documents: List[Document],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
    """
    Convert Llama Index documents to a corpus dataframe.
    The corpus dataframe will be saved to the given filepath (file_dir/filename) if provided.
    The corpus dataframe is returned whether or not the filepath is given.
    You can use this method to create corpus.parquet after loading and chunking with Llama Index.

    :param llama_documents: List of Llama Index documents.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function returns the processed data as a pd.DataFrame without saving it as parquet.
        The file directory must exist, and the file extension must be .parquet.
    :param upsert: If True, the function overwrites the existing file if it exists.
        Default is False.
    :return: Corpus data as pd.DataFrame
    """
    processed_df = pd.DataFrame(
        list(
            map(
                lambda doc: {
                    "doc_id": str(uuid.uuid4()),
                    "contents": doc.text,
                    "metadata": add_essential_metadata(doc.metadata),
                },
                llama_documents,
            )
        )
    )

    if output_filepath is not None:
        save_parquet_safe(processed_df, output_filepath, upsert=upsert)

    return processed_df


def llama_text_node_to_parquet(
    text_nodes: List[TextNode],
    output_filepath: Optional[str] = None,
    upsert: bool = False,
) -> pd.DataFrame:
    """
    Convert Llama Index text nodes to a corpus dataframe.
    The corpus dataframe will be saved to the given filepath (file_dir/filename) if provided.
    The corpus dataframe is returned whether or not the filepath is given.
    You can use this method to create corpus.parquet after loading and chunking with Llama Index.

    :param text_nodes: List of Llama Index text nodes.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function returns the processed data as a pd.DataFrame without saving it as parquet.
        The file directory must exist, and the file extension must be .parquet.
    :param upsert: If True, the function overwrites the existing file if it exists.
        Default is False.
    :return: Corpus data as pd.DataFrame
    """
    corpus_df = pd.DataFrame(
        list(
            map(
                lambda node: {
                    "doc_id": node.node_id,
                    "contents": node.text,
                    "metadata": add_essential_metadata_llama_text_node(
                        node.metadata, node.relationships
                    ),
                },
                text_nodes,
            )
        )
    )

    if output_filepath is not None:
        save_parquet_safe(corpus_df, output_filepath, upsert=upsert)

    return corpus_df
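A minimal usage sketch for llama_text_node_to_parquet (the reader, splitter settings, and paths are illustrative assumptions, not part of this commit):

# Hypothetical usage sketch; SimpleDirectoryReader, chunk sizes, and paths are assumptions.
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import TokenTextSplitter

from autorag.data.legacy.corpus import llama_text_node_to_parquet

documents = SimpleDirectoryReader("raw_docs").load_data()
nodes = TokenTextSplitter(chunk_size=512, chunk_overlap=50).get_nodes_from_documents(documents)
corpus_df = llama_text_node_to_parquet(nodes, output_filepath="data/corpus.parquet", upsert=True)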
6  autorag/data/legacy/qacreation/__init__.py  Normal file
@@ -0,0 +1,6 @@
from .base import make_single_content_qa, make_qa_with_existing_qa
from .llama_index import (
    generate_qa_llama_index,
    generate_answers,
    generate_qa_llama_index_by_ratio,
)
239  autorag/data/legacy/qacreation/base.py  Normal file
@@ -0,0 +1,239 @@
import logging
import uuid
from typing import Callable, Optional, List

import chromadb
import numpy as np
import pandas as pd
from tqdm import tqdm

import autorag
from autorag.nodes.retrieval.vectordb import vectordb_ingest, vectordb_pure
from autorag.utils.util import (
    save_parquet_safe,
    fetch_contents,
    get_event_loop,
    process_batch,
)

logger = logging.getLogger("AutoRAG")


def make_single_content_qa(
    corpus_df: pd.DataFrame,
    content_size: int,
    qa_creation_func: Callable,
    output_filepath: Optional[str] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    **kwargs,
) -> pd.DataFrame:
    """
    Make a single-content (single-hop, single-document) QA dataset using the given qa_creation_func.
    It generates a single-content QA dataset, which means each question has exactly one retrieval ground truth.
    It is the most basic form of QA dataset.

    :param corpus_df: The corpus dataframe to make the QA dataset from.
    :param content_size: The number of contents to generate QA pairs for.
    :param qa_creation_func: The function to create QA pairs.
        You can use functions like `generate_qa_llama_index` or `generate_qa_llama_index_by_ratio`.
        The function must have a `contents` parameter that accepts a list of content strings.
    :param output_filepath: Optional filepath to save the parquet file.
        If None, the function returns the processed data as a pd.DataFrame without saving it as parquet.
        The file directory must exist, and the file extension must be .parquet.
    :param upsert: If True, the function overwrites the existing file if it exists.
        Default is False.
    :param random_state: The random state for sampling the corpus from the given corpus_df.
    :param cache_batch: The batch size to use for caching the generated QA dataset.
        Every time cache_batch rows are generated, the dataset is saved to the designated output_filepath.
        If cache_batch is too small, the process will take longer.
    :param kwargs: The keyword arguments for qa_creation_func.
    :return: QA dataset dataframe.
        You can save this as a parquet file to use with AutoRAG.
    """
    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)
    sampled_corpus = corpus_df.sample(n=content_size, random_state=random_state)
    sampled_corpus = sampled_corpus.reset_index(drop=True)

    def make_query_generation_gt(row):
        return row["qa"]["query"], row["qa"]["generation_gt"]

    qa_data = pd.DataFrame()
    for idx, i in tqdm(enumerate(range(0, len(sampled_corpus), cache_batch))):
        qa = qa_creation_func(
            contents=sampled_corpus["contents"].tolist()[i : i + cache_batch], **kwargs
        )

        temp_qa_data = pd.DataFrame(
            {
                "qa": qa,
                "retrieval_gt": sampled_corpus["doc_id"].tolist()[i : i + cache_batch],
            }
        )
        temp_qa_data = temp_qa_data.explode("qa", ignore_index=True)
        temp_qa_data["qid"] = [str(uuid.uuid4()) for _ in range(len(temp_qa_data))]
        temp_qa_data[["query", "generation_gt"]] = temp_qa_data.apply(
            make_query_generation_gt, axis=1, result_type="expand"
        )
        temp_qa_data = temp_qa_data.drop(columns=["qa"])

        temp_qa_data["retrieval_gt"] = temp_qa_data["retrieval_gt"].apply(
            lambda x: [[x]]
        )
        temp_qa_data["generation_gt"] = temp_qa_data["generation_gt"].apply(
            lambda x: [x]
        )

        if idx == 0:
            qa_data = temp_qa_data
        else:
            qa_data = pd.concat([qa_data, temp_qa_data], ignore_index=True)
        if output_filepath is not None:
            save_parquet_safe(qa_data, output_filepath, upsert=upsert)

    return qa_data


def make_qa_with_existing_qa(
    corpus_df: pd.DataFrame,
    existing_query_df: pd.DataFrame,
    content_size: int,
    answer_creation_func: Optional[Callable] = None,
    exist_gen_gt: Optional[bool] = False,
    output_filepath: Optional[str] = None,
    embedding_model: str = "openai_embed_3_large",
    collection: Optional[chromadb.Collection] = None,
    upsert: bool = False,
    random_state: int = 42,
    cache_batch: int = 32,
    top_k: int = 3,
    **kwargs,
) -> pd.DataFrame:
    """
    Make a single-hop QA dataset from existing queries using the given answer_creation_func.

    :param corpus_df: The corpus dataframe to make the QA dataset from.
    :param existing_query_df: Dataframe containing existing queries to use for QA pair creation.
    :param content_size: The number of contents to generate QA pairs for.
    :param answer_creation_func: Optional function to create an answer from an input query.
        If exist_gen_gt is False, this function must be given.
    :param exist_gen_gt: Optional boolean to use existing generation_gt.
        If True, the existing_query_df must have a 'generation_gt' column.
        If False, the answer_creation_func must be given.
    :param output_filepath: Optional filepath to save the parquet file.
    :param embedding_model: The embedding model to use for vectorization.
        You can add your own embedding model in autorag.embedding_models.
        Please refer to how to add an embedding model in this doc: https://docs.auto-rag.com/local_model.html
        The default is 'openai_embed_3_large'.
    :param collection: The chromadb collection to use as the vector DB.
        You can make any chromadb collection and use it here.
        If you already ingested the corpus_df into the collection, the embedding process will not be repeated.
        The default is None. If None, a temporary collection is made.
    :param upsert: If True, the function overwrites the existing file if it exists.
    :param random_state: The random state for sampling the corpus from the given corpus_df.
    :param cache_batch: The batch size to use for asynchronous processing.
    :param top_k: The number of sources for the model to refer to.
        Default is 3.
    :param kwargs: The keyword arguments for answer_creation_func.
    :return: QA dataset dataframe.
    """
    raise DeprecationWarning("This function is deprecated.")
    assert (
        "query" in existing_query_df.columns
    ), "existing_query_df must have 'query' column."

    if exist_gen_gt:
        assert (
            "generation_gt" in existing_query_df.columns
        ), "existing_query_df must have 'generation_gt' column."
    else:
        assert (
            answer_creation_func is not None
        ), "answer_creation_func must be given when exist_gen_gt is False."

    assert content_size > 0, "content_size must be greater than 0."
    if content_size > len(corpus_df):
        logger.warning(
            f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. "
            "Setting content_size to the corpus size."
        )
        content_size = len(corpus_df)

    logger.info("Loading local embedding model...")
    embeddings = autorag.embedding_models[embedding_model]()

    # Vector DB creation
    if collection is None:
        chroma_client = chromadb.Client()
        collection_name = "auto-rag"
        collection = chroma_client.get_or_create_collection(collection_name)

    # embed corpus_df
    vectordb_ingest(collection, corpus_df, embeddings)
    query_embeddings = embeddings.get_text_embedding_batch(
        existing_query_df["query"].tolist()
    )

    loop = get_event_loop()
    tasks = [
        vectordb_pure([query_embedding], top_k, collection)
        for query_embedding in query_embeddings
    ]
    results = loop.run_until_complete(process_batch(tasks, batch_size=cache_batch))
    retrieved_ids = list(map(lambda x: x[0], results))

    retrieved_contents: List[List[str]] = fetch_contents(corpus_df, retrieved_ids)
    input_passage_strs: List[str] = list(
        map(
            lambda x: "\n".join(
                [f"Document {i + 1}\n{content}" for i, content in enumerate(x)]
            ),
            retrieved_contents,
        )
    )

    retrieved_qa_df = pd.DataFrame(
        {
            "qid": [str(uuid.uuid4()) for _ in range(len(existing_query_df))],
            "query": existing_query_df["query"].tolist(),
            "retrieval_gt": list(map(lambda x: [x], retrieved_ids)),
            "input_passage_str": input_passage_strs,
        }
    )

    if exist_gen_gt:
        generation_gt = existing_query_df["generation_gt"].tolist()
        if isinstance(generation_gt[0], np.ndarray):
            retrieved_qa_df["generation_gt"] = generation_gt
        else:
            raise ValueError(
                "In existing_query_df, generation_gt (per query) must be in the form of List[str]."
            )

    sample_qa_df = retrieved_qa_df.sample(
        n=min(content_size, len(retrieved_qa_df)), random_state=random_state
    )

    qa_df = sample_qa_df.copy(deep=True)
    qa_df.drop(columns=["input_passage_str"], inplace=True)

    if not exist_gen_gt:
        generation_gt = answer_creation_func(
            contents=sample_qa_df["input_passage_str"].tolist(),
            queries=sample_qa_df["query"].tolist(),
            batch=cache_batch,
            **kwargs,
        )
        qa_df["generation_gt"] = generation_gt

    if output_filepath is not None:
        save_parquet_safe(qa_df, output_filepath, upsert=upsert)

    return qa_df
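A minimal usage sketch for make_single_content_qa (the corpus path, LLM, and model name are illustrative assumptions, not part of this commit); keyword arguments such as llm are forwarded to the qa_creation_func through **kwargs:

# Hypothetical usage sketch; the corpus path and model name are assumptions.
import pandas as pd
from llama_index.llms.openai import OpenAI

from autorag.data.legacy.qacreation import make_single_content_qa, generate_qa_llama_index

corpus_df = pd.read_parquet("data/corpus.parquet")
qa_df = make_single_content_qa(
    corpus_df,
    content_size=50,
    qa_creation_func=generate_qa_llama_index,
    output_filepath="data/qa.parquet",
    upsert=True,
    llm=OpenAI(model="gpt-4o-mini"),  # forwarded to generate_qa_llama_index via **kwargs
    question_num_per_content=1,
)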
253  autorag/data/legacy/qacreation/llama_index.py  Normal file
@@ -0,0 +1,253 @@
import os.path
import random
from typing import Optional, List, Dict, Any

import pandas as pd
from llama_index.core.base.llms.types import ChatMessage, MessageRole
from llama_index.core.llms import LLM

from autorag.utils.util import process_batch, get_event_loop

package_dir = os.path.dirname(os.path.realpath(__file__))


def generate_qa_llama_index(
    llm: LLM,
    contents: List[str],
    prompt: Optional[str] = None,
    question_num_per_content: int = 1,
    max_retries: int = 3,
    batch: int = 4,
) -> List[List[Dict]]:
    """
    Generate a QA set from the list of contents.
    It uses a single prompt for all contents.
    If you want to use more than one prompt for generating QA pairs,
    consider using generate_qa_llama_index_by_ratio.

    :param llm: Llama Index LLM
    :param contents: List of content strings.
    :param prompt: The prompt to use for QA generation.
        The prompt must include the following placeholders:
        - {{text}}: The content string
        - {{num_questions}}: The number of questions to generate
        If None, the packaged default prompt (llama_index_default_prompt.txt) is used.
    :param question_num_per_content: Number of questions to generate for each content.
        Default is 1.
    :param max_retries: The maximum number of retries when the number of generated questions does not equal the target number.
        Default is 3.
    :param batch: The batch size to process asynchronously.
        Default is 4.
    :return: 2-d list of dictionaries containing the query and generation_gt.
    """
    # load default prompt
    if prompt is None:
        prompt = open(
            os.path.join(package_dir, "llama_index_default_prompt.txt"), "r"
        ).read()

    tasks = [
        async_qa_gen_llama_index(
            content, llm, prompt, question_num_per_content, max_retries
        )
        for content in contents
    ]
    loops = get_event_loop()
    results = loops.run_until_complete(process_batch(tasks, batch))
    return results


def generate_answers(
    llm: LLM,
    contents: List[str],
    queries: List[str],
    batch: int = 4,
) -> List[str]:
    """
    Generate answers for existing queries from the given contents.

    :param llm: Llama Index LLM
    :param contents: List of content strings.
    :param queries: List of existing queries.
    :param batch: The batch size to process asynchronously.
    :return: List of generated answer strings, one per query.
    """
    tasks = [
        generate_basic_answer(llm, content, query)
        for content, query in zip(contents, queries)
    ]
    loops = get_event_loop()
    results = loops.run_until_complete(process_batch(tasks, batch))
    return results


def generate_qa_llama_index_by_ratio(
    llm: LLM,
    contents: List[str],
    prompts_ratio: Dict,
    question_num_per_content: int = 1,
    max_retries: int = 3,
    random_state: int = 42,
    batch: int = 4,
) -> List[List[Dict]]:
    """
    Generate a QA set from the list of contents.
    You can set the ratio of prompts that you want to use for generating QA pairs.
    It randomly distributes the contents among the prompts by the given ratio.

    :param llm: Llama Index LLM
    :param contents: List of content strings.
    :param prompts_ratio: Dictionary of prompt paths and their ratios.
        Example: {"prompt/prompt1.txt": 0.5, "prompt/prompt2.txt": 0.5}
        The values do not have to sum to 1.
        Each path must be an absolute path, and the file must exist.
        Plus, it has to be a text file that contains a proper prompt.
        Each prompt must contain the following placeholders:
        - {{text}}: The content string
        - {{num_questions}}: The number of questions to generate
    :param question_num_per_content: Number of questions to generate for each content.
        Default is 1.
    :param max_retries: The maximum number of retries when the number of generated questions does not equal the target number.
        Default is 3.
    :param random_state: Random seed.
        Default is 42.
    :param batch: The batch size to process asynchronously.
        Default is 4.
    :return: 2-d list of dictionaries containing the query and generation_gt.
    """
    prompts = list(map(lambda path: open(path, "r").read(), prompts_ratio.keys()))
    assert all([validate_llama_index_prompt(prompt) for prompt in prompts])

    content_indices = list(range(len(contents)))
    random.seed(random_state)
    random.shuffle(content_indices)

    slice_content_indices: List[List[int]] = distribute_list_by_ratio(
        content_indices, list(prompts_ratio.values())
    )
    temp_df = pd.DataFrame({"idx": slice_content_indices, "prompt": prompts})
    temp_df = temp_df.explode("idx", ignore_index=True)
    temp_df = temp_df.sort_values(by="idx", ascending=True)

    final_df = pd.DataFrame({"content": contents, "prompt": temp_df["prompt"].tolist()})

    tasks = [
        async_qa_gen_llama_index(
            content, llm, prompt, question_num_per_content, max_retries
        )
        for content, prompt in zip(
            final_df["content"].tolist(), final_df["prompt"].tolist()
        )
    ]

    loops = get_event_loop()
    results = loops.run_until_complete(process_batch(tasks, batch))

    return results


async def async_qa_gen_llama_index(
    content: str,
    llm: LLM,
    prompt: str,
    question_num: int = 1,
    max_retries: int = 3,
):
    """
    Generate a QA set from the given content using the Llama Index LLM.

    :param content: Content string
    :param llm: Llama Index LLM
    :param prompt: The prompt to use for QA generation.
        The prompt must include the following placeholders:
        - {{text}}: The content string
        - {{num_questions}}: The number of questions to generate
    :param question_num: The number of questions to generate
    :param max_retries: Maximum number of retries when the number of generated questions does not equal the target number
    :return: List of dictionaries containing the query and generation_gt
    """
    validate_llama_index_prompt(prompt)

    async def generate(content: str, llm: LLM):
        for _ in range(max_retries):
            output = await llm.acomplete(
                prompt.replace("{{text}}", content).replace(
                    "{{num_questions}}", str(question_num)
                )
            )
            result = parse_output(output.text)
            if len(result) == question_num:
                return result
        raise InterruptedError(
            f"Failed to generate output of length {question_num} after {max_retries} retries."
        )

    return await generate(content, llm)


async def generate_basic_answer(llm: LLM, passage_str: str, query: str) -> str:
    basic_answer_system_prompt = """You are an AI assistant that answers the given question using the provided evidence text.
You have to find the evidence about the question in the given text and write a proper answer to the given question.
You have to preserve the question's language in the answer.
For example, if the input question is Korean, the output answer must be in Korean.
"""
    user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"

    response = await llm.achat(
        messages=[
            ChatMessage(role=MessageRole.SYSTEM, content=basic_answer_system_prompt),
            ChatMessage(role=MessageRole.USER, content=user_prompt),
        ],
        temperature=1.0,
    )
    return response.message.content


def validate_llama_index_prompt(prompt: str) -> bool:
    """
    Validate the prompt for Llama Index QA generation.
    The prompt must include the following placeholders:
    - {{text}}: The content string
    - {{num_questions}}: The number of questions to generate
    """
    if "{{text}}" not in prompt:
        raise ValueError("The prompt must include the placeholder {{text}}.")
    if "{{num_questions}}" not in prompt:
        raise ValueError("The prompt must include the placeholder {{num_questions}}.")
    return True


def parse_output(result: str) -> List[Dict]:
    result = result.strip()
    result = result.split("[Q]:")
    final_result = list()
    for res in result:
        res = res.strip()
        if res and "\n[A]:" in res:
            qa = res.split("\n[A]:")
            final_result.append(
                {"query": qa[0].strip(), "generation_gt": qa[1].strip()}
            )
    return final_result


def distribute_list_by_ratio(input_list, ratio) -> List[List[Any]]:
    total_ratio = sum(ratio)
    total_length = len(input_list)

    # Calculate the length of each slice
    slice_lengths = [int((r / total_ratio) * total_length) for r in ratio]

    # Adjust the last slice in case of rounding issues
    slice_lengths[-1] = total_length - sum(slice_lengths[:-1])

    slices = []
    start = 0
    for length in slice_lengths:
        end = start + length
        slices.append(input_list[start:end])
        start = end

    return slices
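A minimal usage sketch for generate_qa_llama_index_by_ratio (the prompt paths, contents, and model name are illustrative assumptions, not part of this commit); each prompt file must contain the {{text}} and {{num_questions}} placeholders:

# Hypothetical usage sketch; the prompt paths and model name are assumptions.
from llama_index.llms.openai import OpenAI

from autorag.data.legacy.qacreation import generate_qa_llama_index_by_ratio

contents = ["First passage ...", "Second passage ...", "Third passage ..."]
qa_sets = generate_qa_llama_index_by_ratio(
    llm=OpenAI(model="gpt-4o-mini"),
    contents=contents,
    prompts_ratio={
        "/abs/path/factoid_prompt.txt": 0.7,
        "/abs/path/reasoning_prompt.txt": 0.3,
    },
    question_num_per_content=2,
    batch=4,
)
# qa_sets[i] is the list of {"query": ..., "generation_gt": ...} dicts for contents[i]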
54  autorag/data/legacy/qacreation/llama_index_default_prompt.txt  Normal file
@@ -0,0 +1,54 @@
You're an AI tasked to convert Text into a question and answer set.
Cover as many details from Text as possible in the QnA set.

Instructions:
1. Both Questions and Answers MUST BE extracted from given Text
2. Answers must be full sentences
3. Questions should be as detailed as possible from Text
4. Output must always have the provided number of QnAs
5. Create questions that ask about information from the Text
6. MUST include specific keywords from the Text.
7. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.

Question examples:
1. How do owen and riggs know each other?
2. What does the word fore "mean" in golf?
3. What makes charging bull in nyc popular to tourists?
4. What kind of pistol does the army use?
5. Who was the greatest violin virtuoso in the romantic period?
<|separator|>

Text:
<|text_start|>
Mark Hamill as Luke Skywalker : One of the last living Jedi , trained by Obi - Wan and Yoda , who is also a skilled X-wing fighter pilot allied with the Rebellion .
Harrison Ford as Han Solo : A rogue smuggler , who aids the Rebellion against the Empire . Han is Luke and Leia 's friend , as well as Leia 's love interest .
Carrie Fisher as Leia Organa : The former Princess of the destroyed planet Alderaan , who joins the Rebellion ; Luke 's twin sister , and Han 's love interest .
Billy Dee Williams as Lando Calrissian : The former Baron Administrator of Cloud City and one of Han 's friends who aids the Rebellion .
Anthony Daniels as C - 3PO : A humanoid protocol droid , who sides with the Rebellion .
Peter Mayhew as Chewbacca : A Wookiee who is Han 's longtime friend , who takes part in the Rebellion .
Kenny Baker as R2 - D2 : An astromech droid , bought by Luke ; and long - time friend to C - 3PO . He also portrays a GONK power droid in the background .
Ian McDiarmid as the Emperor : The evil founding supreme ruler of the Galactic Empire , and Vader 's Sith Master .
Frank Oz as Yoda : The wise , centuries - old Grand Master of the Jedi , who is Luke 's self - exiled Jedi Master living on Dagobah . After dying , he reappears to Luke as a Force - ghost . Yoda 's Puppetry was assisted by Mike Quinn .
David Prowse as Darth Vader / Anakin Skywalker : A powerful Sith lord and the second in command of the Galactic Empire ; Luke and Leia 's father .
<|text_end|>
Output with 4 QnAs:
<|separator|>

[Q]: who played luke father in return of the jedi
[A]: David Prowse acted as Darth Vader, a.k.a Anakin Skywalker, which is Luke and Leia's father.
[Q]: Who is Han Solo's best friend? And what species is he?
[A]: Han Solo's best friend is Chewbacca, who is a Wookiee.
[Q]: Who played luke's teacher in the return of the jedi
[A]: Yoda, the wise, centuries-old Grand Master of the Jedi, who is Luke's self-exiled Jedi Master living on Dagobah, was played by Frank Oz.
Also, there is a mention of Obi-Wan Kenobi, who trained Luke Skywalker.
But I can't find who played Obi-Wan Kenobi in the given text.
[Q]: Where Yoda lives in the return of the jedi?
[A]: Yoda, the Jedi Master, lives on Dagobah.
<|separator|>

Text:
<|text_start|>
{{text}}
<|text_end|>
Output with {{num_questions}} QnAs:
<|separator|>
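The [Q]:/[A]: format that this prompt requests is what parse_output in llama_index.py consumes; a small illustrative sketch (the raw string is made up):

# Illustrative only: how parse_output splits the [Q]:/[A]: output format.
from autorag.data.legacy.qacreation.llama_index import parse_output

raw = "[Q]: Where does Yoda live?\n[A]: Yoda lives on Dagobah.\n[Q]: Who is Luke's father?\n[A]: Darth Vader is Luke's father."
print(parse_output(raw))
# [{'query': 'Where does Yoda live?', 'generation_gt': 'Yoda lives on Dagobah.'},
#  {'query': "Who is Luke's father?", 'generation_gt': "Darth Vader is Luke's father."}]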
75  autorag/data/legacy/qacreation/ragas.py  Normal file
@@ -0,0 +1,75 @@
import uuid
from typing import Optional

import pandas as pd
from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from autorag.data.utils.util import corpus_df_to_langchain_documents
from autorag.utils import cast_qa_dataset


def generate_qa_ragas(
    corpus_df: pd.DataFrame,
    test_size: int,
    distributions: Optional[dict] = None,
    generator_llm: Optional[BaseChatModel] = None,
    critic_llm: Optional[BaseChatModel] = None,
    embedding_model: Optional[Embeddings] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    QA dataset generation using RAGAS.
    Returns a QA dataset dataframe.

    :param corpus_df: Corpus dataframe.
    :param test_size: Number of queries to generate.
    :param distributions: Distributions of the different question types.
        Default is "simple is 0.5, multi_context is 0.4, and reasoning is 0.1."
        Each question type refers to a Ragas evolution type.
    :param generator_llm: Generator language model from Langchain.
    :param critic_llm: Critic language model from Langchain.
    :param embedding_model: Embedding model from Langchain.
    :param kwargs: The additional options to pass to the 'generate_with_langchain_docs' method.
        You can input 'with_debugging_logs', 'is_async', 'raise_exceptions', and 'run_config'.
    :return: QA dataset dataframe.
    """
    from ragas.testset import TestsetGenerator
    from ragas.testset.evolutions import simple, reasoning, multi_context

    if generator_llm is None:
        generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
    if critic_llm is None:
        critic_llm = ChatOpenAI(model="gpt-4-turbo")
    if embedding_model is None:
        embedding_model = OpenAIEmbeddings()
    if distributions is None:
        distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

    assert sum(list(distributions.values())) == 1.0, "Sum of distributions must be 1.0"

    generator = TestsetGenerator.from_langchain(
        generator_llm, critic_llm, embedding_model
    )

    langchain_docs = corpus_df_to_langchain_documents(corpus_df)

    test_df = generator.generate_with_langchain_docs(
        langchain_docs, test_size, distributions=distributions, **kwargs
    ).to_pandas()

    result_df = pd.DataFrame(
        {
            "qid": [str(uuid.uuid4()) for _ in range(len(test_df))],
            "query": test_df["question"].tolist(),
            "generation_gt": test_df["ground_truth"].tolist(),
        }
    )

    result_df["retrieval_gt"] = test_df["metadata"].apply(
        lambda x: list(map(lambda y: y["filename"], x))
    )
    result_df = cast_qa_dataset(result_df)

    return result_df
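A minimal usage sketch for generate_qa_ragas (the corpus and output paths are illustrative assumptions, not part of this commit); with no LLMs or distributions passed, the defaults shown above are used:

# Hypothetical usage sketch; the paths are assumptions.
import pandas as pd

from autorag.data.legacy.qacreation.ragas import generate_qa_ragas

corpus_df = pd.read_parquet("data/corpus.parquet")
qa_df = generate_qa_ragas(corpus_df, test_size=20)
qa_df.to_parquet("data/ragas_qa.parquet", index=False)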
99  autorag/data/legacy/qacreation/simple.py  Normal file
@@ -0,0 +1,99 @@
import os
import pathlib
import uuid
from typing import Callable

import pandas as pd


def generate_qa_row(llm, corpus_data_row):
    """
    Sample code to generate a RAG dataset row using an OpenAI chat model via guidance.

    :param llm: guidance model
    :param corpus_data_row: Corpus row; it needs a "contents" column.
    :return: Dict that has at least "query" and "generation_gt" keys.
    """
    from guidance import gen
    import guidance

    temp_llm = llm
    with guidance.user():
        temp_llm += f"""
        You have to find a passage to solve "the problem".
        You need to build a clean and clear set of (problem, passage, answer) in JSON format
        so that you don't have to ask about "the problem" again.
        The problem needs to end with a question mark ("?").
        The process of approaching the answer based on the information of the given passage
        must be clearly and neatly displayed in the answer.\n
        \n
        Here is the set of (problem, passage, answer) in JSON format:\n
        {{\n
        "passage": {corpus_data_row["contents"]}\n
        "problem":
        """

    with guidance.assistant():
        temp_llm += gen("query", stop="?")
    with guidance.user():
        temp_llm += """
        "answer":
        """
    with guidance.assistant():
        temp_llm += gen("generation_gt")

    corpus_data_row["metadata"]["qa_generation"] = "simple"

    response = {"query": temp_llm["query"], "generation_gt": temp_llm["generation_gt"]}
    return response


def generate_simple_qa_dataset(
    llm,
    corpus_data: pd.DataFrame,
    output_filepath: str,
    generate_row_function: Callable,
    **kwargs,
):
    """
    Convert corpus_data to a qa_dataset.
    The qa_dataset will be saved to the given filepath (file_dir/filename).

    :param llm: guidance.models.Model
    :param corpus_data: pd.DataFrame. Refer to the basic corpus structure.
    :param output_filepath: The file_dir must exist, the filepath must not exist, and the file extension must be .parquet.
    :param generate_row_function: input(llm, corpus_data_row, kwargs), output(dict that contains at least "query" and "generation_gt")
    :param kwargs: If generate_row_function requires more args, use kwargs.
    :return: qa_dataset as pd.DataFrame
    """
    output_file_dir = pathlib.PurePath(output_filepath).parent
    if not os.path.isdir(output_file_dir):
        raise NotADirectoryError(f"directory {output_file_dir} not found.")
    if not output_filepath.endswith("parquet"):
        raise NameError(
            f'file path: {output_filepath} filename extension need to be ".parquet"'
        )
    if os.path.exists(output_filepath):
        raise FileExistsError(
            f"{output_filepath.split('/')[-1]} already exists in {output_file_dir}."
        )

    qa_data_lst = []
    for _, corpus_data_row in corpus_data.iterrows():
        response = generate_row_function(
            llm=llm, corpus_data_row=corpus_data_row, **kwargs
        )
        qa_data_lst.append(
            {
                "qid": str(uuid.uuid4()),
                "query": response["query"],
                "retrieval_gt": [[corpus_data_row["doc_id"]]],
                "generation_gt": [response["generation_gt"]],
                "metadata": corpus_data_row["metadata"],
            }
        )

    qa_dataset = pd.DataFrame(qa_data_lst)
    qa_dataset.to_parquet(output_filepath, index=False)

    return qa_dataset
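A minimal usage sketch for generate_simple_qa_dataset (the guidance model, corpus path, and output path are illustrative assumptions, not part of this commit); note that output_filepath must not already exist:

# Hypothetical usage sketch; the guidance model and paths are assumptions.
import pandas as pd
from guidance import models

from autorag.data.legacy.qacreation.simple import generate_qa_row, generate_simple_qa_dataset

corpus_df = pd.read_parquet("data/corpus.parquet")
llm = models.OpenAI("gpt-4o-mini")  # any guidance chat model
qa_df = generate_simple_qa_dataset(
    llm=llm,
    corpus_data=corpus_df,
    output_filepath="data/simple_qa.parquet",
    generate_row_function=generate_qa_row,
)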