Fix Dockerfile build issue
autorag/nodes/retrieval/vectordb.py (new file, +303)
@@ -0,0 +1,303 @@
import itertools
import logging
import os
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from autorag.evaluation.metric.util import (
    calculate_l2_distance,
    calculate_inner_product,
    calculate_cosine_similarity,
)
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
from autorag.utils import (
    validate_corpus_dataset,
    cast_corpus_dataset,
    cast_qa_dataset,
    validate_qa_dataset,
)
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    openai_truncate_by_token,
    flatten_apply,
    result_to_dataframe,
    pop_params,
    fetch_contents,
    empty_cuda_cache,
    convert_inputs_to_list,
    make_batch,
)
from autorag.vectordb import load_vectordb_from_yaml
from autorag.vectordb.base import BaseVectorStore

logger = logging.getLogger("AutoRAG")

class VectorDB(BaseRetrieval):
    def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
        """
        Initialize the VectorDB retrieval node.

        :param project_dir: The project directory path.
        :param vectordb: The name of the vectordb to use.
            You must configure the vectordb name in the config.yaml file.
            If you don't, the default vectordb is used.
        :param kwargs: Optional arguments; not used in the init method.
        """
        super().__init__(project_dir)

        vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
        self.vector_store = load_vectordb_from_yaml(
            vectordb_config_path, vectordb, project_dir
        )

        self.embedding_model = self.vector_store.embedding
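
    # A minimal sketch of what a named vectordb entry in the loaded
    # vectordb.yaml might look like. The exact keys (db_type, collection_name,
    # embedding_model) are assumptions for illustration, not the definitive
    # schema of load_vectordb_from_yaml:
    #
    #   vectordb:
    #     - name: default
    #       db_type: chroma
    #       collection_name: my_corpus
    #       embedding_model: openai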
    def __del__(self):
        del self.vector_store
        del self.embedding_model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries = self.cast_to_run(previous_result)
        pure_params = pop_params(self._pure, kwargs)
        ids, scores = self._pure(queries, **pure_params)
        contents = fetch_contents(self.corpus_df, ids)
        return contents, ids, scores

    def _pure(
        self,
        queries: List[List[str]],
        top_k: int,
        embedding_batch: int = 128,
        ids: Optional[List[List[str]]] = None,
    ) -> Tuple[List[List[str]], List[List[float]]]:
        """
        VectorDB retrieval function.
        The vector store collection must already be ingested,
        and the embedding model must be the same one that was used at ingest time.

        :param queries: A 2-d list of query strings.
            Each element is the list of query strings for one row.
        :param top_k: The number of passages to retrieve.
        :param embedding_batch: The number of queries to process in parallel.
            This is used to prevent API errors during query embedding.
            Default is 128.
        :param ids: An optional 2-d list of ids that you want to retrieve.
            You don't need to specify this in general use cases.
            Default is None.

        :return: A 2-d list of passage ids retrieved from the vectordb and a
            2-d list of their scores.
            Both have the same length as queries, and each element has a length of top_k.
        """
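        # Shape illustration with hypothetical values: for two rows,
        # queries = [["what is RAG?", "define retrieval augmented generation"],
        #            ["how do I ingest a corpus?"]]
        # returns ids and scores that each have shape (2, top_k).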
        # if ids are specified, fetch the scores for those ids from the vector store
        if ids is not None:
            return self.__get_ids_scores(queries, ids, embedding_batch)

        # run the async vectordb_pure function
        tasks = [
            vectordb_pure(query_list, top_k, self.vector_store)
            for query_list in queries
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(
            process_batch(tasks, batch_size=embedding_batch)
        )
        id_result = list(map(lambda x: x[0], results))
        score_result = list(map(lambda x: x[1], results))
        return id_result, score_result

    def __get_ids_scores(self, queries, ids, embedding_batch: int):
        # Truncate queries (for OpenAI embeddings) and run the query embedding here.
        openai_embedding_limit = 8000
        if isinstance(self.embedding_model, OpenAIEmbedding):
            queries = list(
                map(
                    lambda query_list: openai_truncate_by_token(
                        query_list,
                        openai_embedding_limit,
                        self.embedding_model.model_name,
                    ),
                    queries,
                )
            )

        query_embeddings = flatten_apply(
            run_query_embedding_batch,
            queries,
            embedding_model=self.embedding_model,
            batch_size=embedding_batch,
        )

        loop = get_event_loop()

        async def run_fetch(ids):
            final_result = []
            for id_list in ids:
                if len(id_list) == 0:
                    final_result.append([])
                else:
                    result = await self.vector_store.fetch(id_list)
                    final_result.append(result)
            return final_result

        content_embeddings = loop.run_until_complete(run_fetch(ids))

        score_result = list(
            map(
                lambda query_embedding_list, content_embedding_list: get_id_scores(
                    query_embedding_list,
                    content_embedding_list,
                    similarity_metric=self.vector_store.similarity_metric,
                ),
                query_embeddings,
                content_embeddings,
            )
        )
        return ids, score_result


async def vectordb_pure(
    queries: List[str], top_k: int, vectordb: BaseVectorStore
) -> Tuple[List[str], List[float]]:
    """
    Async VectorDB retrieval function.
    It is used for async, row-by-row retrieval from the vector db.

    :param queries: A list of query strings for one row.
    :param top_k: The number of passages to retrieve.
    :param vectordb: The vector store instance.
    :return: A tuple of the passage ids retrieved from the vectordb and their scores.
    """
    id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)

    # Distribute passages evenly
    id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
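    # Example with hypothetical values: ids=["a", "b"], scores=[0.2, 0.9]
    # becomes ids=["b", "a"], scores=[0.9, 0.2] after the sort below.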
    # sort id_result and score_result by score
    result = [
        (_id, score)
        for score, _id in sorted(
            zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
        )
    ]
    id_result, score_result = zip(*result)
    return list(id_result), list(score_result)


async def filter_exist_ids(
    vectordb: BaseVectorStore,
    corpus_data: pd.DataFrame,
) -> pd.DataFrame:
    corpus_data = cast_corpus_dataset(corpus_data)
    validate_corpus_dataset(corpus_data)
    ids = corpus_data["doc_id"].tolist()

    # Query the vector store to check which IDs already exist
    existed_bool_list = await vectordb.is_exist(ids=ids)
    # Keep only the passages whose ids do not exist yet
    new_passage = corpus_data[~pd.Series(existed_bool_list)]
    return new_passage


async def filter_exist_ids_from_retrieval_gt(
    vectordb: BaseVectorStore,
    qa_data: pd.DataFrame,
    corpus_data: pd.DataFrame,
) -> pd.DataFrame:
    qa_data = cast_qa_dataset(qa_data)
    validate_qa_dataset(qa_data)
    corpus_data = cast_corpus_dataset(corpus_data)
    validate_corpus_dataset(corpus_data)
    retrieval_gt = (
        qa_data["retrieval_gt"]
        .apply(lambda x: list(itertools.chain.from_iterable(x)))
        .tolist()
    )
    retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
    retrieval_gt = list(set(retrieval_gt))

    existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
    add_ids = []
    for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
        if not is_exist:
            add_ids.append(ret_gt)
    new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
    return new_passage


async def vectordb_ingest(
    vectordb: BaseVectorStore,
    corpus_data: pd.DataFrame,
):
    """
    Ingest the given corpus data into the vectordb.
    When the embedding model is OpenAIEmbedding, corpus contents are truncated to 8,000 tokens.
    Corpus contents that are empty (whitespace only) are ignored,
    as are document ids that already exist in the collection.

    :param vectordb: The vector store instance that you want to ingest into.
    :param corpus_data: The corpus data that contains doc_id and contents columns.
    """
    embedding_batch = vectordb.embedding_batch
    if not corpus_data.empty:
        new_contents = corpus_data["contents"].tolist()
        new_ids = corpus_data["doc_id"].tolist()
        content_batches = make_batch(new_contents, embedding_batch)
        id_batches = make_batch(new_ids, embedding_batch)
        for content_batch, id_batch in zip(content_batches, id_batches):
            await vectordb.add(ids=id_batch, texts=content_batch)

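# A minimal usage sketch combining the two helpers above: filter out passages
# that are already ingested, then ingest only the remainder. `my_vectordb` and
# `my_corpus_df` are hypothetical names, not part of this module:
#
#   import asyncio
#
#   async def ingest_new_passages(my_vectordb, my_corpus_df):
#       new_passages = await filter_exist_ids(my_vectordb, my_corpus_df)
#       await vectordb_ingest(my_vectordb, new_passages)
#
#   asyncio.run(ingest_new_passages(my_vectordb, my_corpus_df))
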
def run_query_embedding_batch(
    queries: List[str], embedding_model: BaseEmbedding, batch_size: int
) -> List[List[float]]:
    result = []
    for i in range(0, len(queries), batch_size):
        batch = queries[i : i + batch_size]
        embeddings = embedding_model.get_text_embedding_batch(batch)
        result.extend(embeddings)
    return result

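# Illustration with hypothetical values: with batch_size=2,
# run_query_embedding_batch(["q1", "q2", "q3"], model, 2) embeds the batches
# ["q1", "q2"] and ["q3"], returning three embedding vectors in order.
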
@convert_inputs_to_list
def get_id_scores(
    query_embeddings: List[List[float]],  # embeddings of the query strings from one input row
    content_embeddings: List[List[float]],
    similarity_metric: str,
) -> List[float]:
    """
    Calculate the highest similarity scores between query embeddings and content embeddings.
    It is used to fill in scores that were not calculated when fusing scores
    for hybrid retrieval.

    :param query_embeddings: A list of lists containing query embeddings.
    :param content_embeddings: A list of lists containing content embeddings.
    :param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
    :return: A list of the highest similarity score for each content embedding.
        Its length is the same as the length of content_embeddings.
    """
    metric_func_dict = {
        # l2 distance is converted to a similarity so that higher is better
        "l2": lambda x, y: 1 - calculate_l2_distance(x, y),
        "ip": calculate_inner_product,
        "cosine": calculate_cosine_similarity,
    }
    metric_func = metric_func_dict[similarity_metric]

    result = []
    for content_embedding in content_embeddings:
        scores = []
        for query_embedding in query_embeddings:
            scores.append(
                metric_func(np.array(query_embedding), np.array(content_embedding))
            )
        result.append(max(scores))
    return result
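
# A quick worked example (hypothetical values) for the 'cosine' metric: two
# query embeddings scored against one content embedding.
#
#   get_id_scores([[1.0, 0.0], [0.0, 1.0]], [[1.0, 0.0]], "cosine")
#
# The first query matches the content exactly (cosine similarity 1.0), the
# second is orthogonal (0.0), so the per-content maximum kept is ~[1.0].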