Fix Dockerfile build issue
This commit is contained in:
2
autorag/nodes/passageaugmenter/__init__.py
Normal file
2
autorag/nodes/passageaugmenter/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
from .pass_passage_augmenter import PassPassageAugmenter
|
||||
from .prev_next_augmenter import PrevNextPassageAugmenter
|
||||
80
autorag/nodes/passageaugmenter/base.py
Normal file
80
autorag/nodes/passageaugmenter/base.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import abc
|
||||
import logging
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.schema import BaseModule
|
||||
from autorag.utils import (
|
||||
validate_qa_dataset,
|
||||
sort_by_scores,
|
||||
validate_corpus_dataset,
|
||||
cast_corpus_dataset,
|
||||
)
|
||||
from autorag.utils.util import select_top_k
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class BasePassageAugmenter(BaseModule, metaclass=abc.ABCMeta):
	"""
	Base class for all passage augmenter modules.

	On initialization it loads and validates the project corpus
	(``<project_dir>/data/corpus.parquet``) and keeps it as ``self.corpus_df``
	so subclasses can look up neighboring or related passages.
	"""

	def __init__(self, project_dir: str, *args, **kwargs):
		"""
		Load and validate the corpus parquet file from the project data directory.

		:param project_dir: The AutoRAG project directory that contains
			``data/corpus.parquet``.
		"""
		logger.info(
			f"Initialize passage augmenter node - {self.__class__.__name__} module..."
		)
		data_dir = os.path.join(project_dir, "data")
		corpus_df = pd.read_parquet(
			os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
		)
		validate_corpus_dataset(corpus_df)
		corpus_df = cast_corpus_dataset(corpus_df)
		self.corpus_df = corpus_df

	def __del__(self):
		# Bug fix: this previously logged the "Initialize ..." message again
		# (copy-paste from __init__), which was misleading on teardown.
		logger.info(
			f"Deleting passage augmenter node - {self.__class__.__name__} module..."
		)

	def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
		"""
		Validate the previous result and extract the retrieved ids.

		:param previous_result: The previous result DataFrame. It must be a valid
			qa dataset and contain a ``retrieved_ids`` column.
		:return: The list of lists of retrieved ids.
		"""
		logger.info(
			f"Running passage augmenter node - {self.__class__.__name__} module..."
		)
		validate_qa_dataset(previous_result)

		# find ids columns
		assert (
			"retrieved_ids" in previous_result.columns
		), "previous_result must have retrieved_ids column."
		ids = previous_result["retrieved_ids"].tolist()

		return ids

	@staticmethod
	def sort_by_scores(
		augmented_contents,
		augmented_ids,
		augmented_scores,
		top_k: int,
		reverse: bool = True,
	):
		"""
		Sort each row of augmented results by score and keep the top_k entries.

		NOTE: inside this method body, ``sort_by_scores`` resolves to the
		module-level util function imported at the top of the file (class scope
		is not searched from a method body), so the shared name does not recurse.

		:param augmented_contents: The list of lists of augmented contents.
		:param augmented_ids: The list of lists of augmented ids.
		:param augmented_scores: The list of lists of augmented scores.
		:param top_k: The number of results to keep per row.
		:param reverse: If True (default), sort scores in descending order.
		:return: A tuple of (contents, ids, scores) lists, each row truncated
			to top_k elements.
		"""
		# sort by scores
		df = pd.DataFrame(
			{
				"contents": augmented_contents,
				"ids": augmented_ids,
				"scores": augmented_scores,
			}
		)
		df[["contents", "ids", "scores"]] = df.apply(
			lambda row: sort_by_scores(row, reverse=reverse),
			axis=1,
			result_type="expand",
		)

		# select by top_k
		results = select_top_k(df, ["contents", "ids", "scores"], top_k)

		return (
			results["contents"].tolist(),
			results["ids"].tolist(),
			results["scores"].tolist(),
		)
|
||||
43
autorag/nodes/passageaugmenter/pass_passage_augmenter.py
Normal file
43
autorag/nodes/passageaugmenter/pass_passage_augmenter.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
|
||||
from autorag.utils import result_to_dataframe
|
||||
|
||||
|
||||
class PassPassageAugmenter(BasePassageAugmenter):
	"""Identity passage augmenter: passes retrieval results through unchanged."""

	@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
	def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
		"""
		Run the passage augmenter node - PassPassageAugmenter module.

		:param previous_result: The previous result Dataframe.
		:param top_k: You must input the top_k value to get the top k results.
		:param kwargs: Not affected.
		:return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
		"""
		top_k = kwargs.pop("top_k")

		retrieved_ids = self.cast_to_run(previous_result)
		retrieved_contents = previous_result["retrieved_contents"].tolist()
		retrieved_scores = previous_result["retrieve_scores"].tolist()

		new_ids, new_contents, new_scores = self._pure(
			retrieved_ids, retrieved_contents, retrieved_scores
		)
		# Still sorted and truncated so the output shape matches other augmenters.
		return self.sort_by_scores(new_contents, new_ids, new_scores, top_k)

	def _pure(
		self,
		ids_list: List[List[str]],
		contents_list: List[List[str]],
		scores_list: List[List[float]],
	):
		"""
		Do not perform augmentation.
		Return given passages, scores, and ids as is.
		"""
		return ids_list, contents_list, scores_list
|
||||
155
autorag/nodes/passageaugmenter/prev_next_augmenter.py
Normal file
155
autorag/nodes/passageaugmenter/prev_next_augmenter.py
Normal file
@@ -0,0 +1,155 @@
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from autorag.embedding.base import EmbeddingModel
|
||||
from autorag.evaluation.metric.util import calculate_cosine_similarity
|
||||
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
|
||||
from autorag.utils.util import (
|
||||
filter_dict_keys,
|
||||
fetch_contents,
|
||||
embedding_query_content,
|
||||
result_to_dataframe,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
class PrevNextPassageAugmenter(BasePassageAugmenter):
	"""
	Passage augmenter that expands each retrieved passage with its neighbors,
	following the ``prev_id``/``next_id`` links stored in the corpus metadata,
	then re-scores the augmented passages by query-content cosine similarity.
	"""

	def __init__(
		self,
		project_dir: str,
		embedding_model: Union[str, dict] = "openai",
		*args,
		**kwargs,
	):
		"""
		Initialize the PrevNextPassageAugmenter module.

		:param project_dir: The AutoRAG project directory.
		:param embedding_model: The embedding model name to use for calculating cosine similarity
			Default is openai (text-embedding-ada-002)
		:param kwargs: Not affected.
		"""
		super().__init__(project_dir, *args, **kwargs)
		# Keep only the prev_id/next_id links; the rest of the metadata is not
		# needed for neighbor lookup.
		slim_corpus_df = self.corpus_df[["doc_id", "metadata"]]
		slim_corpus_df.loc[:, "metadata"] = slim_corpus_df["metadata"].apply(
			filter_dict_keys, keys=["prev_id", "next_id"]
		)
		self.slim_corpus_df = slim_corpus_df

		# init embedding model
		self.embedding_model = EmbeddingModel.load(embedding_model)()

	def __del__(self):
		# Release the embedding model (and any GPU memory it holds) before the
		# base-class teardown.
		del self.embedding_model
		empty_cuda_cache()
		super().__del__()

	@result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
	def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
		"""
		Run the passage augmenter node - PrevNextPassageAugmenter module.

		:param previous_result: The previous result Dataframe.
		:param top_k: You must input the top_k value to get the top k results.
		:param kwargs: Not affected.
		:return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
		"""
		top_k = kwargs.pop("top_k")

		ids = self.cast_to_run(previous_result)
		# find queries columns
		assert (
			"query" in previous_result.columns
		), "previous_result must have query column."
		queries = previous_result["query"].tolist()

		mode = kwargs.pop("mode", "both")
		num_passages = kwargs.pop("num_passages", 1)
		augmented_ids = self._pure(ids, num_passages, mode)

		# fetch contents from corpus to use augmented ids
		augmented_contents = fetch_contents(self.corpus_df, augmented_ids)

		query_embeddings, contents_embeddings = embedding_query_content(
			queries, augmented_contents, self.embedding_model, batch=128
		)

		# get scores from calculated cosine similarity
		augmented_scores = [
			np.array(
				[
					calculate_cosine_similarity(query_embedding, x)
					for x in content_embeddings
				]
			).tolist()
			for query_embedding, content_embeddings in zip(
				query_embeddings, contents_embeddings
			)
		]
		return self.sort_by_scores(
			augmented_contents, augmented_ids, augmented_scores, top_k
		)

	def _pure(
		self,
		ids_list: List[List[str]],
		num_passages: int = 1,
		mode: str = "both",
	) -> List[List[str]]:
		"""
		Add passages before and/or after the retrieved passage.
		For more information, visit https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PrevNextPostprocessorDemo/.

		:param ids_list: The list of lists of ids retrieved
		:param num_passages: The number of passages to add before and after the retrieved passage
			Default is 1.
		:param mode: The mode of augmentation
			'prev': add passages before the retrieved passage
			'next': add passages after the retrieved passage
			'both': add passages before and after the retrieved passage
			Default is 'both'.
		:return: The list of lists of augmented ids
		"""
		if mode not in ["prev", "next", "both"]:
			raise ValueError(f"mode must be 'prev', 'next', or 'both', but got {mode}")

		# Previously each call was wrapped in an immediately-invoked lambda;
		# a direct call is equivalent and clearer.
		augmented_ids = [
			prev_next_augmenter_pure(ids, self.slim_corpus_df, mode, num_passages)
			for ids in ids_list
		]

		return augmented_ids
|
||||
|
||||
|
||||
def prev_next_augmenter_pure(
	ids: List[str], corpus_df: pd.DataFrame, mode: str, num_passages: int
):
	"""
	Expand each retrieved id with its neighboring passage ids.

	Walks the ``prev_id``/``next_id`` links stored in the corpus metadata,
	collecting up to ``num_passages`` neighbors on each requested side.

	:param ids: The retrieved ids for one query.
	:param corpus_df: Corpus DataFrame with ``doc_id`` and ``metadata`` columns.
	:param mode: One of 'prev', 'next', or 'both'.
	:param num_passages: Maximum number of neighbors to collect per direction.
	:return: A flat list of ids, each original id surrounded by its neighbors.
	"""

	def walk_links(origin_id, link_key):
		# Follow the metadata link chain for at most num_passages hops,
		# stopping early when a link is absent (None).
		collected = []
		node = origin_id
		for _ in range(num_passages):
			row_metadata = (
				corpus_df.loc[corpus_df["doc_id"] == node]["metadata"].values[0]
			)
			node = row_metadata.get(link_key)
			if node is None:
				break
			collected.append(node)
		return collected

	augmented_group = []
	for doc_id in ids:
		chunk = [doc_id]
		if mode in ("prev", "both"):
			# Reverse so the farthest predecessor comes first, preserving
			# document order.
			chunk = walk_links(doc_id, "prev_id")[::-1] + chunk
		if mode in ("next", "both"):
			chunk = chunk + walk_links(doc_id, "next_id")
		augmented_group.extend(chunk)
	return augmented_group
|
||||
131
autorag/nodes/passageaugmenter/run.py
Normal file
131
autorag/nodes/passageaugmenter/run.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
from typing import List, Dict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.nodes.retrieval.run import evaluate_retrieval_node
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.strategy import measure_speed, filter_by_threshold, select_best
|
||||
from autorag.utils.util import apply_recursive, to_list
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
def run_passage_augmenter_node(
	modules: List,
	module_params: List[Dict],
	previous_result: pd.DataFrame,
	node_line_dir: str,
	strategies: Dict,
) -> pd.DataFrame:
	"""
	Run every passage augmenter module, evaluate the results, and select the best.

	Saves each module's result and a summary CSV under
	``<node_line_dir>/passage_augmenter``, marks the winner with ``is_best``,
	and returns the previous result with the best module's columns attached.

	:param modules: The passage augmenter module classes to run.
	:param module_params: Parameter dicts, one per module, in the same order.
	:param previous_result: The previous result DataFrame. It must contain the
		query, generation_gt, and retrieval result columns.
	:param node_line_dir: The directory of the node line in which this node runs.
	:param strategies: The strategy dict. It must contain at least one metric
		under the "metrics" key; "speed_threshold" and "strategy" are optional.
	:return: The best result DataFrame.
	:raises ValueError: If strategies has no "metrics" entry.
	"""
	if not os.path.exists(node_line_dir):
		os.makedirs(node_line_dir)
	# node_line_dir layout is <project_dir>/<trial>/<node_line>.
	project_dir = pathlib.PurePath(node_line_dir).parent.parent
	qa_df = pd.read_parquet(
		os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
	)
	retrieval_gt = qa_df["retrieval_gt"].tolist()
	retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))

	# Run every module while measuring its wall-clock time.
	results, execution_times = zip(
		*map(
			lambda task: measure_speed(
				task[0].run_evaluator,
				project_dir=project_dir,
				previous_result=previous_result,
				**task[1],
			),
			zip(modules, module_params),
		)
	)
	average_times = list(map(lambda x: x / len(results[0]), execution_times))
	metric_inputs = [
		MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
		for ret_gt, query, gen_gt in zip(
			retrieval_gt,
			previous_result["query"].tolist(),
			previous_result["generation_gt"].tolist(),
		)
	]

	# run metrics before filtering
	# Hoisted: strategies.get("metrics") was looked up repeatedly below.
	metrics = strategies.get("metrics")
	if metrics is None:
		# Fixed grammar (was "You must at least one metrics ...").
		raise ValueError(
			"You must input at least one metric for passage_augmenter evaluation."
		)
	results = list(
		map(
			lambda x: evaluate_retrieval_node(
				x,
				metric_inputs,
				metrics,
			),
			results,
		)
	)

	# save results to folder
	save_dir = os.path.join(node_line_dir, "passage_augmenter")  # node name
	if not os.path.exists(save_dir):
		os.makedirs(save_dir)
	filepaths = list(
		map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
	)
	list(
		map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
	)  # execute save to parquet
	filenames = list(map(lambda x: os.path.basename(x), filepaths))

	summary_df = pd.DataFrame(
		{
			"filename": filenames,
			"module_name": list(map(lambda module: module.__name__, modules)),
			"module_params": module_params,
			"execution_time": average_times,
			**{
				f"passage_augmenter_{metric}": list(
					map(lambda result: result[metric].mean(), results)
				)
				for metric in metrics
			},
		}
	)

	# filter by strategies
	if strategies.get("speed_threshold") is not None:
		results, filenames = filter_by_threshold(
			results, average_times, strategies["speed_threshold"], filenames
		)
	selected_result, selected_filename = select_best(
		results,
		metrics,
		filenames,
		strategies.get("strategy", "mean"),
	)
	# change metric name columns to passage_augmenter_metric_name
	selected_result = selected_result.rename(
		columns={
			metric_name: f"passage_augmenter_{metric_name}"
			for metric_name in metrics
		}
	)
	# drop retrieval result columns in previous_result
	previous_result = previous_result.drop(
		columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
	)
	best_result = pd.concat([previous_result, selected_result], axis=1)

	# add 'is_best' column to summary file
	summary_df["is_best"] = summary_df["filename"] == selected_filename

	# save files
	summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
	best_result.to_parquet(
		os.path.join(
			save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
		),
		index=False,
	)
	return best_result
|
||||
Reference in New Issue
Block a user