Fix Dockerfile build issue

kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

autorag/nodes/passageaugmenter/__init__.py

@@ -0,0 +1,2 @@
from .pass_passage_augmenter import PassPassageAugmenter
from .prev_next_augmenter import PrevNextPassageAugmenter

autorag/nodes/passageaugmenter/base.py

@@ -0,0 +1,80 @@
import abc
import logging
import os

import pandas as pd

from autorag.schema import BaseModule
from autorag.utils import (
    validate_qa_dataset,
    sort_by_scores,
    validate_corpus_dataset,
    cast_corpus_dataset,
)
from autorag.utils.util import select_top_k

logger = logging.getLogger("AutoRAG")


class BasePassageAugmenter(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, *args, **kwargs):
        logger.info(
            f"Initialize passage augmenter node - {self.__class__.__name__} module..."
        )
        data_dir = os.path.join(project_dir, "data")
        corpus_df = pd.read_parquet(
            os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
        )
        validate_corpus_dataset(corpus_df)
        corpus_df = cast_corpus_dataset(corpus_df)
        self.corpus_df = corpus_df

    def __del__(self):
        logger.info(
            f"Deleting passage augmenter node - {self.__class__.__name__} module..."
        )

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(
            f"Running passage augmenter node - {self.__class__.__name__} module..."
        )
        validate_qa_dataset(previous_result)

        # find the ids column
        assert (
            "retrieved_ids" in previous_result.columns
        ), "previous_result must have retrieved_ids column."
        ids = previous_result["retrieved_ids"].tolist()
        return ids

    @staticmethod
    def sort_by_scores(
        augmented_contents,
        augmented_ids,
        augmented_scores,
        top_k: int,
        reverse: bool = True,
    ):
        # sort each row's (contents, ids, scores) triple by score
        df = pd.DataFrame(
            {
                "contents": augmented_contents,
                "ids": augmented_ids,
                "scores": augmented_scores,
            }
        )
        df[["contents", "ids", "scores"]] = df.apply(
            lambda row: sort_by_scores(row, reverse=reverse),
            axis=1,
            result_type="expand",
        )

        # select the top_k passages per row
        results = select_top_k(df, ["contents", "ids", "scores"], top_k)
        return (
            results["contents"].tolist(),
            results["ids"].tolist(),
            results["scores"].tolist(),
        )
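
Every augmenter funnels its output through the sort_by_scores method above, which in turn relies on the sort_by_scores and select_top_k helpers from autorag.utils. As rough orientation, here is a minimal self-contained sketch of the same sort-then-truncate step using plain Python lists; the exact helper semantics are assumed from their usage here, not taken from the autorag source:

def sort_and_select_top_k(contents, ids, scores, top_k, reverse=True):
    sorted_contents, sorted_ids, sorted_scores = [], [], []
    for c_row, i_row, s_row in zip(contents, ids, scores):
        # sort one row's (content, id, score) triples by score
        triples = sorted(zip(c_row, i_row, s_row), key=lambda t: t[2], reverse=reverse)
        top = triples[:top_k]  # keep only the top_k passages
        sorted_contents.append([t[0] for t in top])
        sorted_ids.append([t[1] for t in top])
        sorted_scores.append([t[2] for t in top])
    return sorted_contents, sorted_ids, sorted_scores


contents, ids, scores = sort_and_select_top_k(
    contents=[["a", "b", "c"]],
    ids=[["id_a", "id_b", "id_c"]],
    scores=[[0.2, 0.9, 0.5]],
    top_k=2,
)
print(ids)  # [['id_b', 'id_c']]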

autorag/nodes/passageaugmenter/pass_passage_augmenter.py

@@ -0,0 +1,43 @@
from typing import List

import pandas as pd

from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils import result_to_dataframe


class PassPassageAugmenter(BasePassageAugmenter):
    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        """
        Run the passage augmenter node - PassPassageAugmenter module.

        :param previous_result: The previous result Dataframe.
        :param top_k: You must input the top_k value to get the top k results.
        :param kwargs: Not affected.
        :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
        """
        top_k = kwargs.pop("top_k")
        ids = self.cast_to_run(previous_result)
        contents = previous_result["retrieved_contents"].tolist()
        scores = previous_result["retrieve_scores"].tolist()

        augmented_ids, augmented_contents, augmented_scores = self._pure(
            ids, contents, scores
        )
        return self.sort_by_scores(
            augmented_contents, augmented_ids, augmented_scores, top_k
        )

    def _pure(
        self,
        ids_list: List[List[str]],
        contents_list: List[List[str]],
        scores_list: List[List[float]],
    ):
        """
        Do not perform augmentation.
        Return the given ids, contents, and scores as-is.
        """
        return ids_list, contents_list, scores_list
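
PassPassageAugmenter is the "do nothing" option, so the node can be skipped during optimization while keeping a uniform result shape. The result_to_dataframe decorator's semantics are assumed here from its usage rather than from the autorag source; it appears to map the three returned lists onto the named columns, one row per query, roughly:

import pandas as pd

# Hedged sketch: assuming the decorator simply zips the three returned lists
# into the named columns, one row per query.
contents, ids, scores = [["passage text"]], [["doc-1"]], [[0.87]]
df = pd.DataFrame(
    {
        "retrieved_contents": contents,
        "retrieved_ids": ids,
        "retrieve_scores": scores,
    }
)
print(df.columns.tolist())
# ['retrieved_contents', 'retrieved_ids', 'retrieve_scores']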

autorag/nodes/passageaugmenter/prev_next_augmenter.py

@@ -0,0 +1,155 @@
from typing import List, Union

import numpy as np
import pandas as pd

from autorag.embedding.base import EmbeddingModel
from autorag.evaluation.metric.util import calculate_cosine_similarity
from autorag.nodes.passageaugmenter.base import BasePassageAugmenter
from autorag.utils.util import (
    filter_dict_keys,
    fetch_contents,
    embedding_query_content,
    result_to_dataframe,
    empty_cuda_cache,
)


class PrevNextPassageAugmenter(BasePassageAugmenter):
    def __init__(
        self,
        project_dir: str,
        embedding_model: Union[str, dict] = "openai",
        *args,
        **kwargs,
    ):
        """
        Initialize the PrevNextPassageAugmenter module.

        :param project_dir: The project directory path.
        :param embedding_model: The embedding model name to use for calculating cosine similarity.
            Default is "openai" (text-embedding-ada-002).
        :param kwargs: Not affected.
        """
        super().__init__(project_dir, *args, **kwargs)
        slim_corpus_df = self.corpus_df[["doc_id", "metadata"]]
        slim_corpus_df.loc[:, "metadata"] = slim_corpus_df["metadata"].apply(
            filter_dict_keys, keys=["prev_id", "next_id"]
        )
        self.slim_corpus_df = slim_corpus_df

        # init embedding model
        self.embedding_model = EmbeddingModel.load(embedding_model)()

    def __del__(self):
        del self.embedding_model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        """
        Run the passage augmenter node - PrevNextPassageAugmenter module.

        :param previous_result: The previous result Dataframe.
        :param top_k: You must input the top_k value to get the top k results.
        :param kwargs: Not affected.
        :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns
        """
        top_k = kwargs.pop("top_k")
        ids = self.cast_to_run(previous_result)

        # find the query column
        assert (
            "query" in previous_result.columns
        ), "previous_result must have query column."
        queries = previous_result["query"].tolist()

        mode = kwargs.pop("mode", "both")
        num_passages = kwargs.pop("num_passages", 1)
        augmented_ids = self._pure(ids, num_passages, mode)

        # fetch contents from the corpus using the augmented ids
        augmented_contents = fetch_contents(self.corpus_df, augmented_ids)

        query_embeddings, contents_embeddings = embedding_query_content(
            queries, augmented_contents, self.embedding_model, batch=128
        )

        # score each augmented passage by cosine similarity to its query
        augmented_scores = [
            np.array(
                [
                    calculate_cosine_similarity(query_embedding, x)
                    for x in content_embeddings
                ]
            ).tolist()
            for query_embedding, content_embeddings in zip(
                query_embeddings, contents_embeddings
            )
        ]
        return self.sort_by_scores(
            augmented_contents, augmented_ids, augmented_scores, top_k
        )

    def _pure(
        self,
        ids_list: List[List[str]],
        num_passages: int = 1,
        mode: str = "both",
    ) -> List[List[str]]:
        """
        Add passages before and/or after the retrieved passage.
        For more information, visit https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PrevNextPostprocessorDemo/.

        :param ids_list: The list of lists of retrieved ids
        :param num_passages: The number of passages to add before and after the retrieved passage.
            Default is 1.
        :param mode: The mode of augmentation
            'prev': add passages before the retrieved passage
            'next': add passages after the retrieved passage
            'both': add passages before and after the retrieved passage
            Default is 'both'.
        :return: The list of lists of augmented ids
        """
        if mode not in ["prev", "next", "both"]:
            raise ValueError(f"mode must be 'prev', 'next', or 'both', but got {mode}")
        augmented_ids = [
            prev_next_augmenter_pure(ids, self.slim_corpus_df, mode, num_passages)
            for ids in ids_list
        ]
        return augmented_ids


def prev_next_augmenter_pure(
    ids: List[str], corpus_df: pd.DataFrame, mode: str, num_passages: int
):
    def fetch_id_sequence(start_id, key):
        # walk the prev_id/next_id chain for up to num_passages hops
        sequence = []
        current_id = start_id
        for _ in range(num_passages):
            current_id = (
                corpus_df.loc[corpus_df["doc_id"] == current_id]["metadata"]
                .values[0]
                .get(key)
            )
            if current_id is None:
                break
            sequence.append(current_id)
        return sequence

    augmented_group = []
    for id_ in ids:
        current_ids = [id_]
        if mode in ["prev", "both"]:
            current_ids = fetch_id_sequence(id_, "prev_id")[::-1] + current_ids
        if mode in ["next", "both"]:
            current_ids += fetch_id_sequence(id_, "next_id")
        augmented_group.extend(current_ids)
    return augmented_group
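
To make the prev/next id walk concrete, here is a self-contained example that exercises prev_next_augmenter_pure (defined above) on a toy three-chunk corpus. The doc_id/metadata shape mirrors the slimmed corpus DataFrame built in __init__; the chunk ids are invented for illustration:

import pandas as pd

# Toy corpus: three consecutive chunks linked via prev_id/next_id metadata.
corpus_df = pd.DataFrame(
    {
        "doc_id": ["c1", "c2", "c3"],
        "metadata": [
            {"prev_id": None, "next_id": "c2"},
            {"prev_id": "c1", "next_id": "c3"},
            {"prev_id": "c2", "next_id": None},
        ],
    }
)

# Retrieval returned only the middle chunk; mode="both" with num_passages=1
# pulls in its immediate neighbors on both sides, in document order.
print(prev_next_augmenter_pure(["c2"], corpus_df, mode="both", num_passages=1))
# ['c1', 'c2', 'c3']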

autorag/nodes/passageaugmenter/run.py

@@ -0,0 +1,131 @@
import logging
import os
import pathlib
from typing import List, Dict

import pandas as pd

from autorag.nodes.retrieval.run import evaluate_retrieval_node
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.utils.util import apply_recursive, to_list

logger = logging.getLogger("AutoRAG")


def run_passage_augmenter_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent
    qa_df = pd.read_parquet(
        os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
    )
    retrieval_gt = qa_df["retrieval_gt"].tolist()
    retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))

    results, execution_times = zip(
        *map(
            lambda task: measure_speed(
                task[0].run_evaluator,
                project_dir=project_dir,
                previous_result=previous_result,
                **task[1],
            ),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    metric_inputs = [
        MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
        for ret_gt, query, gen_gt in zip(
            retrieval_gt,
            previous_result["query"].tolist(),
            previous_result["generation_gt"].tolist(),
        )
    ]

    # run metrics before filtering
    if strategies.get("metrics") is None:
        raise ValueError(
            "You must provide at least one metric for passage_augmenter evaluation."
        )
    results = list(
        map(
            lambda x: evaluate_retrieval_node(
                x,
                metric_inputs,
                strategies.get("metrics"),
            ),
            results,
        )
    )

    # save results to folder
    save_dir = os.path.join(node_line_dir, "passage_augmenter")  # node name
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    filepaths = list(
        map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(
        map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))
    )  # execute save to parquet
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
            **{
                f"passage_augmenter_{metric}": list(
                    map(lambda result: result[metric].mean(), results)
                )
                for metric in strategies.get("metrics")
            },
        }
    )

    # filter by strategies
    if strategies.get("speed_threshold") is not None:
        results, filenames = filter_by_threshold(
            results, average_times, strategies["speed_threshold"], filenames
        )
    selected_result, selected_filename = select_best(
        results,
        strategies.get("metrics"),
        filenames,
        strategies.get("strategy", "mean"),
    )

    # rename metric columns to passage_augmenter_{metric_name}
    selected_result = selected_result.rename(
        columns={
            metric_name: f"passage_augmenter_{metric_name}"
            for metric_name in strategies["metrics"]
        }
    )

    # drop the retrieval result columns from previous_result
    previous_result = previous_result.drop(
        columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"]
    )
    best_result = pd.concat([previous_result, selected_result], axis=1)

    # add 'is_best' column to the summary file
    summary_df["is_best"] = summary_df["filename"] == selected_filename

    # save files
    summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
    best_result.to_parquet(
        os.path.join(
            save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
        ),
        index=False,
    )
    return best_result
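
The closing strategy step first drops modules that are too slow, then picks the best survivor. filter_by_threshold and select_best come from autorag.strategy; the sketch below shows their assumed semantics on toy numbers, inferred from their names and call sites here rather than from the autorag source:

# Hedged sketch of the strategy step: three candidate module results,
# their mean metric values, and average per-row execution times.
candidates = ["0.parquet", "1.parquet", "2.parquet"]
mean_metric = {"0.parquet": 0.61, "1.parquet": 0.74, "2.parquet": 0.72}
avg_time = {"0.parquet": 0.8, "1.parquet": 2.5, "2.parquet": 0.9}
speed_threshold = 2.0

# filter_by_threshold (assumed): drop modules slower than the threshold
survivors = [c for c in candidates if avg_time[c] <= speed_threshold]

# select_best with strategy "mean" (assumed): highest mean metric wins
selected_filename = max(survivors, key=lambda c: mean_metric[c])
print(selected_filename)
# 2.parquet: 1.parquet scored higher but missed the speed threshold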