Fix Dockerfile build issue
autorag/evaluator.py  (normal file, 560 lines added)
@@ -0,0 +1,560 @@
import glob
import json
import logging
import os
import shutil
from datetime import datetime
from itertools import chain
from typing import List, Dict, Optional
from rich.progress import Progress, BarColumn, TimeElapsedColumn

import pandas as pd
import yaml

from autorag.node_line import run_node_line
from autorag.nodes.retrieval.base import get_bm25_pkl_name
from autorag.nodes.retrieval.bm25 import bm25_ingest
from autorag.nodes.retrieval.vectordb import (
    vectordb_ingest,
    filter_exist_ids,
    filter_exist_ids_from_retrieval_gt,
)
from autorag.schema import Node
from autorag.schema.node import (
    module_type_exists,
    extract_values_from_nodes,
    extract_values_from_nodes_strategy,
)
from autorag.utils import (
    cast_qa_dataset,
    cast_corpus_dataset,
    validate_qa_from_corpus_dataset,
)
from autorag.utils.util import (
    load_summary_file,
    explode,
    load_yaml_config,
    get_event_loop,
)
from autorag.vectordb import load_all_vectordb_from_yaml

logger = logging.getLogger("AutoRAG")

ascii_art = """
|
||||
_ _____ _____
|
||||
/\ | | | __ \ /\ / ____|
|
||||
/ \ _ _| |_ ___ | |__) | / \ | | __
|
||||
/ /\ \| | | | __/ _ \| _ / / /\ \| | |_ |
|
||||
/ ____ \ |_| | || (_) | | \ \ / ____ \ |__| |
|
||||
/_/ \_\__,_|\__\___/|_| \_\/_/ \_\_____|
|
||||
|
||||
"""
|
||||
|
||||
|
||||
class Evaluator:
    def __init__(
        self,
        qa_data_path: str,
        corpus_data_path: str,
        project_dir: Optional[str] = None,
    ):
        """
        Initialize an Evaluator object.

        :param qa_data_path: The path to the QA dataset.
            Must be a parquet file.
        :param corpus_data_path: The path to the corpus dataset.
            Must be a parquet file.
        :param project_dir: The path to the project directory.
            Default is the current directory.
        """
        # validate data paths
        if not os.path.exists(qa_data_path):
            raise ValueError(f"QA data path {qa_data_path} does not exist.")
        if not os.path.exists(corpus_data_path):
            raise ValueError(f"Corpus data path {corpus_data_path} does not exist.")
        if not qa_data_path.endswith(".parquet"):
            raise ValueError(f"QA data path {qa_data_path} is not a parquet file.")
        if not corpus_data_path.endswith(".parquet"):
            raise ValueError(
                f"Corpus data path {corpus_data_path} is not a parquet file."
            )
        self.qa_data_path = qa_data_path
        self.corpus_data_path = corpus_data_path
        self.qa_data = pd.read_parquet(qa_data_path, engine="pyarrow")
        self.corpus_data = pd.read_parquet(corpus_data_path, engine="pyarrow")
        self.qa_data = cast_qa_dataset(self.qa_data)
        self.corpus_data = cast_corpus_dataset(self.corpus_data)
        self.project_dir = project_dir if project_dir is not None else os.getcwd()
        if not os.path.exists(self.project_dir):
            os.makedirs(self.project_dir)

        validate_qa_from_corpus_dataset(self.qa_data, self.corpus_data)

        # copy dataset to the project directory
        if not os.path.exists(os.path.join(self.project_dir, "data")):
            os.makedirs(os.path.join(self.project_dir, "data"))
        qa_path_in_project = os.path.join(self.project_dir, "data", "qa.parquet")
        if not os.path.exists(qa_path_in_project):
            self.qa_data.to_parquet(qa_path_in_project, index=False)
        corpus_path_in_project = os.path.join(
            self.project_dir, "data", "corpus.parquet"
        )
        if not os.path.exists(corpus_path_in_project):
            self.corpus_data.to_parquet(corpus_path_in_project, index=False)

    def start_trial(
        self, yaml_path: str, skip_validation: bool = False, full_ingest: bool = True
    ):
        """
        Start an AutoRAG trial.
        A trial is one experiment to optimize the RAG pipeline.
        It consists of ingesting corpus data, running all nodes and modules, evaluating them, and finding the optimal modules.

        :param yaml_path: The config YAML path.
        :param skip_validation: If True, it skips the validation step.
            The validation step checks that the input config YAML file is well formatted
            and that there are no problems with the system settings.
            Default is False.
        :param full_ingest: If True, it checks whether the whole corpus data from corpus.parquet already exists in the Vector DB.
            If your corpus is huge and you don't want to check the whole vector DB, set it to False.
        :return: None
        """
        # Make Resources directory
        os.makedirs(os.path.join(self.project_dir, "resources"), exist_ok=True)

        if not skip_validation:
            logger.info(ascii_art)
            logger.info(
                "Start validating the input data and config YAML file first. "
                "If you want to skip this, pass the --skip_validation flag or "
                "set `skip_validation=True` in the start_trial function."
            )
            from autorag.validator import Validator  # resolve circular import

            validator = Validator(
                qa_data_path=self.qa_data_path, corpus_data_path=self.corpus_data_path
            )
            validator.validate(yaml_path)

        os.environ["PROJECT_DIR"] = self.project_dir

        trial_name = self.__get_new_trial_name()
        self.__make_trial_dir(trial_name)

        # copy YAML file to the trial directory
        shutil.copy(
            yaml_path, os.path.join(self.project_dir, trial_name, "config.yaml")
        )
        yaml_dict = load_yaml_config(yaml_path)
        vectordb = yaml_dict.get("vectordb", [])

        vectordb_config_path = os.path.join(
            self.project_dir, "resources", "vectordb.yaml"
        )
        with open(vectordb_config_path, "w") as f:
            yaml.safe_dump({"vectordb": vectordb}, f)

        node_lines = self._load_node_lines(yaml_path)
        self.__ingest_bm25_full(node_lines)

        with Progress(
            "[progress.description]{task.description}",
            BarColumn(),
            "[progress.percentage]{task.percentage:>3.0f}%",
            "[progress.bar]{task.completed}/{task.total}",
            TimeElapsedColumn(),
        ) as progress:
            # Ingest VectorDB corpus
            if any(
                list(
                    map(
                        lambda nodes: module_type_exists(nodes, "vectordb"),
                        node_lines.values(),
                    )
                )
            ):
                task_ingest = progress.add_task("[cyan]Ingesting VectorDB...", total=1)

                loop = get_event_loop()
                loop.run_until_complete(self.__ingest_vectordb(yaml_path, full_ingest))

                progress.update(task_ingest, completed=1)

            trial_summary_df = pd.DataFrame(
                columns=[
                    "node_line_name",
                    "node_type",
                    "best_module_filename",
                    "best_module_name",
                    "best_module_params",
                    "best_execution_time",
                ]
            )
            task_eval = progress.add_task(
                "[cyan]Evaluating...", total=sum(map(len, node_lines.values()))
            )

            for i, (node_line_name, node_line) in enumerate(node_lines.items()):
                node_line_dir = os.path.join(
                    self.project_dir, trial_name, node_line_name
                )
                os.makedirs(node_line_dir, exist_ok=False)
                if i == 0:
                    previous_result = self.qa_data
                logger.info(f"Running node line {node_line_name}...")
                previous_result = run_node_line(
                    node_line, node_line_dir, previous_result, progress, task_eval
                )

                trial_summary_df = self._append_node_line_summary(
                    node_line_name, node_line_dir, trial_summary_df
                )

        trial_summary_df.to_csv(
            os.path.join(self.project_dir, trial_name, "summary.csv"), index=False
        )

        logger.info("Evaluation complete.")

    def __ingest_bm25_full(self, node_lines: Dict[str, List[Node]]):
        if any(
            list(
                map(
                    lambda nodes: module_type_exists(nodes, "bm25"), node_lines.values()
                )
            )
        ):
            logger.info("Embedding BM25 corpus...")
            bm25_tokenizer_list = list(
                chain.from_iterable(
                    map(
                        lambda nodes: self._find_bm25_tokenizer(nodes),
                        node_lines.values(),
                    )
                )
            )

            if len(bm25_tokenizer_list) == 0:
                bm25_tokenizer_list = ["porter_stemmer"]
            for bm25_tokenizer in bm25_tokenizer_list:
                bm25_dir = os.path.join(
                    self.project_dir, "resources", get_bm25_pkl_name(bm25_tokenizer)
                )
                if not os.path.exists(os.path.dirname(bm25_dir)):
                    os.makedirs(os.path.dirname(bm25_dir))
                # ingest anyway because bm25 supports updating with new corpus data
                bm25_ingest(bm25_dir, self.corpus_data, bm25_tokenizer=bm25_tokenizer)
            logger.info("BM25 corpus embedding complete.")

    def __get_new_trial_name(self) -> str:
        trial_json_path = os.path.join(self.project_dir, "trial.json")
        if not os.path.exists(trial_json_path):
            return "0"
        with open(trial_json_path, "r") as f:
            trial_json = json.load(f)
        return str(int(trial_json[-1]["trial_name"]) + 1)

    def __make_trial_dir(self, trial_name: str):
        trial_json_path = os.path.join(self.project_dir, "trial.json")
        if os.path.exists(trial_json_path):
            with open(trial_json_path, "r") as f:
                trial_json = json.load(f)
        else:
            trial_json = []

        trial_json.append(
            {
                "trial_name": trial_name,
                "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
        )
        os.makedirs(os.path.join(self.project_dir, trial_name))
        with open(trial_json_path, "w") as f:
            json.dump(trial_json, f, indent=4)

    @staticmethod
    def _load_node_lines(yaml_path: str) -> Dict[str, List[Node]]:
        yaml_dict = load_yaml_config(yaml_path)
        node_lines = yaml_dict["node_lines"]
        node_line_dict = {}
        for node_line in node_lines:
            node_line_dict[node_line["node_line_name"]] = list(
                map(lambda node: Node.from_dict(node), node_line["nodes"])
            )
        return node_line_dict

    def restart_trial(self, trial_path: str):
        logger.info(ascii_art)
        os.environ["PROJECT_DIR"] = self.project_dir
        # Check if trial_path exists
        if not os.path.exists(trial_path):
            raise ValueError(f"Trial path {trial_path} does not exist.")
        # Check if trial is completed
        if os.path.exists(os.path.join(trial_path, "summary.csv")):
            raise ValueError(f"Trial path {trial_path} is already completed.")

        # Extract node lines from config.yaml
        yaml_path = os.path.join(trial_path, "config.yaml")
        node_lines = self._load_node_lines(yaml_path)

        node_line_names = list(node_lines.keys())
        nodes = list(node_lines.values())
        node_names = list(
            map(lambda node: list(map(lambda n: n.node_type, node)), nodes)
        )

        # If the first node line folder hasn't even been created, proceed to start_trial
        if not os.path.exists(os.path.join(trial_path, node_line_names[0])):
            self.start_trial(yaml_path)
            return None

        # Find conflict node line and node
        conflict_line_name, conflict_node_name = self.__find_conflict_point(
            trial_path, node_line_names, node_lines
        )
        node_dir = os.path.join(trial_path, conflict_line_name, conflict_node_name)
        if os.path.exists(node_dir):
            shutil.rmtree(node_dir)

        # Set remain_nodes and remain_lines
        remain_nodes, completed_node_names, remain_lines, remain_line_names = (
            self._set_remain_nodes_and_lines(
                node_line_names,
                nodes,
                node_names,
                conflict_node_name,
                conflict_line_name,
            )
        )
        # Set previous_result
        previous_result = self.__set_previous_result(
            node_line_names, node_names, trial_path, conflict_node_name
        )

        # Run Node
        if remain_nodes:
            conflict_line_dir = os.path.join(trial_path, conflict_line_name)
            summary_lst = []
            # Get already-run node summaries and append to summary_lst
            for completed_node_name in completed_node_names:
                summary_lst = self._append_node_summary(
                    conflict_line_dir, completed_node_name, summary_lst
                )
            for node in remain_nodes:
                previous_result = node.run(previous_result, conflict_line_dir)
                summary_lst = self._append_node_summary(
                    conflict_line_dir, node.node_type, summary_lst
                )
            pd.DataFrame(summary_lst).to_csv(
                os.path.join(conflict_line_dir, "summary.csv"), index=False
            )

        # Run node line
        trial_summary_df = pd.DataFrame(
            columns=[
                "node_line_name",
                "node_type",
                "best_module_filename",
                "best_module_name",
                "best_module_params",
                "best_execution_time",
            ]
        )
        completed_line_names = node_line_names[
            : node_line_names.index(conflict_line_name)
        ]
        # Get already-run node lines' summaries and append to trial_summary_df
        if completed_line_names:
            for line_name in completed_line_names:
                node_line_dir = os.path.join(trial_path, line_name)
                trial_summary_df = self._append_node_line_summary(
                    line_name, node_line_dir, trial_summary_df
                )
        if remain_lines:
            for node_line_name, node_line in zip(remain_line_names, remain_lines):
                node_line_dir = os.path.join(trial_path, node_line_name)
                if not os.path.exists(node_line_dir):
                    os.makedirs(node_line_dir)
                logger.info(f"Running node line {node_line_name}...")
                previous_result = run_node_line(
                    node_line, node_line_dir, previous_result
                )
                trial_summary_df = self._append_node_line_summary(
                    node_line_name, node_line_dir, trial_summary_df
                )
        trial_summary_df.to_csv(os.path.join(trial_path, "summary.csv"), index=False)

        logger.info("Evaluation complete.")

    def __find_conflict_point(
        self,
        trial_path: str,
        node_line_names: List[str],
        node_lines: Dict[str, List[Node]],
    ) -> tuple[str, str]:
        for node_line_name in node_line_names:
            node_line_dir = os.path.join(trial_path, node_line_name)
            if not os.path.exists(node_line_dir):
                return node_line_name, node_lines[node_line_name][0].node_type

            if not os.path.exists(os.path.join(node_line_dir, "summary.csv")):
                conflict_node_name = self._find_conflict_node_name(
                    node_line_dir, node_lines[node_line_name]
                )
                return node_line_name, conflict_node_name

        raise ValueError(f"No error node line found in {trial_path}.")

    @staticmethod
    def _find_conflict_node_name(node_line_dir: str, node_line: List[Node]) -> str:
        for node in node_line:
            node_dir = os.path.join(node_line_dir, node.node_type)
            if not os.path.exists(node_dir) or not os.path.exists(
                os.path.join(node_dir, "summary.csv")
            ):
                return node.node_type
        raise TypeError("No conflict node name found.")

    def __set_previous_result(
        self,
        node_line_names: List[str],
        node_names: List[List[str]],
        trial_path: str,
        conflict_node_name: str,
    ):
        exploded_node_line, exploded_node = explode(node_line_names, node_names)
        conflict_node_index = exploded_node.index(conflict_node_name)
        # Set previous_result
        if conflict_node_index == 0:
            previous_result = self.qa_data
        else:
            previous_node_line = exploded_node_line[conflict_node_index - 1]
            previous_node = exploded_node[conflict_node_index - 1]

            previous_node_dir = os.path.join(
                trial_path, previous_node_line, previous_node
            )
            best_file_pattern = f"{previous_node_dir}/best_*.parquet"
            previous_result = pd.read_parquet(
                glob.glob(best_file_pattern)[0], engine="pyarrow"
            )
        return previous_result

    @staticmethod
    def _set_remain_nodes_and_lines(
        node_line_names: List[str],
        nodes: List[List[Node]],
        node_names: List[List[str]],
        conflict_node_name: str,
        conflict_node_line_name: str,
    ):
        conflict_node_line_index = node_line_names.index(conflict_node_line_name)
        full_conflict_node_line_nodes = nodes[conflict_node_line_index]
        full_conflict_node_line_node_names = node_names[conflict_node_line_index]

        if conflict_node_name == full_conflict_node_line_node_names[0]:
            remain_nodes = None
            completed_node_names = None
            remain_node_lines = nodes[conflict_node_line_index:]
            remain_node_line_names = node_line_names[conflict_node_line_index:]
        else:
            conflict_node_index = full_conflict_node_line_node_names.index(
                conflict_node_name
            )
            remain_nodes = full_conflict_node_line_nodes[conflict_node_index:]
            completed_node_names = full_conflict_node_line_node_names[
                :conflict_node_index
            ]
            if conflict_node_line_index + 1 >= len(node_line_names):
                remain_node_lines = None
                remain_node_line_names = None
            else:
                remain_node_lines = nodes[conflict_node_line_index + 1 :]
                remain_node_line_names = node_line_names[conflict_node_line_index + 1 :]
        return (
            remain_nodes,
            completed_node_names,
            remain_node_lines,
            remain_node_line_names,
        )

    @staticmethod
    def _append_node_line_summary(
        node_line_name: str, node_line_dir: str, trial_summary_df: pd.DataFrame
    ):
        summary_df = load_summary_file(
            os.path.join(node_line_dir, "summary.csv"),
            dict_columns=["best_module_params"],
        )
        summary_df = summary_df.assign(node_line_name=node_line_name)
        summary_df = summary_df[list(trial_summary_df.columns)]
        if len(trial_summary_df) <= 0:
            trial_summary_df = summary_df
        else:
            trial_summary_df = pd.concat(
                [trial_summary_df, summary_df], ignore_index=True
            )
        return trial_summary_df

    @staticmethod
    def _append_node_summary(
        node_line_dir: str, node_name: str, summary_lst: List[Dict]
    ):
        node_summary_df = load_summary_file(
            os.path.join(node_line_dir, node_name, "summary.csv")
        )
        best_node_row = node_summary_df.loc[node_summary_df["is_best"]]
        summary_lst.append(
            {
                "node_type": node_name,
                "best_module_filename": best_node_row["filename"].values[0],
                "best_module_name": best_node_row["module_name"].values[0],
                "best_module_params": best_node_row["module_params"].values[0],
                "best_execution_time": best_node_row["execution_time"].values[0],
            }
        )
        return summary_lst

    @staticmethod
    def _find_bm25_tokenizer(nodes: List[Node]):
        bm25_tokenizer_list = extract_values_from_nodes(nodes, "bm25_tokenizer")
        strategy_tokenizer_list = list(
            chain.from_iterable(
                extract_values_from_nodes_strategy(nodes, "bm25_tokenizer")
            )
        )
        return list(set(bm25_tokenizer_list + strategy_tokenizer_list))

    @staticmethod
    def _find_embedding_model(nodes: List[Node]):
        embedding_models_list = extract_values_from_nodes(nodes, "embedding_model")
        retrieval_module_dicts = extract_values_from_nodes_strategy(
            nodes, "retrieval_modules"
        )
        for retrieval_modules in retrieval_module_dicts:
            vectordb_modules = list(
                filter(lambda x: x["module_type"] == "vectordb", retrieval_modules)
            )
            embedding_models_list.extend(
                list(map(lambda x: x.get("embedding_model", None), vectordb_modules))
            )
        embedding_models_list = list(
            filter(lambda x: x is not None, embedding_models_list)
        )
        return list(set(embedding_models_list))

    async def __ingest_vectordb(self, yaml_path, full_ingest: bool):
        vectordb_list = load_all_vectordb_from_yaml(yaml_path, self.project_dir)
        if full_ingest is True:
            # get the target ingest corpus from the whole corpus
            for vectordb in vectordb_list:
                target_corpus = await filter_exist_ids(vectordb, self.corpus_data)
                await vectordb_ingest(vectordb, target_corpus)
        else:
            # get the target ingest corpus from the retrieval gt only
            for vectordb in vectordb_list:
                target_corpus = await filter_exist_ids_from_retrieval_gt(
                    vectordb, self.qa_data, self.corpus_data
                )
                await vectordb_ingest(vectordb, target_corpus)
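
For context, here is a minimal usage sketch of the Evaluator API added in this file. The dataset paths, project directory, and config file name below are illustrative placeholders, not part of this commit.

from autorag.evaluator import Evaluator

# Placeholder paths for illustration; both datasets must be parquet files.
evaluator = Evaluator(
    qa_data_path="./data/qa.parquet",
    corpus_data_path="./data/corpus.parquet",
    project_dir="./my_project",  # created if it does not exist
)

# Run one optimization trial from a config YAML.
# Validation runs first unless skip_validation=True.
evaluator.start_trial("./config.yaml", skip_validation=False, full_ingest=True)

# A trial that stopped partway (no summary.csv yet) can be resumed
# from the node where it failed, e.g. the first trial directory "0":
# evaluator.restart_trial("./my_project/0")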