Fix Dockerfile build issue
4
autorag/nodes/retrieval/__init__.py
Normal file
@@ -0,0 +1,4 @@
from .bm25 import BM25
from .hybrid_cc import HybridCC
from .hybrid_rrf import HybridRRF
from .vectordb import VectorDB
127
autorag/nodes/retrieval/base.py
Normal file
@@ -0,0 +1,127 @@
import abc
import logging
import os
from typing import List, Union, Tuple

import pandas as pd

from autorag.schema import BaseModule
from autorag.support import get_support_modules
from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset
from autorag.utils.util import pop_params

logger = logging.getLogger("AutoRAG")


class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta):
    def __init__(self, project_dir: str, *args, **kwargs):
        logger.info(f"Initialize retrieval node - {self.__class__.__name__}")

        self.resources_dir = os.path.join(project_dir, "resources")
        data_dir = os.path.join(project_dir, "data")
        # fetch data from corpus_data
        self.corpus_df = pd.read_parquet(
            os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
        )

    def __del__(self):
        logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...")

    def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs):
        logger.info(f"Running retrieval node - {self.__class__.__name__} module...")
        validate_qa_dataset(previous_result)
        # find queries columns & type cast queries
        assert (
            "query" in previous_result.columns
        ), "previous_result must have query column."
        if "queries" not in previous_result.columns:
            previous_result["queries"] = previous_result["query"]
        previous_result.loc[:, "queries"] = previous_result["queries"].apply(
            cast_queries
        )
        queries = previous_result["queries"].tolist()
        return queries


class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta):
    def __init__(
        self, project_dir: str, target_modules, target_module_params, *args, **kwargs
    ):
        super().__init__(project_dir)
        self.target_modules = list(
            map(
                lambda x, y: get_support_modules(x)(
                    **y,
                    project_dir=project_dir,
                ),
                target_modules,
                target_module_params,
            )
        )
        self.target_module_params = target_module_params

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        result_dfs: List[pd.DataFrame] = list(
            map(
                lambda x, y: x.pure(
                    **y,
                    previous_result=previous_result,
                ),
                self.target_modules,
                self.target_module_params,
            )
        )
        ids = tuple(
            map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs)
        )
        scores = tuple(
            map(
                lambda df: df["retrieve_scores"].apply(list).tolist(),
                result_dfs,
            )
        )

        _pure_params = pop_params(self._pure, kwargs)
        if "ids" in _pure_params or "scores" in _pure_params:
            raise ValueError(
                "With specifying ids or scores, you must use HybridRRF.run_evaluator instead."
            )
        ids, scores = self._pure(ids=ids, scores=scores, **_pure_params)
        contents = fetch_contents(self.corpus_df, ids)
        return contents, ids, scores


def cast_queries(queries: Union[str, List[str]]) -> List[str]:
    if isinstance(queries, str):
        return [queries]
    elif isinstance(queries, List):
        return queries
    else:
        raise ValueError(f"queries must be str or list, but got {type(queries)}")


def evenly_distribute_passages(
    ids: List[List[str]], scores: List[List[float]], top_k: int
) -> Tuple[List[str], List[float]]:
    assert len(ids) == len(scores), "ids and scores must have same length."
    query_cnt = len(ids)
    avg_len = top_k // query_cnt
    remainder = top_k % query_cnt

    new_ids = []
    new_scores = []
    for i in range(query_cnt):
        if i < remainder:
            new_ids.extend(ids[i][: avg_len + 1])
            new_scores.extend(scores[i][: avg_len + 1])
        else:
            new_ids.extend(ids[i][:avg_len])
            new_scores.extend(scores[i][:avg_len])

    return new_ids, new_scores


def get_bm25_pkl_name(bm25_tokenizer: str):
    bm25_tokenizer = bm25_tokenizer.replace("/", "")
    return f"bm25_{bm25_tokenizer}.pkl"
365
autorag/nodes/retrieval/bm25.py
Normal file
@@ -0,0 +1,365 @@
import asyncio
import os
import pickle
import re
from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional

import numpy as np
import pandas as pd
from llama_index.core.indices.keyword_table.utils import simple_extract_keywords
from nltk import PorterStemmer
from rank_bm25 import BM25Okapi
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from autorag.nodes.retrieval.base import (
    evenly_distribute_passages,
    BaseRetrieval,
    get_bm25_pkl_name,
)
from autorag.utils import validate_corpus_dataset, fetch_contents
from autorag.utils.util import (
    get_event_loop,
    normalize_string,
    result_to_dataframe,
    pop_params,
)


def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]:
    try:
        from kiwipiepy import Kiwi, Token
    except ImportError:
        raise ImportError(
            "You need to install kiwipiepy to use 'ko_kiwi' tokenizer. "
            "Please install kiwipiepy by running 'pip install kiwipiepy'. "
            "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
        )
    texts = list(map(lambda x: x.strip().lower(), texts))
    kiwi = Kiwi()
    tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts)
    return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list]


def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]:
    try:
        from konlpy.tag import Kkma
    except ImportError:
        raise ImportError(
            "You need to install konlpy to use 'ko_kkma' tokenizer. "
            "Please install konlpy by running 'pip install konlpy'. "
            "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
        )
    tokenizer = Kkma()
    tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
    return tokenized_list
def tokenize_ko_okt(texts: List[str]) -> List[List[str]]:
    try:
        from konlpy.tag import Okt
    except ImportError:
        raise ImportError(
            "You need to install konlpy to use 'ko_okt' tokenizer. "
            "Please install konlpy by running 'pip install konlpy'. "
            "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'."
        )
    tokenizer = Okt()
    tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts))
    return tokenized_list
def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]:
    def tokenize_remove_stopword(text: str, stemmer) -> List[str]:
        text = text.lower()
        words = list(simple_extract_keywords(text))
        return [stemmer.stem(word) for word in words]

    stemmer = PorterStemmer()
    tokenized_list: List[List[str]] = list(
        map(lambda x: tokenize_remove_stopword(x, stemmer), texts)
    )
    return tokenized_list


def tokenize_space(texts: List[str]) -> List[List[str]]:
    def tokenize_space_text(text: str) -> List[str]:
        text = normalize_string(text)
        return re.split(r"\s+", text.strip())

    return list(map(tokenize_space_text, texts))


def load_bm25_corpus(bm25_path: str) -> Dict:
    if bm25_path is None:
        return {}
    with open(bm25_path, "rb") as f:
        bm25_corpus = pickle.load(f)
    return bm25_corpus


def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]:
    try:
        from sudachipy import dictionary, tokenizer
    except ImportError:
        raise ImportError(
            "You need to install SudachiPy to use 'sudachipy' tokenizer. "
            "Please install SudachiPy by running 'pip install sudachipy'."
        )

    # Initialize SudachiPy with the default tokenizer
    tokenizer_obj = dictionary.Dictionary(dict="core").create()

    # Choose the tokenizer mode: NORMAL, SEARCH, A
    mode = tokenizer.Tokenizer.SplitMode.A

    # Tokenize the input texts
    tokenized_list = []
    for text in texts:
        tokens = tokenizer_obj.tokenize(text, mode)
        tokenized_list.append([token.surface() for token in tokens])

    return tokenized_list


BM25_TOKENIZER = {
    "porter_stemmer": tokenize_porter_stemmer,
    "ko_kiwi": tokenize_ko_kiwi,
    "space": tokenize_space,
    "ko_kkma": tokenize_ko_kkma,
    "ko_okt": tokenize_ko_okt,
    "sudachipy": tokenize_ja_sudachipy,
}


class BM25(BaseRetrieval):
    def __init__(self, project_dir: str, *args, **kwargs):
"""
|
||||
Initialize BM25 module.
|
||||
(Retrieval)
|
||||
|
||||
:param project_dir: The project directory path.
|
||||
:param bm25_tokenizer: The tokenizer name that is used to the BM25.
|
||||
It supports 'porter_stemmer', 'ko_kiwi', and huggingface `AutoTokenizer`.
|
||||
You can pass huggingface tokenizer name.
|
||||
Default is porter_stemmer.
|
||||
:param kwargs: The optional arguments.
|
||||
"""
|
||||
|
||||
super().__init__(project_dir)
|
||||
# check if bm25_path and file exist
|
||||
bm25_tokenizer = kwargs.get("bm25_tokenizer", None)
|
||||
if bm25_tokenizer is None:
|
||||
bm25_tokenizer = "porter_stemmer"
|
||||
bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer))
|
||||
|
||||
assert (
|
||||
bm25_path is not None
|
||||
), "bm25_path must be specified for using bm25 retrieval."
|
||||
assert os.path.exists(
|
||||
bm25_path
|
||||
), f"bm25_path {bm25_path} does not exist. Please ingest first."
|
||||
|
||||
self.bm25_corpus = load_bm25_corpus(bm25_path)
|
||||
        assert (
            "tokens" in self.bm25_corpus and "passage_id" in self.bm25_corpus
        ), "bm25_corpus must contain tokens and passage_id. Please check you ingested bm25 corpus correctly."
        self.tokenizer = select_bm25_tokenizer(bm25_tokenizer)
        assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, (
            f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. "
            f"You need to ingest again. Delete bm25 pkl file and re-ingest it."
        )
        self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"])

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries = self.cast_to_run(previous_result)
        pure_params = pop_params(self._pure, kwargs)
        ids, scores = self._pure(queries, *args, **pure_params)
        contents = fetch_contents(self.corpus_df, ids)
        return contents, ids, scores

    def _pure(
        self,
        queries: List[List[str]],
        top_k: int,
        ids: Optional[List[List[str]]] = None,
    ) -> Tuple[List[List[str]], List[List[float]]]:
        """
        BM25 retrieval function.
        You have to load a pickle file that is already ingested.

        :param queries: 2-d list of query strings.
            Each element of the list is a query strings of each row.
        :param top_k: The number of passages to be retrieved.
        :param ids: The optional list of ids that you want to retrieve.
            You don't need to specify this in the general use cases.
            Default is None.
        :return: The 2-d list contains a list of passage ids that retrieved from bm25 and 2-d list of its scores.
            It will be a length of queries. And each element has a length of top_k.
        """
        if ids is not None:
            score_result = list(
                map(
                    lambda query_list, id_list: get_bm25_scores(
                        query_list,
                        id_list,
                        self.tokenizer,
                        self.bm25_instance,
                        self.bm25_corpus,
                    ),
                    queries,
                    ids,
                )
            )
            return ids, score_result

        # run async bm25_pure function
        tasks = [
            bm25_pure(
                input_queries,
                top_k,
                self.tokenizer,
                self.bm25_instance,
                self.bm25_corpus,
            )
            for input_queries in queries
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(asyncio.gather(*tasks))
        id_result = list(map(lambda x: x[0], results))
        score_result = list(map(lambda x: x[1], results))
        return id_result, score_result


async def bm25_pure(
    queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict
) -> Tuple[List[str], List[float]]:
    """
    Async BM25 retrieval function.
    Its usage is for async retrieval of bm25 row by row.

    :param queries: A list of query strings.
    :param top_k: The number of passages to be retrieved.
    :param tokenizer: A tokenizer that will be used to tokenize queries.
    :param bm25_api: A bm25 api instance that will be used to retrieve passages.
    :param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus.
        Its data structure looks like this:

        .. Code:: python

            {
                "tokens": [], # 2d list of tokens
                "passage_id": [], # 2d list of passage_id. Type must be str.
            }
    :return: The tuple contains a list of passage ids that retrieved from bm25 and its scores.
    """
    # I don't make queries operation to async, because queries length might be small, so it will occur overhead.
    tokenized_queries = tokenize(queries, tokenizer)
    id_result = []
    score_result = []
    for query in tokenized_queries:
        scores = bm25_api.get_scores(query)
        sorted_scores = sorted(scores, reverse=True)
        top_n_index = np.argsort(scores)[::-1][:top_k]
        ids = [bm25_corpus["passage_id"][i] for i in top_n_index]
        id_result.append(ids)
        score_result.append(sorted_scores[:top_k])

    # make a total result to top_k
    id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
    # sort id_result and score_result by score
    result = [
        (_id, score)
        for score, _id in sorted(
            zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
        )
    ]
    id_result, score_result = zip(*result)
    return list(id_result), list(score_result)


def get_bm25_scores(
    queries: List[str],
    ids: List[str],
    tokenizer,
    bm25_api: BM25Okapi,
    bm25_corpus: Dict,
) -> List[float]:
    if len(ids) == 0 or not bool(ids):
        return []
    tokenized_queries = tokenize(queries, tokenizer)
    result_dict = {id_: [] for id_ in ids}
    for query in tokenized_queries:
        scores = bm25_api.get_scores(query)
        for i, id_ in enumerate(ids):
            result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)])
    result_df = pd.DataFrame(result_dict)
    return result_df.max(axis=0).tolist()


def tokenize(queries: List[str], tokenizer) -> List[List[int]]:
    if isinstance(tokenizer, PreTrainedTokenizerBase):
        tokenized_queries = tokenizer(queries).input_ids
    else:
        tokenized_queries = tokenizer(queries)
    return tokenized_queries


def bm25_ingest(
    corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer"
):
    if not corpus_path.endswith(".pkl"):
        raise ValueError(f"Corpus path {corpus_path} is not a pickle file.")
    validate_corpus_dataset(corpus_data)
    ids = corpus_data["doc_id"].tolist()

    # Initialize bm25_corpus
    bm25_corpus = pd.DataFrame()

    # Load the BM25 corpus if it exists and get the passage ids
    if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0:
        with open(corpus_path, "rb") as r:
            corpus = pickle.load(r)
            bm25_corpus = pd.DataFrame.from_dict(corpus)
        duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)]
        new_passage = corpus_data[
            ~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"])
        ]
    else:
        new_passage = corpus_data

    if not new_passage.empty:
        tokenizer = select_bm25_tokenizer(bm25_tokenizer)
        if isinstance(tokenizer, PreTrainedTokenizerBase):
            tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids
        else:
            tokenized_corpus = tokenizer(new_passage["contents"].tolist())
        new_bm25_corpus = pd.DataFrame(
            {
                "tokens": tokenized_corpus,
                "passage_id": new_passage["doc_id"].tolist(),
            }
        )

        if not bm25_corpus.empty:
            bm25_corpus_updated = pd.concat(
                [bm25_corpus, new_bm25_corpus], ignore_index=True
            )
            bm25_dict = bm25_corpus_updated.to_dict("list")
        else:
            bm25_dict = new_bm25_corpus.to_dict("list")

        # add tokenizer name to bm25_dict
        bm25_dict["tokenizer_name"] = bm25_tokenizer

        with open(corpus_path, "wb") as w:
            pickle.dump(bm25_dict, w)


def select_bm25_tokenizer(
    bm25_tokenizer: str,
) -> Callable[[str], List[Union[int, str]]]:
    if bm25_tokenizer in list(BM25_TOKENIZER.keys()):
        return BM25_TOKENIZER[bm25_tokenizer]

    return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False)
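As a quick sanity check (not part of the commit), tokenizer selection resolves the built-in names to plain functions and treats any other name as a HuggingFace tokenizer id; the queries and the exact token output below are illustrative:

# Illustrative only: default tokenizer path, no HuggingFace model involved.
tokenizer = select_bm25_tokenizer("porter_stemmer")
print(tokenize(["Retrieval augmented generation", "BM25 scoring"], tokenizer))
# roughly [['retriev', 'augment', 'gener'], ['bm25', 'score']] (keyword order may vary)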
214
autorag/nodes/retrieval/hybrid_cc.py
Normal file
@@ -0,0 +1,214 @@
import os
from pathlib import Path
from typing import Tuple, List, Union

import numpy as np
import pandas as pd

from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe
def normalize_mm(scores: List[float], fixed_min_value: float = 0):
    arr = np.array(scores)
    max_value = np.max(arr)
    min_value = np.min(arr)
    norm_score = (arr - min_value) / (max_value - min_value)
    return norm_score


def normalize_tmm(scores: List[float], fixed_min_value: float):
    arr = np.array(scores)
    max_value = np.max(arr)
    norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value)
    return norm_score


def normalize_z(scores: List[float], fixed_min_value: float = 0):
    arr = np.array(scores)
    mean_value = np.mean(arr)
    std_value = np.std(arr)
    norm_score = (arr - mean_value) / std_value
    return norm_score


def normalize_dbsf(scores: List[float], fixed_min_value: float = 0):
    arr = np.array(scores)
    mean_value = np.mean(arr)
    std_value = np.std(arr)
    min_value = mean_value - 3 * std_value
    max_value = mean_value + 3 * std_value
    norm_score = (arr - min_value) / (max_value - min_value)
    return norm_score
normalize_method_dict = {
    "mm": normalize_mm,
    "tmm": normalize_tmm,
    "z": normalize_z,
    "dbsf": normalize_dbsf,
}
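For intuition (not part of the commit), here is the difference between min-max and theoretical min-max scaling on made-up scores:

# Illustrative only.
print(normalize_mm([3.0, 2.0, 1.0]))        # roughly [1.0, 0.5, 0.0]
print(normalize_tmm([3.0, 2.0, 1.0], 0.0))  # roughly [1.0, 0.667, 0.333]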
class HybridCC(HybridRetrieval):
    def _pure(
        self,
        ids: Tuple,
        scores: Tuple,
        top_k: int,
        weight: float,
        normalize_method: str = "mm",
        semantic_theoretical_min_value: float = -1.0,
        lexical_theoretical_min_value: float = 0.0,
    ):
        return hybrid_cc(
            ids,
            scores,
            top_k,
            weight,
            normalize_method,
            semantic_theoretical_min_value,
            lexical_theoretical_min_value,
        )

    @classmethod
    def run_evaluator(
        cls,
        project_dir: Union[str, Path],
        previous_result: pd.DataFrame,
        *args,
        **kwargs,
    ):
        if "ids" in kwargs and "scores" in kwargs:
            data_dir = os.path.join(project_dir, "data")
            corpus_df = pd.read_parquet(
                os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
            )

            params = pop_params(hybrid_cc, kwargs)
            assert (
                "ids" in params and "scores" in params and "top_k" in params
            ), "ids, scores, and top_k must be specified."

            @result_to_dataframe(
                ["retrieved_contents", "retrieved_ids", "retrieve_scores"]
            )
            def __cc(**cc_params):
                ids, scores = hybrid_cc(**cc_params)
                contents = fetch_contents(corpus_df, ids)
                return contents, ids, scores

            return __cc(**params)
        else:
            assert (
                "target_modules" in kwargs and "target_module_params" in kwargs
            ), "target_modules and target_module_params must be specified if there is not ids and scores."
            instance = cls(project_dir, *args, **kwargs)
            result = instance.pure(previous_result, *args, **kwargs)
            del instance
            return result


def hybrid_cc(
    ids: Tuple,
    scores: Tuple,
    top_k: int,
    weight: float,
    normalize_method: str = "mm",
    semantic_theoretical_min_value: float = -1.0,
    lexical_theoretical_min_value: float = 0.0,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
|
||||
Hybrid CC function.
|
||||
CC (convex combination) is a method to fuse lexical and semantic retrieval results.
|
||||
It is a method that first normalizes the scores of each retrieval result,
|
||||
and then combines them with the given weights.
|
||||
It is uniquer than other retrieval modules, because it does not really execute retrieval,
|
||||
but just fuse the results of other retrieval functions.
|
||||
So you have to run more than two retrieval modules before running this function.
|
||||
And collect ids and scores result from each retrieval module.
|
||||
Make it as tuple and input it to this function.
|
||||
|
||||
:param ids: The tuple of ids that you want to fuse.
|
||||
The length of this must be the same as the length of scores.
|
||||
The semantic retrieval ids must be the first index.
|
||||
:param scores: The retrieve scores that you want to fuse.
|
||||
The length of this must be the same as the length of ids.
|
||||
The semantic retrieval scores must be the first index.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param normalize_method: The normalization method to use.
|
||||
There are some normalization method that you can use at the hybrid cc method.
|
||||
AutoRAG support following.
|
||||
- `mm`: Min-max scaling
|
||||
- `tmm`: Theoretical min-max scaling
|
||||
- `z`: z-score normalization
|
||||
- `dbsf`: 3-sigma normalization
|
||||
:param weight: The weight value. If the weight is 1.0, it means the
|
||||
weight to the semantic module will be 1.0 and weight to the lexical module will be 0.0.
|
||||
:param semantic_theoretical_min_value: This value used by `tmm` normalization method. You can set the
|
||||
theoretical minimum value by yourself. Default is -1.
|
||||
:param lexical_theoretical_min_value: This value used by `tmm` normalization method. You can set the
|
||||
theoretical minimum value by yourself. Default is 0.
|
||||
:return: The tuple of ids and fused scores that fused by CC. Plus, the third element is selected weight value.
|
||||
"""
|
||||
assert len(ids) == len(scores), "The length of ids and scores must be the same."
|
||||
assert len(ids) > 1, "You must input more than one retrieval results."
|
||||
assert top_k > 0, "top_k must be greater than 0."
|
||||
assert weight >= 0, "The weight must be greater than 0."
|
||||
assert weight <= 1, "The weight must be less than 1."
    df = pd.DataFrame(
        {
            "semantic_ids": ids[0],
            "lexical_ids": ids[1],
            "semantic_score": scores[0],
            "lexical_score": scores[1],
        }
    )

    def cc_pure_apply(row):
        return fuse_per_query(
            row["semantic_ids"],
            row["lexical_ids"],
            row["semantic_score"],
            row["lexical_score"],
            normalize_method=normalize_method,
            weight=weight,
            top_k=top_k,
            semantic_theoretical_min_value=semantic_theoretical_min_value,
            lexical_theoretical_min_value=lexical_theoretical_min_value,
        )

    # fixed weight
    df[["cc_id", "cc_score"]] = df.apply(
        lambda row: cc_pure_apply(row), axis=1, result_type="expand"
    )
    return df["cc_id"].tolist(), df["cc_score"].tolist()


def fuse_per_query(
    semantic_ids: List[str],
    lexical_ids: List[str],
    semantic_scores: List[float],
    lexical_scores: List[float],
    normalize_method: str,
    weight: float,
    top_k: int,
    semantic_theoretical_min_value: float,
    lexical_theoretical_min_value: float,
):
    normalize_func = normalize_method_dict[normalize_method]
    norm_semantic_scores = normalize_func(
        semantic_scores, semantic_theoretical_min_value
    )
    norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value)
    ids = [semantic_ids, lexical_ids]
    scores = [norm_semantic_scores, norm_lexical_scores]
    df = pd.concat(
        [pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
    )
    df.columns = ["semantic", "lexical"]
    df = df.fillna(0)
    df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1)
    df = df.sort_values(by="weighted_sum", ascending=False)
    return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist()
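To make the fusion concrete, here is a minimal sketch (not part of the commit) of calling hybrid_cc on toy results for a single query; the ids and scores are made up, with the semantic result first:

# Illustrative only: one query, semantic result first, lexical result second.
ids = ([["doc1", "doc2", "doc3"]], [["doc2", "doc4", "doc1"]])
scores = ([[0.9, 0.7, 0.3]], [[12.0, 8.5, 2.0]])
fused_ids, fused_scores = hybrid_cc(ids, scores, top_k=3, weight=0.6)
# Per query, each module's scores are normalized (min-max by default) and combined as
# 0.6 * semantic + 0.4 * lexical; the ids with the highest fused scores are returned.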
128
autorag/nodes/retrieval/hybrid_rrf.py
Normal file
@@ -0,0 +1,128 @@
import os
from pathlib import Path
from typing import List, Tuple, Union

import pandas as pd

from autorag.nodes.retrieval.base import HybridRetrieval
from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe


class HybridRRF(HybridRetrieval):
    def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1):
        return hybrid_rrf(ids, scores, top_k, weight, rrf_k)

    @classmethod
    def run_evaluator(
        cls,
        project_dir: Union[str, Path],
        previous_result: pd.DataFrame,
        *args,
        **kwargs,
    ):
        if "ids" in kwargs and "scores" in kwargs:
            data_dir = os.path.join(project_dir, "data")
            corpus_df = pd.read_parquet(
                os.path.join(data_dir, "corpus.parquet"), engine="pyarrow"
            )

            params = pop_params(hybrid_rrf, kwargs)
            assert (
                "ids" in params and "scores" in params and "top_k" in params
            ), "ids, scores, and top_k must be specified."

            @result_to_dataframe(
                ["retrieved_contents", "retrieved_ids", "retrieve_scores"]
            )
            def __rrf(**rrf_params):
                ids, scores = hybrid_rrf(**rrf_params)
                contents = fetch_contents(corpus_df, ids)
                return contents, ids, scores

            return __rrf(**params)
        else:
            assert (
                "target_modules" in kwargs and "target_module_params" in kwargs
            ), "target_modules and target_module_params must be specified if there is not ids and scores."
            instance = cls(project_dir, *args, **kwargs)
            result = instance.pure(previous_result, *args, **kwargs)
            del instance
            return result


def hybrid_rrf(
    ids: Tuple,
    scores: Tuple,
    top_k: int,
    weight: int = 60,
    rrf_k: int = -1,
) -> Tuple[List[List[str]], List[List[float]]]:
"""
|
||||
Hybrid RRF function.
|
||||
RRF (Rank Reciprocal Fusion) is a method to fuse multiple retrieval results.
|
||||
It is common to fuse dense retrieval and sparse retrieval results using RRF.
|
||||
To use this function, you must input ids and scores as tuple.
|
||||
It is more unique than other retrieval modules because it does not really execute retrieval but just fuses
|
||||
the results of other retrieval functions.
|
||||
So you have to run more than two retrieval modules before running this function.
|
||||
And collect ids and scores result from each retrieval module.
|
||||
Make it as a tuple and input it to this function.
|
||||
|
||||
:param ids: The tuple of ids that you want to fuse.
|
||||
The length of this must be the same as the length of scores.
|
||||
:param scores: The retrieve scores that you want to fuse.
|
||||
The length of this must be the same as the length of ids.
|
||||
:param top_k: The number of passages to be retrieved.
|
||||
:param weight: Hyperparameter for RRF.
|
||||
It was originally rrf_k value.
|
||||
Default is 60.
|
||||
For more information, please visit our documentation.
|
||||
:param rrf_k: (Deprecated) Hyperparameter for RRF.
|
||||
It was originally rrf_k value. Will remove at a further version.
|
||||
:return: The tuple of ids and fused scores that are fused by RRF.
|
||||
"""
    assert len(ids) == len(scores), "The length of ids and scores must be the same."
    assert len(ids) > 1, "You must input more than one retrieval results."
    assert top_k > 0, "top_k must be greater than 0."
    assert weight > 0, "rrf_k must be greater than 0."

    if rrf_k != -1:
        weight = int(rrf_k)
    else:
        weight = int(weight)

    id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)})
    score_df = pd.DataFrame(
        {f"score_{i}": score_list for i, score_list in enumerate(scores)}
    )
    df = pd.concat([id_df, score_df], axis=1)

    def rrf_pure_apply(row):
        ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values)
        scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values)
        return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k))

    df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1)
    return df["rrf_id"].tolist(), df["rrf_score"].tolist()


def rrf_pure(
    ids: Tuple, scores: Tuple, rrf_k: int, top_k: int
) -> Tuple[List[str], List[float]]:
    df = pd.concat(
        [pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1
    )
    rank_df = df.rank(ascending=False, method="min")
    rank_df = rank_df.fillna(0)
    rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1)
    rank_df = rank_df.sort_values(by="rrf", ascending=False)
    return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k]


def rrf_calculate(row, rrf_k):
    result = 0
    for r in row:
        if r == 0:
            continue
        result += 1 / (r + rrf_k)
    return result
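For intuition, a minimal sketch (not part of the commit) of RRF fusion on toy results for one query; the ids and scores are made up:

# Illustrative only: one query, two retrieval results (e.g. semantic then lexical).
ids = ([["doc1", "doc2", "doc3"]], [["doc3", "doc1", "doc4"]])
scores = ([[0.9, 0.8, 0.7]], [[11.0, 9.0, 5.0]])
fused_ids, fused_scores = hybrid_rrf(ids, scores, top_k=3, weight=60)
# doc1 is ranked 1st by one result and 2nd by the other, so its fused score is
# 1/(1+60) + 1/(2+60); ids missing from a result contribute nothing for that ranker.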
544
autorag/nodes/retrieval/run.py
Normal file
@@ -0,0 +1,544 @@
import logging
import os
import pathlib
from copy import deepcopy
from typing import List, Callable, Dict, Tuple, Union

import numpy as np
import pandas as pd

from autorag.evaluation import evaluate_retrieval
from autorag.schema.metricinput import MetricInput
from autorag.strategy import measure_speed, filter_by_threshold, select_best
from autorag.support import get_support_modules
from autorag.utils.util import get_best_row, to_list, apply_recursive

logger = logging.getLogger("AutoRAG")

semantic_module_names = ["vectordb", "VectorDB"]
lexical_module_names = ["bm25", "BM25"]
hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"]


def run_retrieval_node(
    modules: List,
    module_params: List[Dict],
    previous_result: pd.DataFrame,
    node_line_dir: str,
    strategies: Dict,
) -> pd.DataFrame:
    """
    Run evaluation and select the best module among retrieval node results.

    :param modules: Retrieval modules to run.
    :param module_params: Retrieval module parameters.
    :param previous_result: Previous result dataframe.
        Could be query expansion's best result or qa data.
    :param node_line_dir: This node line's directory.
    :param strategies: Strategies for retrieval node.
    :return: The best result dataframe.
        It contains previous result columns and retrieval node's result columns.
    """
    if not os.path.exists(node_line_dir):
        os.makedirs(node_line_dir)
    project_dir = pathlib.PurePath(node_line_dir).parent.parent
    qa_df = pd.read_parquet(
        os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow"
    )
    retrieval_gt = qa_df["retrieval_gt"].tolist()
    retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt))
    # make rows to metric_inputs
    metric_inputs = [
        MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt)
        for ret_gt, query, gen_gt in zip(
            retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist()
        )
    ]

    save_dir = os.path.join(node_line_dir, "retrieval")  # node name
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]:
        """
        Run input modules and parameters.

        :param input_modules: Input modules
        :param input_module_params: Input module parameters
        :return: First, it returns list of result dataframe.
            Second, it returns list of execution times.
        """
        result, execution_times = zip(
            *map(
                lambda task: measure_speed(
                    task[0].run_evaluator,
                    project_dir=project_dir,
                    previous_result=previous_result,
                    **task[1],
                ),
                zip(input_modules, input_module_params),
            )
        )
        average_times = list(map(lambda x: x / len(result[0]), execution_times))

        # run metrics before filtering
        if strategies.get("metrics") is None:
            raise ValueError("You must specify at least one metric for retrieval evaluation.")
        result = list(
            map(
                lambda x: evaluate_retrieval_node(
                    x,
                    metric_inputs,
                    strategies.get("metrics"),
                ),
                result,
            )
        )

        return result, average_times

    def save_and_summary(
        input_modules,
        input_module_params,
        result_list,
        execution_time_list,
        filename_start: int,
    ):
        """
        Save the result and make summary file

        :param input_modules: Input modules
        :param input_module_params: Input module parameters
        :param result_list: Result list
        :param execution_time_list: Execution times
        :param filename_start: The first filename to use
        :return: First, it returns list of result dataframe.
            Second, it returns list of execution times.
        """

        # save results to folder
        filepaths = list(
            map(
                lambda x: os.path.join(save_dir, f"{x}.parquet"),
                range(filename_start, filename_start + len(input_modules)),
            )
        )
        list(
            map(
                lambda x: x[0].to_parquet(x[1], index=False),
                zip(result_list, filepaths),
            )
        )  # execute save to parquet
        filename_list = list(map(lambda x: os.path.basename(x), filepaths))

        summary_df = pd.DataFrame(
            {
                "filename": filename_list,
                "module_name": list(map(lambda module: module.__name__, input_modules)),
                "module_params": input_module_params,
                "execution_time": execution_time_list,
                **{
                    metric: list(map(lambda result: result[metric].mean(), result_list))
                    for metric in strategies.get("metrics")
                },
            }
        )
        summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
        return summary_df

    def find_best(results, average_times, filenames):
        # filter by strategies
        if strategies.get("speed_threshold") is not None:
            results, filenames = filter_by_threshold(
                results, average_times, strategies["speed_threshold"], filenames
            )
        selected_result, selected_filename = select_best(
            results,
            strategies.get("metrics"),
            filenames,
            strategies.get("strategy", "mean"),
        )
        return selected_result, selected_filename

    filename_first = 0
    # run semantic modules
    logger.info("Running retrieval node - semantic retrieval module...")
    if any([module.__name__ in semantic_module_names for module in modules]):
        semantic_modules, semantic_module_params = zip(
            *filter(
                lambda x: x[0].__name__ in semantic_module_names,
                zip(modules, module_params),
            )
        )
        semantic_results, semantic_times = run(semantic_modules, semantic_module_params)
        semantic_summary_df = save_and_summary(
            semantic_modules,
            semantic_module_params,
            semantic_results,
            semantic_times,
            filename_first,
        )
        semantic_selected_result, semantic_selected_filename = find_best(
            semantic_results, semantic_times, semantic_summary_df["filename"].tolist()
        )
        semantic_summary_df["is_best"] = (
            semantic_summary_df["filename"] == semantic_selected_filename
        )
        filename_first += len(semantic_modules)
    else:
        (
            semantic_selected_filename,
            semantic_summary_df,
            semantic_results,
            semantic_times,
        ) = None, pd.DataFrame(), [], []
    # run lexical modules
    logger.info("Running retrieval node - lexical retrieval module...")
    if any([module.__name__ in lexical_module_names for module in modules]):
        lexical_modules, lexical_module_params = zip(
            *filter(
                lambda x: x[0].__name__ in lexical_module_names,
                zip(modules, module_params),
            )
        )
        lexical_results, lexical_times = run(lexical_modules, lexical_module_params)
        lexical_summary_df = save_and_summary(
            lexical_modules,
            lexical_module_params,
            lexical_results,
            lexical_times,
            filename_first,
        )
        lexical_selected_result, lexical_selected_filename = find_best(
            lexical_results, lexical_times, lexical_summary_df["filename"].tolist()
        )
        lexical_summary_df["is_best"] = (
            lexical_summary_df["filename"] == lexical_selected_filename
        )
        filename_first += len(lexical_modules)
    else:
        (
            lexical_selected_filename,
            lexical_summary_df,
            lexical_results,
            lexical_times,
        ) = None, pd.DataFrame(), [], []

    logger.info("Running retrieval node - hybrid retrieval module...")
    # Next, run hybrid retrieval
    if any([module.__name__ in hybrid_module_names for module in modules]):
        hybrid_modules, hybrid_module_params = zip(
            *filter(
                lambda x: x[0].__name__ in hybrid_module_names,
                zip(modules, module_params),
            )
        )
        if all(
            ["target_module_params" in x for x in hybrid_module_params]
        ):  # for Runner.run
            # If target_module_params are already given, run hybrid retrieval directly
            hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params)
            hybrid_summary_df = save_and_summary(
                hybrid_modules,
                hybrid_module_params,
                hybrid_results,
                hybrid_times,
                filename_first,
            )
            filename_first += len(hybrid_modules)
        else:  # for Evaluator
            # get id and score
            ids_scores = get_ids_and_scores(
                save_dir,
                [semantic_selected_filename, lexical_selected_filename],
                semantic_summary_df,
                lexical_summary_df,
                previous_result,
            )
            hybrid_module_params = list(
                map(lambda x: {**x, **ids_scores}, hybrid_module_params)
            )

            # optimize each modules
            real_hybrid_times = [
                get_hybrid_execution_times(semantic_summary_df, lexical_summary_df)
            ] * len(hybrid_module_params)
            hybrid_times = real_hybrid_times.copy()
            hybrid_results = []
            for module, module_param in zip(hybrid_modules, hybrid_module_params):
                module_result_df, module_best_weight = optimize_hybrid(
                    module,
                    module_param,
                    strategies,
                    metric_inputs,
                    project_dir,
                    previous_result,
                )
                module_param["weight"] = module_best_weight
                hybrid_results.append(module_result_df)

            hybrid_summary_df = save_and_summary(
                hybrid_modules,
                hybrid_module_params,
                hybrid_results,
                hybrid_times,
                filename_first,
            )
            filename_first += len(hybrid_modules)
            hybrid_summary_df["execution_time"] = hybrid_times
            best_semantic_summary_row = semantic_summary_df.loc[
                semantic_summary_df["is_best"]
            ].iloc[0]
            best_lexical_summary_row = lexical_summary_df.loc[
                lexical_summary_df["is_best"]
            ].iloc[0]
            target_modules = (
                best_semantic_summary_row["module_name"],
                best_lexical_summary_row["module_name"],
            )
            target_module_params = (
                best_semantic_summary_row["module_params"],
                best_lexical_summary_row["module_params"],
            )
            hybrid_summary_df = edit_summary_df_params(
                hybrid_summary_df, target_modules, target_module_params
            )
    else:
        if any([module.__name__ in hybrid_module_names for module in modules]):
            logger.warning(
                "You need at least one semantic module and one lexical module to evaluate hybrid modules. "
                "Skipping the hybrid modules."
            )
        _, hybrid_summary_df, hybrid_results, hybrid_times = (
            None,
            pd.DataFrame(),
            [],
            [],
        )

    summary = pd.concat(
        [semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True
    )
    results = semantic_results + lexical_results + hybrid_results
    average_times = semantic_times + lexical_times + hybrid_times
    filenames = summary["filename"].tolist()

    # filter by strategies
    selected_result, selected_filename = find_best(results, average_times, filenames)
    best_result = pd.concat([previous_result, selected_result], axis=1)

    # add summary.csv 'is_best' column
    summary["is_best"] = summary["filename"] == selected_filename

    # save the result files
    best_result.to_parquet(
        os.path.join(
            save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet"
        ),
        index=False,
    )
    summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
    return best_result


def evaluate_retrieval_node(
    result_df: pd.DataFrame,
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
) -> pd.DataFrame:
    """
    Evaluate retrieval node from retrieval node result dataframe.

    :param result_df: The result dataframe from a retrieval node.
    :param metric_inputs: List of metric input schema for AutoRAG.
    :param metrics: Metric list from input strategies.
    :return: Return result_df with metrics columns.
        The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and metric names.
    """

    @evaluate_retrieval(
        metric_inputs=metric_inputs,
        metrics=metrics,
    )
    def evaluate_this_module(df: pd.DataFrame):
        return (
            df["retrieved_contents"].tolist(),
            df["retrieved_ids"].tolist(),
            df["retrieve_scores"].tolist(),
        )

    return evaluate_this_module(result_df)


def edit_summary_df_params(
    summary_df: pd.DataFrame, target_modules, target_module_params
) -> pd.DataFrame:
    def delete_ids_scores(x):
        del x["ids"]
        del x["scores"]
        return x

    summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
    summary_df["new_params"] = [
        {"target_modules": target_modules, "target_module_params": target_module_params}
    ] * len(summary_df)
    summary_df["module_params"] = summary_df.apply(
        lambda row: {**row["module_params"], **row["new_params"]}, axis=1
    )
    summary_df = summary_df.drop(columns=["new_params"])
    return summary_df


def get_ids_and_scores(
    node_dir: str,
    filenames: List[str],
    semantic_summary_df: pd.DataFrame,
    lexical_summary_df: pd.DataFrame,
    previous_result,
) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
    project_dir = pathlib.PurePath(node_dir).parent.parent.parent
    best_results_df = list(
        map(
            lambda filename: pd.read_parquet(
                os.path.join(node_dir, filename), engine="pyarrow"
            ),
            filenames,
        )
    )
    ids = tuple(
        map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
    )
    scores = tuple(
        map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
    )
    # search non-duplicate ids
    semantic_ids = deepcopy(ids[0])
    lexical_ids = deepcopy(ids[1])

    def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
        """
        Get non-duplicate ids from target_ids and compare_ids.
        If you want to non-duplicate ids of semantic_ids, you have to put it at target_ids.
        """
        result_ids = []
        assert len(target_ids) == len(compare_ids)
        for target_id_list, compare_id_list in zip(target_ids, compare_ids):
            query_duplicated = list(set(compare_id_list) - set(target_id_list))
            duplicate_list = query_duplicated if len(query_duplicated) != 0 else []
            result_ids.append(duplicate_list)
        return result_ids

    lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids)
    semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids)

    new_id_tuple = (
        [a + b for a, b in zip(semantic_ids, semantic_target_ids)],
        [a + b for a, b in zip(lexical_ids, lexical_target_ids)],
    )

    # search non-duplicate ids' scores
    new_semantic_scores = get_scores_by_ids(
        semantic_target_ids, semantic_summary_df, project_dir, previous_result
    )
    new_lexical_scores = get_scores_by_ids(
        lexical_target_ids, lexical_summary_df, project_dir, previous_result
    )

    new_score_tuple = (
        [a + b for a, b in zip(scores[0], new_semantic_scores)],
        [a + b for a, b in zip(scores[1], new_lexical_scores)],
    )
    return {
        "ids": new_id_tuple,
        "scores": new_score_tuple,
    }


def get_scores_by_ids(
    ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result
) -> List[List[float]]:
    module_name = get_best_row(module_summary_df)["module_name"]
    module_params = get_best_row(module_summary_df)["module_params"]
    module = get_support_modules(module_name)
    result_df = module.run_evaluator(
        project_dir=project_dir,
        previous_result=previous_result,
        ids=ids,
        **module_params,
    )
    return to_list(result_df["retrieve_scores"].tolist())


def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]:
    return list(set(list1).symmetric_difference(set(list2)))


def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float:
    lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][
        "execution_time"
    ]
    semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][
        "execution_time"
    ]
    return lexical_execution_time + semantic_execution_time


def optimize_hybrid(
    hybrid_module_func: Callable,
    hybrid_module_param: Dict,
    strategy: Dict,
    input_metrics: List[MetricInput],
    project_dir,
    previous_result,
):
    if (
        hybrid_module_func.__name__ == "HybridRRF"
        or hybrid_module_func.__name__ == "hybrid_rrf"
    ):
        weight_range = hybrid_module_param.pop("weight_range", (4, 80))
        test_weight_size = weight_range[1] - weight_range[0] + 1
    elif (
        hybrid_module_func.__name__ == "HybridCC"
        or hybrid_module_func.__name__ == "hybrid_cc"
    ):
        weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0))
        test_weight_size = hybrid_module_param.pop("test_weight_size", 101)
    else:
        raise ValueError("You must input hybrid module function at hybrid_module_func.")

    weight_candidates = np.linspace(
        weight_range[0], weight_range[1], test_weight_size
    ).tolist()

    result_list = []
    for weight_value in weight_candidates:
        result_df = hybrid_module_func.run_evaluator(
            project_dir=project_dir,
            previous_result=previous_result,
            weight=weight_value,
            **hybrid_module_param,
        )
        result_list.append(result_df)

    # evaluate here
    if strategy.get("metrics") is None:
        raise ValueError("You must specify at least one metric for retrieval evaluation.")
    result_list = list(
        map(
            lambda x: evaluate_retrieval_node(
                x,
                input_metrics,
                strategy.get("metrics"),
            ),
            result_list,
        )
    )

    # select best result
    best_result_df, best_weight = select_best(
        result_list,
        strategy.get("metrics"),
        metadatas=weight_candidates,
        strategy_name=strategy.get("strategy", "normalize_mean"),
    )
    return best_result_df, best_weight
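For orientation (not part of the commit), the kind of strategies dict that run_retrieval_node reads via strategies.get(...) looks roughly like the sketch below; the exact metric names are an assumption based on AutoRAG's retrieval metrics:

# Illustrative only.
strategies = {
    "metrics": ["retrieval_f1", "retrieval_recall", "retrieval_precision"],
    "speed_threshold": 10,  # optional; filters out modules slower than this
    "strategy": "mean",     # how select_best aggregates the metric columns
}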
303
autorag/nodes/retrieval/vectordb.py
Normal file
@@ -0,0 +1,303 @@
import itertools
import logging
import os
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
from llama_index.core.embeddings import BaseEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from autorag.evaluation.metric.util import (
    calculate_l2_distance,
    calculate_inner_product,
    calculate_cosine_similarity,
)
from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval
from autorag.utils import (
    validate_corpus_dataset,
    cast_corpus_dataset,
    cast_qa_dataset,
    validate_qa_dataset,
)
from autorag.utils.util import (
    get_event_loop,
    process_batch,
    openai_truncate_by_token,
    flatten_apply,
    result_to_dataframe,
    pop_params,
    fetch_contents,
    empty_cuda_cache,
    convert_inputs_to_list,
    make_batch,
)
from autorag.vectordb import load_vectordb_from_yaml
from autorag.vectordb.base import BaseVectorStore

logger = logging.getLogger("AutoRAG")


class VectorDB(BaseRetrieval):
    def __init__(self, project_dir: str, vectordb: str = "default", **kwargs):
        """
        Initialize VectorDB retrieval node.

        :param project_dir: The project directory path.
        :param vectordb: The vectordb name.
            You must configure the vectordb name in the config.yaml file.
            If you don't configure, it uses the default vectordb.
        :param kwargs: The optional arguments.
            Not affected in the init method.
        """
        super().__init__(project_dir)

        vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml")
        self.vector_store = load_vectordb_from_yaml(
            vectordb_config_path, vectordb, project_dir
        )

        self.embedding_model = self.vector_store.embedding

    def __del__(self):
        del self.vector_store
        del self.embedding_model
        empty_cuda_cache()
        super().__del__()

    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
        queries = self.cast_to_run(previous_result)
        pure_params = pop_params(self._pure, kwargs)
        ids, scores = self._pure(queries, **pure_params)
        contents = fetch_contents(self.corpus_df, ids)
        return contents, ids, scores

    def _pure(
        self,
        queries: List[List[str]],
        top_k: int,
        embedding_batch: int = 128,
        ids: Optional[List[List[str]]] = None,
    ) -> Tuple[List[List[str]], List[List[float]]]:
        """
        VectorDB retrieval function.
        You have to get a chroma collection that is already ingested.
        You have to get an embedding model that is already used in ingesting.

        :param queries: 2-d list of query strings.
            Each element of the list is a query strings of each row.
        :param top_k: The number of passages to be retrieved.
        :param embedding_batch: The number of queries to be processed in parallel.
            This is used to prevent API error at the query embedding.
            Default is 128.
        :param ids: The optional list of ids that you want to retrieve.
            You don't need to specify this in the general use cases.
            Default is None.

        :return: The 2-d list contains a list of passage ids that retrieved from vectordb and 2-d list of its scores.
            It will be a length of queries. And each element has a length of top_k.
        """
        # if ids are specified, fetch the ids score from Chroma
        if ids is not None:
            return self.__get_ids_scores(queries, ids, embedding_batch)

        # run async vector_db_pure function
        tasks = [
            vectordb_pure(query_list, top_k, self.vector_store)
            for query_list in queries
        ]
        loop = get_event_loop()
        results = loop.run_until_complete(
            process_batch(tasks, batch_size=embedding_batch)
        )
        id_result = list(map(lambda x: x[0], results))
        score_result = list(map(lambda x: x[1], results))
        return id_result, score_result

    def __get_ids_scores(self, queries, ids, embedding_batch: int):
        # truncate queries and embedding execution here.
        openai_embedding_limit = 8000
        if isinstance(self.embedding_model, OpenAIEmbedding):
            queries = list(
                map(
                    lambda query_list: openai_truncate_by_token(
                        query_list,
                        openai_embedding_limit,
                        self.embedding_model.model_name,
                    ),
                    queries,
                )
            )

        query_embeddings = flatten_apply(
            run_query_embedding_batch,
            queries,
            embedding_model=self.embedding_model,
            batch_size=embedding_batch,
        )

        loop = get_event_loop()

        async def run_fetch(ids):
            final_result = []
            for id_list in ids:
                if len(id_list) == 0:
                    final_result.append([])
                else:
                    result = await self.vector_store.fetch(id_list)
                    final_result.append(result)
            return final_result

        content_embeddings = loop.run_until_complete(run_fetch(ids))

        score_result = list(
            map(
                lambda query_embedding_list, content_embedding_list: get_id_scores(
                    query_embedding_list,
                    content_embedding_list,
                    similarity_metric=self.vector_store.similarity_metric,
                ),
                query_embeddings,
                content_embeddings,
            )
        )
        return ids, score_result


async def vectordb_pure(
    queries: List[str], top_k: int, vectordb: BaseVectorStore
) -> Tuple[List[str], List[float]]:
    """
    Async VectorDB retrieval function.
    Its usage is for async retrieval of vector_db row by row.

    :param queries: A list of query strings.
    :param top_k: The number of passages to be retrieved.
    :param vectordb: The vector store instance.
    :return: The tuple contains a list of passage ids that are retrieved from vectordb and a list of its scores.
    """
    id_result, score_result = await vectordb.query(queries=queries, top_k=top_k)

    # Distribute passages evenly
    id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k)
    # sort id_result and score_result by score
    result = [
        (_id, score)
        for score, _id in sorted(
            zip(score_result, id_result), key=lambda pair: pair[0], reverse=True
        )
    ]
    id_result, score_result = zip(*result)
    return list(id_result), list(score_result)


async def filter_exist_ids(
    vectordb: BaseVectorStore,
    corpus_data: pd.DataFrame,
) -> pd.DataFrame:
    corpus_data = cast_corpus_dataset(corpus_data)
    validate_corpus_dataset(corpus_data)
    ids = corpus_data["doc_id"].tolist()

    # Query the collection to check if IDs already exist
    existed_bool_list = await vectordb.is_exist(ids=ids)
    # Assuming 'ids' is the key in the response
    new_passage = corpus_data[~pd.Series(existed_bool_list)]
    return new_passage


async def filter_exist_ids_from_retrieval_gt(
    vectordb: BaseVectorStore,
    qa_data: pd.DataFrame,
    corpus_data: pd.DataFrame,
) -> pd.DataFrame:
    qa_data = cast_qa_dataset(qa_data)
    validate_qa_dataset(qa_data)
    corpus_data = cast_corpus_dataset(corpus_data)
    validate_corpus_dataset(corpus_data)
    retrieval_gt = (
        qa_data["retrieval_gt"]
        .apply(lambda x: list(itertools.chain.from_iterable(x)))
        .tolist()
    )
    retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt))
    retrieval_gt = list(set(retrieval_gt))

    existed_bool_list = await vectordb.is_exist(ids=retrieval_gt)
    add_ids = []
    for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list):
        if not is_exist:
            add_ids.append(ret_gt)
    new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)]
    return new_passage


async def vectordb_ingest(
    vectordb: BaseVectorStore,
    corpus_data: pd.DataFrame,
):
    """
    Ingest given corpus data to the vectordb.
    It truncates corpus content when the embedding model is OpenAIEmbedding to the 8000 tokens.
    Plus, when the corpus content is empty (whitespace), it will be ignored.
    And if there is a document id that already exists in the collection, it will be ignored.

    :param vectordb: A vector stores instance that you want to ingest.
    :param corpus_data: The corpus data that contains doc_id and contents columns.
    """
    embedding_batch = vectordb.embedding_batch
    if not corpus_data.empty:
        new_contents = corpus_data["contents"].tolist()
        new_ids = corpus_data["doc_id"].tolist()
        content_batches = make_batch(new_contents, embedding_batch)
        id_batches = make_batch(new_ids, embedding_batch)
        for content_batch, id_batch in zip(content_batches, id_batches):
            await vectordb.add(ids=id_batch, texts=content_batch)


def run_query_embedding_batch(
    queries: List[str], embedding_model: BaseEmbedding, batch_size: int
) -> List[List[float]]:
    result = []
    for i in range(0, len(queries), batch_size):
        batch = queries[i : i + batch_size]
        embeddings = embedding_model.get_text_embedding_batch(batch)
        result.extend(embeddings)
    return result


@convert_inputs_to_list
def get_id_scores(  # To find the uncalculated score when fuse the scores for the hybrid retrieval
    query_embeddings: List[
        List[float]
    ],  # `queries` is input. This is one user input query.
    content_embeddings: List[List[float]],
    similarity_metric: str,
) -> List[
    float
]:  # The most high scores among each query. The length of a result is the same as the contents length.
    """
    Calculate the highest similarity scores between query embeddings and content embeddings.

    :param query_embeddings: A list of lists containing query embeddings.
    :param content_embeddings: A list of lists containing content embeddings.
    :param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine').
    :return: A list of the highest similarity scores for each content embedding.
    """
    metric_func_dict = {
        "l2": lambda x, y: 1 - calculate_l2_distance(x, y),
        "ip": calculate_inner_product,
        "cosine": calculate_cosine_similarity,
    }
    metric_func = metric_func_dict[similarity_metric]

    result = []
    for content_embedding in content_embeddings:
        scores = []
        for query_embedding in query_embeddings:
            scores.append(
                metric_func(np.array(query_embedding), np.array(content_embedding))
            )
        result.append(max(scores))
    return result
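As a small illustration (not part of the commit), get_id_scores keeps, for every content embedding, the best similarity over all query embeddings; the vectors below are made up:

# Illustrative only: two query embeddings against two content embeddings.
query_embs = [[1.0, 0.0], [0.0, 1.0]]
content_embs = [[1.0, 0.0], [0.5, 0.5]]
print(get_id_scores(query_embs, content_embs, similarity_metric="cosine"))
# roughly [1.0, 0.707]; each content keeps its maximum score over the queries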