commit ba9c1a4a5f7d0e3d2cbb27a7106cb96b2bf3325d Author: kyy Date: Fri Mar 14 17:28:01 2025 +0900 Initial commit diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d7ae301 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,165 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +venv*/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+.idea/ +pytest.ini +.DS_Store +projects/tutorial_1 +!projects/tutorial_1/config.yaml diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..a76c5e9 --- /dev/null +++ b/.env.sample @@ -0,0 +1,2 @@ +OPENAI_API_KEY=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 +LLAMA_CLOUD_API_KEY=llx-MkHkuDxnSxXEHvsIPAtjEZl4iSB8pHS1mgYDVZQlA690LUub \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d60f3ce --- /dev/null +++ b/.gitignore @@ -0,0 +1,169 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+.DS_Store
+pytest.ini
+projects
+test_projects
+
+# Visual Studio Code
+.vscode/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..8c62292
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "autorag-frontend"]
+	path = autorag-frontend
+	url = https://github.com/Auto-RAG/autorag-frontend.git
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..4e61a58
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,28 @@
+# Base stage: Install common dependencies
+FROM python:3.10-slim AS base
+
+# Set working directory and environment variables
+WORKDIR /usr/src/app
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1
+
+# Copy only requirements files first to leverage Docker cache
+COPY pyproject.toml ./
+
+# Install system and Python dependencies in a single layer
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    libssl-dev && \
+    pip install --no-cache-dir --upgrade pip setuptools setuptools-scm && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy project files
+COPY . .
+
+# Install base project
+RUN pip install --no-cache-dir -e .
+
+CMD ["bash"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ca3068a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,154 @@
+# AutoRAG Evaluation
+
+This document explains how to set up and evaluate a RAG pipeline with AutoRAG.
+
+---
+
+## 📌 Environment Setup
+
+1. Configure the `.env` file (see `.env.sample`).
+2. Build the Docker image.
+
+```bash
+  docker build -t autorag-base .
+```
+
+3. Start the services with Docker Compose.
+
+```bash
+  docker compose up -d
+```
+
+4. Install the extra modules required for Hugging Face embeddings and Ollama LLMs.
+
+```bash
+  pip install -r requirements_custom.txt
+```
+
+---
+
+## 📂 Data Generation
+
+RAG evaluation requires a **QA dataset** and a **corpus dataset**.
+
+### 1️⃣ Create a project folder
+
+```bash
+cd projects
+mkdir -p "project_name"
+cd "project_name"
+mkdir -p raw_data config
+```
+
+- **`raw_data/`**: stores the raw documents to analyze (`.pdf`, etc.)
+- **`config/`**: stores the parsing (`parse.yaml`) and chunking (`chunk.yaml`) configuration files
+
+### 2️⃣ Parsing configuration (`parse.yaml`)
+
+Configure the parsing module.
+
+```yaml
+modules:
+  - module_type: langchain_parse
+    parse_method: pdfminer
+```
+
+Several parsing modules can be used at the same time.
+
+### 3️⃣ Chunking configuration (`chunk.yaml`)
+
+Configure the chunking module.
+
+```yaml
+modules:
+  - module_type: llama_index_chunk
+    chunk_method: Token
+    chunk_size: 1024
+    chunk_overlap: 24
+    add_file_name: en
+```
+
+If several chunking modules are used, they must be mapped to the QA data.
+
+### 4️⃣ QA data generation
+
+Starting from the files stored in `raw_data/`, the flow is **parsing → chunking → QA data generation**.
+The QA data is generated with the `GPT-4o-mini` model, **20 items** in total. The `making.sh` script below drives the whole flow; a minimal Python sketch of the same steps follows it.
+
+```bash
+cd autorag-workspace
+sh making.sh
+```
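For orientation, here is a minimal sketch of what the parse → chunk → QA flow behind `making.sh` could look like in Python. `Chunker`, `make_single_content_qa`, and `generate_qa_llama_index` are defined later in this commit; the `Parser` class, the concrete paths, and the numbered parquet filenames are assumptions based on upstream AutoRAG rather than anything this excerpt pins down.

```python
# Illustrative sketch only -- the real run is driven by making.sh / main.py.
import pandas as pd
from llama_index.llms.openai import OpenAI

from autorag.parser import Parser  # assumed from upstream AutoRAG; not shown in this commit excerpt
from autorag.chunker import Chunker  # defined in autorag-workspace/autorag/chunker.py below
from autorag.data.legacy.qacreation import make_single_content_qa, generate_qa_llama_index

project_dir = "projects/project_name"  # hypothetical project folder

# 1. Parse the raw documents according to config/parse.yaml
parser = Parser(data_path_glob=f"{project_dir}/raw_data/*.pdf", project_dir=f"{project_dir}/parse")
parser.start_parsing(f"{project_dir}/config/parse.yaml")

# 2. Chunk the parsed result according to config/chunk.yaml
chunker = Chunker.from_parquet(f"{project_dir}/parse/0.parquet", project_dir=f"{project_dir}/chunk")
chunker.start_chunking(f"{project_dir}/config/chunk.yaml")

# 3. Generate 20 QA pairs with gpt-4o-mini from the chunked corpus
corpus_df = pd.read_parquet(f"{project_dir}/chunk/0.parquet")
qa_df = make_single_content_qa(
    corpus_df,
    content_size=20,
    qa_creation_func=generate_qa_llama_index,
    llm=OpenAI(model="gpt-4o-mini"),
    question_num_per_content=1,
    output_filepath=f"{project_dir}/qa.parquet",
    upsert=True,
)
```

The chunked corpus parquet and the generated `qa.parquet` are the two datasets that the evaluation step below consumes.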
+
+---
+
+## 🔍 RAG Pipeline Evaluation
+
+### 1️⃣ Download the Ollama model
+
+Run this in WSL (Windows Subsystem for Linux).
+
+```bash
+docker exec -it autorag-ollama bash
+ollama pull phi4
+ollama list
+```
+
+### 2️⃣ Run the AutoRAG evaluation
+
+```bash
+cd autorag-workspace
+python main.py
+```
+
+### 3️⃣ Check the evaluation results
+
+The evaluation results are saved under the `benchmark_*` path inside the project folder.
+
+#### ✅ Overall pipeline results
+
+```bash
+cd projects/project_name/benchmark_{*}/*/
+summary.csv
+```
+
+#### ✅ Detailed results
+
+- **Retrieval node line results**
+  ```bash
+  cd ./retrieve_node_line
+  summary.csv
+  ```
+- **Retrieval node results**
+  ```bash
+  cd ./retrieve_node_line/retrieval
+  summary.csv
+  ```
+- **Reranker node results**
+  ```bash
+  cd ./retrieve_node_line/passage_reranker
+  summary.csv
+  ```
+- **Generator node line results**
+  ```bash
+  cd ./post_retrieve_node_line
+  summary.csv
+  ```
+- **Generator node results**
+  ```bash
+  cd ./post_retrieve_node_line/generator
+  summary.csv
+  ```
+
+> 📌 **Note:** The `./projects/example_01` folder is an example that was run end to end, from data generation through evaluation.
+
+---
+
+## 📊 Run the evaluation dashboard
+
+```bash
+cd autorag-workspace
+sh dashboard.sh
+```
+
+Here you can inspect the AutoRAG evaluation results in detail.
diff --git a/autorag-workspace/autorag/VERSION b/autorag-workspace/autorag/VERSION
new file mode 100644
index 0000000..0b69c00
--- /dev/null
+++ b/autorag-workspace/autorag/VERSION
@@ -0,0 +1 @@
+0.3.14
diff --git a/autorag-workspace/autorag/__init__.py b/autorag-workspace/autorag/__init__.py
new file mode 100644
index 0000000..b8d2bed
--- /dev/null
+++ b/autorag-workspace/autorag/__init__.py
@@ -0,0 +1,113 @@
+import logging
+import logging.config
+import os
+import sys
+from random import random
+from typing import List, Any
+
+from llama_index.core.embeddings.mock_embed_model import MockEmbedding
+from llama_index.core.base.llms.types import CompletionResponse
+from llama_index.core.llms.mock import MockLLM
+from llama_index.llms.bedrock import Bedrock
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.embeddings.openai import OpenAIEmbeddingModelType
+
+from llama_index.llms.openai import OpenAI
+from llama_index.llms.openai_like import OpenAILike
+from langchain_openai.embeddings import OpenAIEmbeddings
+from rich.logging import RichHandler
+
+from llama_index.llms.ollama import Ollama
+
+version_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "VERSION")
+
+with open(version_path, "r") as f:
+	__version__ = f.read().strip()
+
+
+class LazyInit:
+	def __init__(self, factory, *args, **kwargs):
+		self._factory = factory
+		self._args = args
+		self._kwargs = kwargs
+		self._instance = None
+
+	def __call__(self):
+		if self._instance is None:
+			self._instance = self._factory(*self._args, **self._kwargs)
+		return self._instance
+
+	def __getattr__(self, name):
+		if self._instance is None:
+			self._instance = self._factory(*self._args, **self._kwargs)
+		return getattr(self._instance, name)
+
+
+rich_format = "[%(filename)s:%(lineno)s] >> %(message)s"
+logging.basicConfig(
+	level="INFO", format=rich_format, handlers=[RichHandler(rich_tracebacks=True)]
+)
+logger = logging.getLogger("AutoRAG")
+
+
+def handle_exception(exc_type, exc_value, exc_traceback):
+	logger = logging.getLogger("AutoRAG")
+	logger.error("Unexpected exception", exc_info=(exc_type, exc_value, exc_traceback))
+
+
+sys.excepthook = handle_exception
+
+
+class AutoRAGBedrock(Bedrock):
+	async def acomplete(
+		self, prompt: str, formatted: bool = False, **kwargs: Any
+	) -> CompletionResponse:
+		return self.complete(prompt, formatted=formatted, **kwargs)
+
+
+generator_models = {
+	"openai": OpenAI,
"openailike": OpenAILike, + "mock": MockLLM, + "bedrock": AutoRAGBedrock, + "ollama": Ollama, +} + +# embedding_models = { + +# } + +try: + from llama_index.llms.huggingface import HuggingFaceLLM + from llama_index.llms.ollama import Ollama + + generator_models["huggingfacellm"] = HuggingFaceLLM + generator_models["ollama"] = Ollama + +except ImportError: + logger.info( + "You are using API version of AutoRAG. " + "To use local version, run pip install 'AutoRAG[gpu]'" + ) + +# try: +# from llama_index.embeddings.huggingface import HuggingFaceEmbedding +# embedding_models["hf_all_mpnet_base_v2"] = HuggingFaceEmbedding # 250312 변경 - 김용연 +# embedding_models["hf_KURE-v1"] = HuggingFaceEmbedding # 250312 변경 - 김용연 +# embedding_models["hf_snowflake-arctic-embed-l-v2.0-ko"] = HuggingFaceEmbedding # 250313 변경 - 김용연 + +# except ImportError: +# logger.info( +# "You are using API version of AutoRAG." +# "To use local version, run pip install 'AutoRAG[gpu]'" +# ) + +try: + import transformers + + transformers.logging.set_verbosity_error() +except ImportError: + logger.info( + "You are using API version of AutoRAG." + "To use local version, run pip install 'AutoRAG[gpu]'" + ) diff --git a/autorag-workspace/autorag/chunker.py b/autorag-workspace/autorag/chunker.py new file mode 100644 index 0000000..486c61f --- /dev/null +++ b/autorag-workspace/autorag/chunker.py @@ -0,0 +1,51 @@ +import logging +import os +import shutil +from typing import Optional + +import pandas as pd + +from autorag.data.chunk.run import run_chunker +from autorag.data.utils.util import load_yaml, get_param_combinations + +logger = logging.getLogger("AutoRAG") + + +class Chunker: + def __init__(self, raw_df: pd.DataFrame, project_dir: Optional[str] = None): + self.parsed_raw = raw_df + self.project_dir = project_dir if project_dir is not None else os.getcwd() + + @classmethod + def from_parquet( + cls, parsed_data_path: str, project_dir: Optional[str] = None + ) -> "Chunker": + if not os.path.exists(parsed_data_path): + raise ValueError(f"parsed_data_path {parsed_data_path} does not exist.") + if not parsed_data_path.endswith("parquet"): + raise ValueError( + f"parsed_data_path {parsed_data_path} is not a parquet file." 
+ ) + parsed_result = pd.read_parquet(parsed_data_path, engine="pyarrow") + return cls(parsed_result, project_dir) + + def start_chunking(self, yaml_path: str): + if not os.path.exists(self.project_dir): + os.makedirs(self.project_dir) + + # Copy YAML file to the trial directory + shutil.copy(yaml_path, os.path.join(self.project_dir, "chunk_config.yaml")) + + # load yaml file + modules = load_yaml(yaml_path) + + input_modules, input_params = get_param_combinations(modules) + + logger.info("Chunking Start...") + run_chunker( + modules=input_modules, + module_params=input_params, + parsed_result=self.parsed_raw, + project_dir=self.project_dir, + ) + logger.info("Chunking Done!") diff --git a/autorag-workspace/autorag/cli.py b/autorag-workspace/autorag/cli.py new file mode 100644 index 0000000..d114b83 --- /dev/null +++ b/autorag-workspace/autorag/cli.py @@ -0,0 +1,209 @@ +import importlib.resources +import logging +import os +import pathlib +import subprocess +from pathlib import Path +from typing import Optional + +import click +import nest_asyncio + +from autorag import dashboard +from autorag.deploy import extract_best_config as original_extract_best_config +from autorag.deploy.api import ApiRunner +from autorag.evaluator import Evaluator +from autorag.validator import Validator + +logger = logging.getLogger("AutoRAG") + +autorag_dir = os.path.dirname(os.path.realpath(__file__)) +version_file = os.path.join(autorag_dir, "VERSION") +with open(version_file, "r") as f: + __version__ = f.read().strip() + + +@click.group() +@click.version_option(__version__) +def cli(): + pass + + +@click.command() +@click.option( + "--config", + "-c", + help="Path to config yaml file. Must be yaml or yml file.", + type=str, +) +@click.option( + "--qa_data_path", help="Path to QA dataset. Must be parquet file.", type=str +) +@click.option( + "--corpus_data_path", help="Path to corpus dataset. Must be parquet file.", type=str +) +@click.option( + "--project_dir", help="Path to project directory.", type=str, default=None +) +@click.option( + "--skip_validation", + help="Skip validation or not. 
Default is False.", + type=bool, + default=False, +) +def evaluate(config, qa_data_path, corpus_data_path, project_dir, skip_validation): + if not config.endswith(".yaml") and not config.endswith(".yml"): + raise ValueError(f"Config file {config} is not a yaml or yml file.") + if not os.path.exists(config): + raise ValueError(f"Config file {config} does not exist.") + evaluator = Evaluator(qa_data_path, corpus_data_path, project_dir=project_dir) + evaluator.start_trial(config, skip_validation=skip_validation) + + +@click.command() +@click.option( + "--config_path", type=str, help="Path to extracted config yaml file.", default=None +) +@click.option("--host", type=str, default="0.0.0.0", help="Host address") +@click.option("--port", type=int, default=8000, help="Port number") +@click.option( + "--trial_dir", + type=click.Path(file_okay=False, dir_okay=True, exists=True), + default=None, + help="Path to trial directory.", +) +@click.option( + "--project_dir", help="Path to project directory.", type=str, default=None +) +@click.option( + "--remote", help="Run the API server in remote mode.", type=bool, default=False +) +def run_api(config_path, host, port, trial_dir, project_dir, remote: bool): + if trial_dir is None: + runner = ApiRunner.from_yaml(config_path, project_dir=project_dir) + else: + runner = ApiRunner.from_trial_folder(trial_dir) + logger.info(f"Running API server at {host}:{port}...") + nest_asyncio.apply() + runner.run_api_server(host, port, remote=remote) + + +@click.command() +@click.option( + "--yaml_path", type=click.Path(path_type=Path), help="Path to the YAML file." +) +@click.option( + "--project_dir", + type=click.Path(path_type=Path), + help="Path to the project directory.", +) +@click.option( + "--trial_path", type=click.Path(path_type=Path), help="Path to the trial directory." +) +def run_web( + yaml_path: Optional[str], project_dir: Optional[str], trial_path: Optional[str] +): + try: + with importlib.resources.path("autorag", "web.py") as web_path: + web_py_path = str(web_path) + except ImportError: + raise ImportError( + "Could not locate the web.py file within the autorag package." + " Please ensure that autorag is correctly installed." + ) + + if not yaml_path and not trial_path: + raise ValueError("yaml_path or trial_path must be given.") + elif yaml_path and trial_path: + raise ValueError("yaml_path and trial_path cannot be given at the same time.") + elif yaml_path and not project_dir: + subprocess.run( + ["streamlit", "run", web_py_path, "--", "--yaml_path", yaml_path] + ) + elif yaml_path and project_dir: + subprocess.run( + [ + "streamlit", + "run", + web_py_path, + "--", + "--yaml_path", + yaml_path, + "--project_dir", + project_dir, + ] + ) + elif trial_path: + subprocess.run( + ["streamlit", "run", web_py_path, "--", "--trial_path", trial_path] + ) + + +@click.command() +@click.option( + "--trial_dir", + type=click.Path(dir_okay=True, file_okay=False, exists=True), + required=True, +) +@click.option( + "--port", type=int, default=7690, help="Port number. The default is 7690." +) +def run_dashboard(trial_dir: str, port: int): + dashboard.run(trial_dir, port=port) + + +@click.command() +@click.option("--trial_path", type=click.Path(), help="Path to the trial directory.") +@click.option( + "--output_path", + type=click.Path(), + help="Path to the output directory." 
" Must be .yaml or .yml file.", +) +def extract_best_config(trial_path: str, output_path: str): + original_extract_best_config(trial_path, output_path) + + +@click.command() +@click.option("--trial_path", help="Path to trial directory.", type=str) +def restart_evaluate(trial_path): + if not os.path.exists(trial_path): + raise ValueError(f"trial_path {trial_path} does not exist.") + project_dir = str(pathlib.PurePath(trial_path).parent) + qa_data_path = os.path.join(project_dir, "data", "qa.parquet") + corpus_data_path = os.path.join(project_dir, "data", "corpus.parquet") + evaluator = Evaluator(qa_data_path, corpus_data_path, project_dir) + evaluator.restart_trial(trial_path) + + +@click.command() +@click.option( + "--config", + "-c", + help="Path to config yaml file. Must be yaml or yml file.", + type=str, +) +@click.option( + "--qa_data_path", help="Path to QA dataset. Must be parquet file.", type=str +) +@click.option( + "--corpus_data_path", help="Path to corpus dataset. Must be parquet file.", type=str +) +def validate(config, qa_data_path, corpus_data_path): + if not config.endswith(".yaml") and not config.endswith(".yml"): + raise ValueError(f"Config file {config} is not a parquet file.") + if not os.path.exists(config): + raise ValueError(f"Config file {config} does not exist.") + validator = Validator(qa_data_path=qa_data_path, corpus_data_path=corpus_data_path) + validator.validate(config) + + +cli.add_command(evaluate, "evaluate") +cli.add_command(run_api, "run_api") +cli.add_command(run_web, "run_web") +cli.add_command(run_dashboard, "dashboard") +cli.add_command(extract_best_config, "extract_best_config") +cli.add_command(restart_evaluate, "restart_evaluate") +cli.add_command(validate, "validate") + +if __name__ == "__main__": + cli() diff --git a/autorag-workspace/autorag/dashboard.py b/autorag-workspace/autorag/dashboard.py new file mode 100644 index 0000000..b55d2e9 --- /dev/null +++ b/autorag-workspace/autorag/dashboard.py @@ -0,0 +1,199 @@ +import ast +import logging +import os +from typing import Dict, List + +import matplotlib.pyplot as plt +import pandas as pd +import panel as pn +import seaborn as sns +import yaml +from bokeh.models import NumberFormatter, BooleanFormatter + +from autorag.utils.util import dict_to_markdown, dict_to_markdown_table + +pn.extension( + "terminal", + "tabulator", + "mathjax", + "ipywidgets", + console_output="disable", + sizing_mode="stretch_width", + css_files=[ + "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css" + ], +) +logger = logging.getLogger("AutoRAG") + + +def find_node_dir(trial_dir: str) -> List[str]: + trial_summary_df = pd.read_csv(os.path.join(trial_dir, "summary.csv")) + result_paths = [] + for idx, row in trial_summary_df.iterrows(): + node_line_name = row["node_line_name"] + node_type = row["node_type"] + result_paths.append(os.path.join(trial_dir, node_line_name, node_type)) + return result_paths + + +def get_metric_values(node_summary_df: pd.DataFrame) -> Dict: + non_metric_column_names = [ + "filename", + "module_name", + "module_params", + "execution_time", + "average_output_token", + "is_best", + ] + best_row = node_summary_df.loc[node_summary_df["is_best"]].drop( + columns=non_metric_column_names, errors="ignore" + ) + assert len(best_row) == 1, "The best module must be only one." 
+ return best_row.iloc[0].to_dict() + + +def make_trial_summary_md(trial_dir): + markdown_text = f"""# Trial Result Summary +- Trial Directory : {trial_dir} + +""" + node_dirs = find_node_dir(trial_dir) + for node_dir in node_dirs: + node_summary_filepath = os.path.join(node_dir, "summary.csv") + node_type = os.path.basename(node_dir) + node_summary_df = pd.read_csv(node_summary_filepath) + best_row = node_summary_df.loc[node_summary_df["is_best"]].iloc[0] + metric_dict = get_metric_values(node_summary_df) + markdown_text += f"""--- + +## {node_type} best module + +### Module Name + +{best_row['module_name']} + +### Module Params + +{dict_to_markdown(ast.literal_eval(best_row['module_params']), level=3)} + +### Metric Values + +{dict_to_markdown_table(metric_dict, key_column_name='metric_name', value_column_name='metric_value')} + +""" + + return markdown_text + + +def node_view(node_dir: str): + non_metric_column_names = [ + "filename", + "module_name", + "module_params", + "execution_time", + "average_output_token", + "is_best", + ] + summary_df = pd.read_csv(os.path.join(node_dir, "summary.csv")) + bokeh_formatters = { + "float": NumberFormatter(format="0.000"), + "bool": BooleanFormatter(), + } + first_df = pd.read_parquet(os.path.join(node_dir, "0.parquet"), engine="pyarrow") + + each_module_df_widget = pn.widgets.Tabulator( + pd.DataFrame(columns=first_df.columns), + name="Module DataFrame", + formatters=bokeh_formatters, + pagination="local", + page_size=20, + widths=150, + ) + + def change_module_widget(event): + if event.column == "detail": + filename = summary_df["filename"].iloc[event.row] + filepath = os.path.join(node_dir, filename) + each_module_df = pd.read_parquet(filepath, engine="pyarrow") + each_module_df_widget.value = each_module_df + + df_widget = pn.widgets.Tabulator( + summary_df, + name="Summary DataFrame", + formatters=bokeh_formatters, + buttons={"detail": ''}, + widths=150, + ) + df_widget.on_click(change_module_widget) + + try: + fig, ax = plt.subplots(figsize=(10, 5)) + metric_df = summary_df.drop(columns=non_metric_column_names, errors="ignore") + sns.stripplot(data=metric_df, ax=ax) + strip_plot_pane = pn.pane.Matplotlib(fig, tight=True) + + fig2, ax2 = plt.subplots(figsize=(10, 5)) + sns.boxplot(data=metric_df, ax=ax2) + box_plot_pane = pn.pane.Matplotlib(fig2, tight=True) + plot_pane = pn.Row(strip_plot_pane, box_plot_pane) + + layout = pn.Column( + "## Summary distribution plot", + plot_pane, + "## Summary DataFrame", + df_widget, + "## Module Result DataFrame", + each_module_df_widget, + ) + except Exception as e: + logger.error(f"Skipping make boxplot and stripplot with error {e}") + layout = pn.Column("## Summary DataFrame", df_widget) + layout.servable() + return layout + + +CSS = """ +div.card-margin:nth-child(1) { + max-height: 300px; +} +div.card-margin:nth-child(2) { + max-height: 400px; +} +""" + + +def yaml_to_markdown(yaml_filepath): + markdown_content = "" + with open(yaml_filepath, "r", encoding="utf-8") as file: + try: + content = yaml.safe_load(file) + markdown_content += f"## {os.path.basename(yaml_filepath)}\n```yaml\n{yaml.safe_dump(content, allow_unicode=True)}\n```\n\n" + except yaml.YAMLError as exc: + print(f"Error in {yaml_filepath}: {exc}") + return markdown_content + + +def run(trial_dir: str, port: int = 7690): + trial_summary_md = make_trial_summary_md(trial_dir=trial_dir) + trial_summary_tab = pn.pane.Markdown(trial_summary_md, sizing_mode="stretch_width") + + node_views = [ + (str(os.path.basename(node_dir)), 
node_view(node_dir)) + for node_dir in find_node_dir(trial_dir) + ] + + yaml_file_markdown = yaml_to_markdown(os.path.join(trial_dir, "config.yaml")) + + yaml_file = pn.pane.Markdown(yaml_file_markdown, sizing_mode="stretch_width") + + tabs = pn.Tabs( + ("Summary", trial_summary_tab), + *node_views, + ("Used YAML file", yaml_file), + dynamic=True, + ) + + template = pn.template.FastListTemplate( + site="AutoRAG", title="Dashboard", main=[tabs], raw_css=[CSS] + ).servable() + template.show(port=port) diff --git a/autorag-workspace/autorag/data/__init__.py b/autorag-workspace/autorag/data/__init__.py new file mode 100644 index 0000000..8870996 --- /dev/null +++ b/autorag-workspace/autorag/data/__init__.py @@ -0,0 +1,109 @@ +import logging +from typing import List, Callable + +from langchain_community.document_loaders import ( + PDFMinerLoader, + PDFPlumberLoader, + PyPDFium2Loader, + PyPDFLoader, + PyMuPDFLoader, + UnstructuredPDFLoader, + CSVLoader, + JSONLoader, + UnstructuredMarkdownLoader, + BSHTMLLoader, + UnstructuredXMLLoader, + DirectoryLoader, +) +from langchain_unstructured import UnstructuredLoader +from langchain_upstage import UpstageDocumentParseLoader + +from llama_index.core.node_parser import ( + TokenTextSplitter, + SentenceSplitter, + SentenceWindowNodeParser, + SemanticSplitterNodeParser, + SemanticDoubleMergingSplitterNodeParser, + SimpleFileNodeParser, +) +from langchain.text_splitter import ( + RecursiveCharacterTextSplitter, + CharacterTextSplitter, + KonlpyTextSplitter, + SentenceTransformersTokenTextSplitter, +) + +from autorag import LazyInit + +logger = logging.getLogger("AutoRAG") + +parse_modules = { + # PDF + "pdfminer": PDFMinerLoader, + "pdfplumber": PDFPlumberLoader, + "pypdfium2": PyPDFium2Loader, + "pypdf": PyPDFLoader, + "pymupdf": PyMuPDFLoader, + "unstructuredpdf": UnstructuredPDFLoader, + # Common File Types + # 1. CSV + "csv": CSVLoader, + # 2. JSON + "json": JSONLoader, + # 3. Markdown + "unstructuredmarkdown": UnstructuredMarkdownLoader, + # 4. HTML + "bshtml": BSHTMLLoader, + # 5. XML + "unstructuredxml": UnstructuredXMLLoader, + # 6. All files + "directory": DirectoryLoader, + "unstructured": UnstructuredLoader, + "upstagedocumentparse": UpstageDocumentParseLoader, +} + +chunk_modules = { + # Llama Index + # Token + "token": TokenTextSplitter, + # Sentence + "sentence": SentenceSplitter, + # window + "sentencewindow": SentenceWindowNodeParser, + # Semantic + "semantic_llama_index": SemanticSplitterNodeParser, + "semanticdoublemerging": SemanticDoubleMergingSplitterNodeParser, + # Simple + "simplefile": SimpleFileNodeParser, + # LangChain + # Token + "sentencetransformerstoken": SentenceTransformersTokenTextSplitter, + # Character + "recursivecharacter": RecursiveCharacterTextSplitter, + "character": CharacterTextSplitter, + # Sentence + "konlpy": KonlpyTextSplitter, +} + + +def split_by_sentence_kiwi() -> Callable[[str], List[str]]: + try: + from kiwipiepy import Kiwi + except ImportError: + raise ImportError( + "You need to install kiwipiepy to use 'ko_kiwi' tokenizer. " + "Please install kiwipiepy by running 'pip install kiwipiepy'. " + "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'." 
+ ) + kiwi = Kiwi() + + def split(text: str) -> List[str]: + kiwi_result = kiwi.split_into_sents(text) + sentences = list(map(lambda x: x.text, kiwi_result)) + + return sentences + + return split + + +sentence_splitter_modules = {"kiwi": LazyInit(split_by_sentence_kiwi)} diff --git a/autorag-workspace/autorag/data/chunk/__init__.py b/autorag-workspace/autorag/data/chunk/__init__.py new file mode 100644 index 0000000..4fdf55f --- /dev/null +++ b/autorag-workspace/autorag/data/chunk/__init__.py @@ -0,0 +1,2 @@ +from .llama_index_chunk import llama_index_chunk +from .langchain_chunk import langchain_chunk diff --git a/autorag-workspace/autorag/data/chunk/base.py b/autorag-workspace/autorag/data/chunk/base.py new file mode 100644 index 0000000..35a191e --- /dev/null +++ b/autorag-workspace/autorag/data/chunk/base.py @@ -0,0 +1,128 @@ +import functools +import logging +from typing import Tuple, List, Dict, Any + +import pandas as pd + +from autorag.embedding.base import EmbeddingModel +from autorag.data import chunk_modules, sentence_splitter_modules +from autorag.utils import result_to_dataframe + +logger = logging.getLogger("AutoRAG") + + +def chunker_node(func): + @functools.wraps(func) + @result_to_dataframe(["doc_id", "contents", "path", "start_end_idx", "metadata"]) + def wrapper( + parsed_result: pd.DataFrame, chunk_method: str, **kwargs + ) -> Tuple[ + List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]] + ]: + logger.info(f"Running chunker - {func.__name__} module...") + + # get texts from parsed_result + texts = parsed_result["texts"].tolist() + + # get filenames from parsed_result when 'add_file_name' is setting + file_name_language = kwargs.pop("add_file_name", None) + metadata_list = make_metadata_list(parsed_result) + + # run chunk module + if func.__name__ in ["llama_index_chunk", "langchain_chunk"]: + chunk_instance = __get_chunk_instance( + func.__name__, chunk_method.lower(), **kwargs + ) + result = func( + texts=texts, + chunker=chunk_instance, + file_name_language=file_name_language, + metadata_list=metadata_list, + ) + del chunk_instance + return result + else: + raise ValueError(f"Unsupported module_type: {func.__name__}") + + return wrapper + + +def make_metadata_list(parsed_result: pd.DataFrame) -> List[Dict[str, str]]: + metadata_list = [{} for _ in range(len(parsed_result["texts"]))] + + def _make_metadata_pure( + lst: List[str], key: str, metadata_lst: List[Dict[str, str]] + ): + for value, metadata in zip(lst, metadata_lst): + metadata[key] = value + + for column in ["page", "last_modified_datetime", "path"]: + if column in parsed_result.columns: + _make_metadata_pure(parsed_result[column].tolist(), column, metadata_list) + return metadata_list + + +def __get_chunk_instance(module_type: str, chunk_method: str, **kwargs): + # Add sentence_splitter to kwargs + sentence_available_methods = [ + "semantic_llama_index", + "semanticdoublemerging", + "sentencewindow", + ] + if chunk_method in sentence_available_methods: + # llama index default sentence_splitter is 'nltk -PunktSentenceTokenizer' + if "sentence_splitter" in kwargs.keys(): + sentence_splitter_str = kwargs.pop("sentence_splitter") + sentence_splitter_func = sentence_splitter_modules[sentence_splitter_str]() + kwargs.update({"sentence_splitter": sentence_splitter_func}) + + def get_embedding_model(_embed_model_str: str, _module_type: str): + if _embed_model_str == "openai": + if _module_type == "langchain_chunk": + _embed_model_str = "openai_langchain" + return 
EmbeddingModel.load(_embed_model_str)() + + # Add embed_model to kwargs + embedding_available_methods = ["semantic_llama_index", "semantic_langchain"] + if chunk_method in embedding_available_methods: + # there is no default embed_model, so we have to get it parameter and add it. + if "embed_model" not in kwargs.keys(): + raise ValueError(f"embed_model is required for {chunk_method} method.") + embed_model_str = kwargs.pop("embed_model") + embed_model = get_embedding_model(embed_model_str, module_type) + if chunk_method == "semantic_llama_index": + kwargs.update({"embed_model": embed_model}) + elif chunk_method == "semantic_langchain": + kwargs.update({"embeddings": embed_model}) + + return chunk_modules[chunk_method](**kwargs) + + +def add_file_name( + file_name_language: str, file_names: List[str], chunk_texts: List[str] +) -> List[str]: + if file_name_language == "en": + return list( + map( + lambda x: f"file_name: {x[1]}\n contents: {x[0]}", + zip(chunk_texts, file_names), + ) + ) + elif file_name_language == "ko": + return list( + map( + lambda x: f"파일 제목: {x[1]}\n 내용: {x[0]}", + zip(chunk_texts, file_names), + ) + ) + elif file_name_language == "ja": + return list( + map( + lambda x: f"ファイル名: {x[1]}\n 内容: {x[0]}", + zip(chunk_texts, file_names), + ) + ) + else: + raise ValueError( + f"Unsupported file_name_language: {file_name_language}. Choose from 'en' ,'ko' or 'ja." + ) diff --git a/autorag-workspace/autorag/data/chunk/langchain_chunk.py b/autorag-workspace/autorag/data/chunk/langchain_chunk.py new file mode 100644 index 0000000..7e967e2 --- /dev/null +++ b/autorag-workspace/autorag/data/chunk/langchain_chunk.py @@ -0,0 +1,76 @@ +import os +from itertools import chain +import uuid +from typing import Tuple, List, Dict, Any, Optional + +from langchain_text_splitters import TextSplitter + +from autorag.data.chunk.base import chunker_node, add_file_name +from autorag.data.utils.util import add_essential_metadata, get_start_end_idx + + +@chunker_node +def langchain_chunk( + texts: List[str], + chunker: TextSplitter, + file_name_language: Optional[str] = None, + metadata_list: Optional[List[Dict[str, str]]] = None, +) -> Tuple[ + List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]] +]: + """ + Chunk texts from the parsed result to use langchain chunk method + + :param texts: The list of texts to chunk from the parsed result + :param chunker: A langchain TextSplitter(Chunker) instance. + :param file_name_language: The language to use 'add_file_name' feature. + You need to set one of 'English' and 'Korean' + The 'add_file_name' feature is to add a file_name to chunked_contents. + This is used to prevent hallucination by retrieving contents from the wrong document. 
+ Default form of 'English' is "file_name: {file_name}\n contents: {content}" + :param metadata_list: The list of dict of metadata from the parsed result + :return: tuple of lists containing the chunked doc_id, contents, path, start_idx, end_idx and metadata + """ + results = [ + langchain_chunk_pure(text, chunker, file_name_language, meta) + for text, meta in zip(texts, metadata_list) + ] + + doc_id, contents, path, start_end_idx, metadata = ( + list(chain.from_iterable(item)) for item in zip(*results) + ) + + return doc_id, contents, path, start_end_idx, metadata + + +def langchain_chunk_pure( + text: str, + chunker: TextSplitter, + file_name_language: Optional[str] = None, + _metadata: Optional[Dict[str, str]] = None, +): + # chunk + chunk_results = chunker.create_documents([text], metadatas=[_metadata]) + + # make doc_id + doc_id = list(str(uuid.uuid4()) for _ in range(len(chunk_results))) + + # make path + path_lst = list(map(lambda x: x.metadata.get("path", ""), chunk_results)) + + # make contents and start_end_idx + if file_name_language: + chunked_file_names = list(map(lambda x: os.path.basename(x), path_lst)) + chunked_texts = list(map(lambda x: x.page_content, chunk_results)) + start_end_idx = list(map(lambda x: get_start_end_idx(text, x), chunked_texts)) + contents = add_file_name(file_name_language, chunked_file_names, chunked_texts) + else: + contents = list(map(lambda node: node.page_content, chunk_results)) + start_end_idx = list(map(lambda x: get_start_end_idx(text, x), contents)) + + # make metadata + metadata = list( + map(lambda node: add_essential_metadata(node.metadata), chunk_results) + ) + + return doc_id, contents, path_lst, start_end_idx, metadata diff --git a/autorag-workspace/autorag/data/chunk/llama_index_chunk.py b/autorag-workspace/autorag/data/chunk/llama_index_chunk.py new file mode 100644 index 0000000..36e5403 --- /dev/null +++ b/autorag-workspace/autorag/data/chunk/llama_index_chunk.py @@ -0,0 +1,96 @@ +import os.path +from itertools import chain +from typing import Tuple, List, Dict, Any, Optional + +from llama_index.core import Document +from llama_index.core.node_parser.interface import NodeParser + +from autorag.utils.util import process_batch, get_event_loop +from autorag.data.chunk.base import chunker_node, add_file_name +from autorag.data.utils.util import ( + add_essential_metadata_llama_text_node, + get_start_end_idx, +) + + +@chunker_node +def llama_index_chunk( + texts: List[str], + chunker: NodeParser, + file_name_language: Optional[str] = None, + metadata_list: Optional[List[Dict[str, str]]] = None, + batch: int = 8, +) -> Tuple[ + List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]] +]: + """ + Chunk texts from the parsed result to use llama index chunk method + + :param texts: The list of texts to chunk from the parsed result + :param chunker: A llama index NodeParser(Chunker) instance. + :param file_name_language: The language to use 'add_file_name' feature. + You need to set one of 'English' and 'Korean' + The 'add_file_name' feature is to add a file_name to chunked_contents. + This is used to prevent hallucination by retrieving contents from the wrong document. + Default form of 'English' is "file_name: {file_name}\n contents: {content}" + :param metadata_list: The list of dict of metadata from the parsed result + :param batch: The batch size for chunk texts. 
Default is 8 + :return: tuple of lists containing the chunked doc_id, contents, path, start_idx, end_idx and metadata + """ + tasks = [ + llama_index_chunk_pure(text, chunker, file_name_language, meta) + for text, meta in zip(texts, metadata_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + doc_id, contents, path, start_end_idx, metadata = ( + list(chain.from_iterable(item)) for item in zip(*results) + ) + + return list(doc_id), list(contents), list(path), list(start_end_idx), list(metadata) + + +async def llama_index_chunk_pure( + text: str, + chunker: NodeParser, + file_name_language: Optional[str] = None, + _metadata: Optional[Dict[str, str]] = None, +): + # set document + document = [Document(text=text, metadata=_metadata)] + + # chunk document + chunk_results = await chunker.aget_nodes_from_documents(documents=document) + + # make doc_id + doc_id = list(map(lambda node: node.node_id, chunk_results)) + + # make path + path_lst = list(map(lambda x: x.metadata.get("path", ""), chunk_results)) + + # make contents and start_end_idx + if file_name_language: + chunked_file_names = list(map(lambda x: os.path.basename(x), path_lst)) + chunked_texts = list(map(lambda x: x.text, chunk_results)) + start_end_idx = list( + map( + lambda x: get_start_end_idx(text, x), + chunked_texts, + ) + ) + contents = add_file_name(file_name_language, chunked_file_names, chunked_texts) + else: + contents = list(map(lambda x: x.text, chunk_results)) + start_end_idx = list(map(lambda x: get_start_end_idx(text, x), contents)) + + metadata = list( + map( + lambda node: add_essential_metadata_llama_text_node( + node.metadata, node.relationships + ), + chunk_results, + ) + ) + + return doc_id, contents, path_lst, start_end_idx, metadata diff --git a/autorag-workspace/autorag/data/chunk/run.py b/autorag-workspace/autorag/data/chunk/run.py new file mode 100644 index 0000000..50233ec --- /dev/null +++ b/autorag-workspace/autorag/data/chunk/run.py @@ -0,0 +1,38 @@ +import os +from typing import Callable, List, Dict +import pandas as pd + +from autorag.strategy import measure_speed + + +def run_chunker( + modules: List[Callable], + module_params: List[Dict], + parsed_result: pd.DataFrame, + project_dir: str, +): + results, execution_times = zip( + *map( + lambda x: measure_speed(x[0], parsed_result=parsed_result, **x[1]), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # save results to parquet files + filepaths = list( + map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules))) + ) + list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths))) + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + } + ) + summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False) + return summary_df diff --git a/autorag-workspace/autorag/data/legacy/__init__.py b/autorag-workspace/autorag/data/legacy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/legacy/corpus/__init__.py b/autorag-workspace/autorag/data/legacy/corpus/__init__.py new file mode 100644 index 0000000..6d3763c --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/corpus/__init__.py @@ -0,0 +1,2 @@ +from .langchain import 
langchain_documents_to_parquet +from .llama_index import llama_documents_to_parquet, llama_text_node_to_parquet diff --git a/autorag-workspace/autorag/data/legacy/corpus/langchain.py b/autorag-workspace/autorag/data/legacy/corpus/langchain.py new file mode 100644 index 0000000..ab9bd19 --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/corpus/langchain.py @@ -0,0 +1,47 @@ +import uuid +from typing import List, Optional + +import pandas as pd +from langchain_core.documents import Document + +from autorag.data.utils.util import add_essential_metadata +from autorag.utils.util import save_parquet_safe + + +def langchain_documents_to_parquet( + langchain_documents: List[Document], + output_filepath: Optional[str] = None, + upsert: bool = False, +) -> pd.DataFrame: + """ + Langchain documents to corpus dataframe. + Corpus dataframe will be saved to filepath(file_dir/filename) if given. + Return corpus dataframe whether the filepath is given. + You can use this method to create corpus.parquet after load and chunk using Llama Index. + + :param langchain_documents: List of langchain documents. + :param output_filepath: Optional filepath to save the parquet file. + If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet. + File directory must exist. File extension must be .parquet + :param upsert: If true, the function will overwrite the existing file if it exists. + Default is False. + :return: Corpus data as pd.DataFrame + """ + + corpus_df = pd.DataFrame( + list( + map( + lambda doc: { + "doc_id": str(uuid.uuid4()), + "contents": doc.page_content, + "metadata": add_essential_metadata(doc.metadata), + }, + langchain_documents, + ) + ) + ) + + if output_filepath is not None: + save_parquet_safe(corpus_df, output_filepath, upsert=upsert) + + return corpus_df diff --git a/autorag-workspace/autorag/data/legacy/corpus/llama_index.py b/autorag-workspace/autorag/data/legacy/corpus/llama_index.py new file mode 100644 index 0000000..3182c39 --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/corpus/llama_index.py @@ -0,0 +1,93 @@ +import uuid +from typing import List, Optional + +import pandas as pd +from llama_index.core import Document +from llama_index.core.schema import TextNode + +from autorag.data.utils.util import ( + add_essential_metadata, + add_essential_metadata_llama_text_node, +) +from autorag.utils.util import save_parquet_safe + + +def llama_documents_to_parquet( + llama_documents: List[Document], + output_filepath: Optional[str] = None, + upsert: bool = False, +) -> pd.DataFrame: + """ + Llama Index documents to corpus dataframe. + Corpus dataframe will be saved to filepath(file_dir/filename) if given. + Return corpus dataframe whether the filepath is given. + You can use this method to create corpus.parquet after load and chunk using Llama Index. + + :param llama_documents: List[Document] + :param output_filepath: Optional filepath to save the parquet file. + If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet. + File directory must exist. File extension must be .parquet + :param upsert: If true, the function will overwrite the existing file if it exists. + Default is False. 
+ :return: Corpus data as pd.DataFrame + """ + + doc_lst = pd.DataFrame( + list( + map( + lambda doc: { + "doc_id": str(uuid.uuid4()), + "contents": doc.text, + "metadata": add_essential_metadata(doc.metadata), + }, + llama_documents, + ) + ) + ) + + processed_df = pd.DataFrame(doc_lst) + + if output_filepath is not None: + save_parquet_safe(processed_df, output_filepath, upsert=upsert) + + return processed_df + + +def llama_text_node_to_parquet( + text_nodes: List[TextNode], + output_filepath: Optional[str] = None, + upsert: bool = False, +) -> pd.DataFrame: + """ + Llama Index text nodes to corpus dataframe. + Corpus dataframe will be saved to filepath(file_dir/filename) if given. + Return corpus dataframe whether the filepath is given. + You can use this method to create corpus.parquet after load and chunk using Llama Index. + + :param text_nodes: List of llama index text nodes. + :param output_filepath: Optional filepath to save the parquet file. + If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet. + File directory must exist. File extension must be .parquet + :param upsert: If true, the function will overwrite the existing file if it exists. + Default is False. + :return: Corpus data as pd.DataFrame + """ + corpus_df = pd.DataFrame( + list( + map( + lambda node: { + "doc_id": node.node_id, + "contents": node.text, + "metadata": add_essential_metadata_llama_text_node( + node.metadata, node.relationships + ), + }, + text_nodes, + ) + ) + ) + + if output_filepath is not None: + save_parquet_safe(corpus_df, output_filepath, upsert=upsert) + + return corpus_df diff --git a/autorag-workspace/autorag/data/legacy/qacreation/__init__.py b/autorag-workspace/autorag/data/legacy/qacreation/__init__.py new file mode 100644 index 0000000..6a3678a --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/__init__.py @@ -0,0 +1,6 @@ +from .base import make_single_content_qa, make_qa_with_existing_qa +from .llama_index import ( + generate_qa_llama_index, + generate_answers, + generate_qa_llama_index_by_ratio, +) diff --git a/autorag-workspace/autorag/data/legacy/qacreation/base.py b/autorag-workspace/autorag/data/legacy/qacreation/base.py new file mode 100644 index 0000000..2d4d2e6 --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/base.py @@ -0,0 +1,239 @@ +import logging +import uuid +from typing import Callable, Optional, List + +import chromadb +import numpy as np +import pandas as pd +from tqdm import tqdm + +import autorag +from autorag.nodes.retrieval.vectordb import vectordb_ingest, vectordb_pure +from autorag.utils.util import ( + save_parquet_safe, + fetch_contents, + get_event_loop, + process_batch, +) + +logger = logging.getLogger("AutoRAG") + + +def make_single_content_qa( + corpus_df: pd.DataFrame, + content_size: int, + qa_creation_func: Callable, + output_filepath: Optional[str] = None, + upsert: bool = False, + random_state: int = 42, + cache_batch: int = 32, + **kwargs, +) -> pd.DataFrame: + """ + Make single content (single-hop, single-document) QA dataset using given qa_creation_func. + It generates a single content QA dataset, which means its retrieval ground truth will be only one. + It is the most basic form of QA dataset. + + :param corpus_df: The corpus dataframe to make QA dataset from. + :param content_size: This function will generate QA dataset for the given number of contents. + :param qa_creation_func: The function to create QA pairs. 
+ You can use like `generate_qa_llama_index` or `generate_qa_llama_index_by_ratio`. + The input func must have `contents` parameter for the list of content string. + :param output_filepath: Optional filepath to save the parquet file. + If None, the function will return the processed_data as pd.DataFrame, but do not save as parquet. + File directory must exist. File extension must be .parquet + :param upsert: If true, the function will overwrite the existing file if it exists. + Default is False. + :param random_state: The random state for sampling corpus from the given corpus_df. + :param cache_batch: The number of batches to use for caching the generated QA dataset. + When the cache_batch size data is generated, the dataset will save to the designated output_filepath. + If the cache_batch size is too small, the process time will be longer. + :param kwargs: The keyword arguments for qa_creation_func. + :return: QA dataset dataframe. + You can save this as parquet file to use at AutoRAG. + """ + assert content_size > 0, "content_size must be greater than 0." + if content_size > len(corpus_df): + logger.warning( + f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. " + "Setting content_size to the corpus size." + ) + content_size = len(corpus_df) + sampled_corpus = corpus_df.sample(n=content_size, random_state=random_state) + sampled_corpus = sampled_corpus.reset_index(drop=True) + + def make_query_generation_gt(row): + return row["qa"]["query"], row["qa"]["generation_gt"] + + qa_data = pd.DataFrame() + for idx, i in tqdm(enumerate(range(0, len(sampled_corpus), cache_batch))): + qa = qa_creation_func( + contents=sampled_corpus["contents"].tolist()[i : i + cache_batch], **kwargs + ) + + temp_qa_data = pd.DataFrame( + { + "qa": qa, + "retrieval_gt": sampled_corpus["doc_id"].tolist()[i : i + cache_batch], + } + ) + temp_qa_data = temp_qa_data.explode("qa", ignore_index=True) + temp_qa_data["qid"] = [str(uuid.uuid4()) for _ in range(len(temp_qa_data))] + temp_qa_data[["query", "generation_gt"]] = temp_qa_data.apply( + make_query_generation_gt, axis=1, result_type="expand" + ) + temp_qa_data = temp_qa_data.drop(columns=["qa"]) + + temp_qa_data["retrieval_gt"] = temp_qa_data["retrieval_gt"].apply( + lambda x: [[x]] + ) + temp_qa_data["generation_gt"] = temp_qa_data["generation_gt"].apply( + lambda x: [x] + ) + + if idx == 0: + qa_data = temp_qa_data + else: + qa_data = pd.concat([qa_data, temp_qa_data], ignore_index=True) + if output_filepath is not None: + save_parquet_safe(qa_data, output_filepath, upsert=upsert) + + return qa_data + + +def make_qa_with_existing_qa( + corpus_df: pd.DataFrame, + existing_query_df: pd.DataFrame, + content_size: int, + answer_creation_func: Optional[Callable] = None, + exist_gen_gt: Optional[bool] = False, + output_filepath: Optional[str] = None, + embedding_model: str = "openai_embed_3_large", + collection: Optional[chromadb.Collection] = None, + upsert: bool = False, + random_state: int = 42, + cache_batch: int = 32, + top_k: int = 3, + **kwargs, +) -> pd.DataFrame: + """ + Make single-hop QA dataset using given qa_creation_func and existing queries. + + :param corpus_df: The corpus dataframe to make QA dataset from. + :param existing_query_df: Dataframe containing existing queries to use for QA pair creation. + :param content_size: This function will generate QA dataset for the given number of contents. + :param answer_creation_func: Optional function to create answer with input query. 
+ If exist_gen_gt is False, this function must be given. + :param exist_gen_gt: Optional boolean to use existing generation_gt. + If True, the existing_query_df must have 'generation_gt' column. + If False, the answer_creation_func must be given. + :param output_filepath: Optional filepath to save the parquet file. + :param embedding_model: The embedding model to use for vectorization. + You can add your own embedding model in the autorag.embedding_models. + Please refer to how to add an embedding model in this doc: https://docs.auto-rag.com/local_model.html + The default is 'openai_embed_3_large'. + :param collection: The chromadb collection to use for vector DB. + You can make any chromadb collection and use it here. + If you already ingested the corpus_df to the collection, the embedding process will not be repeated. + The default is None. If None, it makes a temporary collection. + :param upsert: If true, the function will overwrite the existing file if it exists. + :param random_state: The random state for sampling corpus from the given corpus_df. + :param cache_batch: The number of batches to use for caching the generated QA dataset. + :param top_k: The number of sources to refer by model. + Default is 3. + :param kwargs: The keyword arguments for qa_creation_func. + :return: QA dataset dataframe. + """ + raise DeprecationWarning("This function is deprecated.") + assert ( + "query" in existing_query_df.columns + ), "existing_query_df must have 'query' column." + + if exist_gen_gt: + assert ( + "generation_gt" in existing_query_df.columns + ), "existing_query_df must have 'generation_gt' column." + else: + assert ( + answer_creation_func is not None + ), "answer_creation_func must be given when exist_gen_gt is False." + + assert content_size > 0, "content_size must be greater than 0." + if content_size > len(corpus_df): + logger.warning( + f"content_size {content_size} is larger than the corpus size {len(corpus_df)}. " + "Setting content_size to the corpus size." 
+ ) + content_size = len(corpus_df) + + logger.info("Loading local embedding model...") + embeddings = autorag.embedding_models[embedding_model]() + + # Vector DB creation + if collection is None: + chroma_client = chromadb.Client() + collection_name = "auto-rag" + collection = chroma_client.get_or_create_collection(collection_name) + + # embed corpus_df + vectordb_ingest(collection, corpus_df, embeddings) + query_embeddings = embeddings.get_text_embedding_batch( + existing_query_df["query"].tolist() + ) + + loop = get_event_loop() + tasks = [ + vectordb_pure([query_embedding], top_k, collection) + for query_embedding in query_embeddings + ] + results = loop.run_until_complete(process_batch(tasks, batch_size=cache_batch)) + retrieved_ids = list(map(lambda x: x[0], results)) + + retrieved_contents: List[List[str]] = fetch_contents(corpus_df, retrieved_ids) + input_passage_strs: List[str] = list( + map( + lambda x: "\n".join( + [f"Document {i + 1}\n{content}" for i, content in enumerate(x)] + ), + retrieved_contents, + ) + ) + + retrieved_qa_df = pd.DataFrame( + { + "qid": [str(uuid.uuid4()) for _ in range(len(existing_query_df))], + "query": existing_query_df["query"].tolist(), + "retrieval_gt": list(map(lambda x: [x], retrieved_ids)), + "input_passage_str": input_passage_strs, + } + ) + + if exist_gen_gt: + generation_gt = existing_query_df["generation_gt"].tolist() + if isinstance(generation_gt[0], np.ndarray): + retrieved_qa_df["generation_gt"] = generation_gt + else: + raise ValueError( + "In existing_query_df, generation_gt (per query) must be in the form of List[str]." + ) + + sample_qa_df = retrieved_qa_df.sample( + n=min(content_size, len(retrieved_qa_df)), random_state=random_state + ) + + qa_df = sample_qa_df.copy(deep=True) + qa_df.drop(columns=["input_passage_str"], inplace=True) + + if not exist_gen_gt: + generation_gt = answer_creation_func( + contents=sample_qa_df["input_passage_str"].tolist(), + queries=sample_qa_df["query"].tolist(), + batch=cache_batch, + **kwargs, + ) + qa_df["generation_gt"] = generation_gt + + if output_filepath is not None: + save_parquet_safe(qa_df, output_filepath, upsert=upsert) + + return qa_df diff --git a/autorag-workspace/autorag/data/legacy/qacreation/llama_index.py b/autorag-workspace/autorag/data/legacy/qacreation/llama_index.py new file mode 100644 index 0000000..d3420dd --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/llama_index.py @@ -0,0 +1,253 @@ +import os.path +import random +from typing import Optional, List, Dict, Any + +import pandas as pd +from llama_index.core.base.llms.types import ChatMessage, MessageRole +from llama_index.core.llms import LLM + +from autorag.utils.util import process_batch, get_event_loop + +package_dir = os.path.dirname(os.path.realpath(__file__)) + + +def generate_qa_llama_index( + llm: LLM, + contents: List[str], + prompt: Optional[str] = None, + question_num_per_content: int = 1, + max_retries: int = 3, + batch: int = 4, +) -> List[List[Dict]]: + """ + Generate a qa set from the list of contents. + It uses a single prompt for all contents. + If you want to use more than one prompt for generating qa, + you can consider using generate_qa_llama_index_by_ratio. + + :param llm: Llama index model + :param contents: List of content strings. + :param prompt: The prompt to use for the qa generation. 
+ The prompt must include the following placeholders: + - {{text}}: The content string + - {{num_questions}}: The number of questions to generate + As default, the prompt is set to the default prompt for the question type. + :param question_num_per_content: Number of questions to generate for each content. + Default is 1. + :param max_retries: The maximum number of retries when generated question number is not equal to the target number. + Default is 3. + :param batch: The batch size to process asynchronously. + Default is 4. + :return: 2-d list of dictionaries containing the query and generation_gt. + """ + # load default prompt + if prompt is None: + prompt = open( + os.path.join(package_dir, "llama_index_default_prompt.txt"), "r" + ).read() + + tasks = [ + async_qa_gen_llama_index( + content, llm, prompt, question_num_per_content, max_retries + ) + for content in contents + ] + loops = get_event_loop() + results = loops.run_until_complete(process_batch(tasks, batch)) + return results + + +def generate_answers( + llm: LLM, + contents: List[str], + queries: List[str], + batch: int = 4, +) -> List[List[Dict]]: + """ + Generate qa sets from the list of contents using existing queries. + + :param llm: Llama index model + :param contents: List of content strings. + :param queries: List of existing queries. + :param batch: The batch size to process asynchronously. + :return: 2-d list of dictionaries containing the query and generation_gt. + """ + + tasks = [ + generate_basic_answer(llm, content, query) + for content, query in zip(contents, queries) + ] + loops = get_event_loop() + results = loops.run_until_complete(process_batch(tasks, batch)) + return results + + +def generate_qa_llama_index_by_ratio( + llm: LLM, + contents: List[str], + prompts_ratio: Dict, + question_num_per_content: int = 1, + max_retries: int = 3, + random_state: int = 42, + batch: int = 4, +) -> List[List[Dict]]: + """ + Generate a qa set from the list of contents. + You can set the ratio of prompts that you want to use for generating qa. + It distributes the number of questions to generate for each content by the ratio randomly. + + :param llm: Llama index model + :param contents: List of content strings. + :param prompts_ratio: Dictionary of prompt paths and their ratios. + Example: {"prompt/prompt1.txt": 0.5, "prompt/prompt2.txt": 0.5} + The value sum doesn't have to be 1. + The path must be the absolute path, and the file must exist. + Plus, it has to be a text file which contains proper prompt. + Each prompt must contain the following placeholders: + - {{text}}: The content string + - {{num_questions}}: The number of questions to generate + :param question_num_per_content: Number of questions to generate for each content. + Default is 1. + :param max_retries: The maximum number of retries when generated question number is not equal to the target number. + Default is 3. + :param random_state: Random seed + Default is 42. + :param batch: The batch size to process asynchronously. + Default is 4. + :return: 2-d list of dictionaries containing the query and generation_gt. 
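+
+    Example (an illustrative sketch, not part of the original API docs; the
+    prompt file paths and the model name are placeholders you must replace)::
+
+        from llama_index.llms.openai import OpenAI
+
+        llm = OpenAI(model="gpt-4o-mini")
+        qa_sets = generate_qa_llama_index_by_ratio(
+            llm=llm,
+            contents=["First passage text ...", "Second passage text ..."],
+            prompts_ratio={
+                "/abs/path/factoid_prompt.txt": 0.5,
+                "/abs/path/reasoning_prompt.txt": 0.5,
+            },
+            question_num_per_content=2,
+        )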
+ """ + prompts = list(map(lambda path: open(path, "r").read(), prompts_ratio.keys())) + assert all([validate_llama_index_prompt(prompt) for prompt in prompts]) + + content_indices = list(range(len(contents))) + random.seed(random_state) + random.shuffle(content_indices) + + slice_content_indices: List[List[str]] = distribute_list_by_ratio( + content_indices, list(prompts_ratio.values()) + ) + temp_df = pd.DataFrame({"idx": slice_content_indices, "prompt": prompts}) + temp_df = temp_df.explode("idx", ignore_index=True) + temp_df = temp_df.sort_values(by="idx", ascending=True) + + final_df = pd.DataFrame({"content": contents, "prompt": temp_df["prompt"].tolist()}) + + tasks = [ + async_qa_gen_llama_index( + content, llm, prompt, question_num_per_content, max_retries + ) + for content, prompt in zip( + final_df["content"].tolist(), final_df["prompt"].tolist() + ) + ] + + loops = get_event_loop() + results = loops.run_until_complete(process_batch(tasks, batch)) + + return results + + +async def async_qa_gen_llama_index( + content: str, + llm: LLM, + prompt: str, + question_num: int = 1, + max_retries: int = 3, +): + """ + Generate a qa set by using the given content and the llama index model. + You must select the question type. + + :param content: Content string + :param llm: Llama index model + :param prompt: The prompt to use for the qa generation. + The prompt must include the following placeholders: + - {{text}}: The content string + - {{num_questions}}: The number of questions to generate + :param question_num: The number of questions to generate + :param max_retries: Maximum number of retries when generated question number is not equal to the target number + :return: List of dictionaries containing the query and generation_gt + """ + validate_llama_index_prompt(prompt) + + async def generate(content: str, llm: LLM): + for _ in range(max_retries): + output = await llm.acomplete( + prompt.replace("{{text}}", content).replace( + "{{num_questions}}", str(question_num) + ) + ) + result = parse_output(output.text) + if len(result) == question_num: + return result + raise InterruptedError( + f"Failed to generate output of length {question_num} after {max_retries} retries." + ) + + return await generate(content, llm) + + +async def generate_basic_answer(llm: LLM, passage_str: str, query: str) -> str: + basic_answer_system_prompt = """You are an AI assistant to answer the given question in the provide evidence text. + You can find the evidence from the given text about question, and you have to write a proper answer to the given question. + You have to preserve the question's language at the answer. + For example, if the input question is Korean, the output answer must be in Korean. + """ + user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:" + + response = await llm.achat( + messages=[ + ChatMessage(role=MessageRole.SYSTEM, content=basic_answer_system_prompt), + ChatMessage(role=MessageRole.USER, content=user_prompt), + ], + temperature=1.0, + ) + return response.message.content + + +def validate_llama_index_prompt(prompt: str) -> bool: + """ + Validate the prompt for the llama index model. 
+ The prompt must include the following placeholders: + - {{text}}: The content string + - {{num_questions}}: The number of questions to generate + """ + if "{{text}}" not in prompt: + raise ValueError("The prompt must include the placeholder {{text}}.") + if "{{num_questions}}" not in prompt: + raise ValueError("The prompt must include the placeholder {{num_questions}}.") + return True + + +def parse_output(result: str) -> List[Dict]: + result = result.strip() + result = result.split("[Q]:") + final_result = list() + for res in result: + res = res.strip() + if res and "\n[A]:" in res: + qa = res.split("\n[A]:") + final_result.append( + {"query": qa[0].strip(), "generation_gt": qa[1].strip()} + ) + return final_result + + +def distribute_list_by_ratio(input_list, ratio) -> List[List[Any]]: + total_ratio = sum(ratio) + total_length = len(input_list) + + # Calculate the length of each slice + slice_lengths = [int((r / total_ratio) * total_length) for r in ratio] + + # Adjust the last slice in case of rounding issues + slice_lengths[-1] = total_length - sum(slice_lengths[:-1]) + + slices = [] + start = 0 + for length in slice_lengths: + end = start + length + slices.append(input_list[start:end]) + start = end + + return slices diff --git a/autorag-workspace/autorag/data/legacy/qacreation/llama_index_default_prompt.txt b/autorag-workspace/autorag/data/legacy/qacreation/llama_index_default_prompt.txt new file mode 100644 index 0000000..d9fcd5e --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/llama_index_default_prompt.txt @@ -0,0 +1,54 @@ +You're an AI tasked to convert Text into a question and answer set. +Cover as many details from Text as possible in the QnA set. + +Instructions: +1. Both Questions and Answers MUST BE extracted from given Text +2. Answers must be full sentences +3. Questions should be as detailed as possible from Text +4. Output must always have the provided number of QnAs +5. Create questions that ask about information from the Text +6. MUST include specific keywords from the Text. +7. Do not mention any of these in the questions: "in the given text", "in the provided information", etc. + +Question examples: +1. How do owen and riggs know each other? +2. What does the word fore "mean" in golf? +3. What makes charging bull in nyc popular to tourists? +4. What kind of pistol does the army use? +5. Who was the greatest violin virtuoso in the romantic period? +<|separator|> + +Text: +<|text_start|> +Mark Hamill as Luke Skywalker : One of the last living Jedi , trained by Obi - Wan and Yoda , who is also a skilled X-wing fighter pilot allied with the Rebellion . +Harrison Ford as Han Solo : A rogue smuggler , who aids the Rebellion against the Empire . Han is Luke and Leia 's friend , as well as Leia 's love interest . +Carrie Fisher as Leia Organa : The former Princess of the destroyed planet Alderaan , who joins the Rebellion ; Luke 's twin sister , and Han 's love interest . +Billy Dee Williams as Lando Calrissian : The former Baron Administrator of Cloud City and one of Han 's friends who aids the Rebellion . +Anthony Daniels as C - 3PO : A humanoid protocol droid , who sides with the Rebellion . +Peter Mayhew as Chewbacca : A Wookiee who is Han 's longtime friend , who takes part in the Rebellion . +Kenny Baker as R2 - D2 : An astromech droid , bought by Luke ; and long - time friend to C - 3PO . He also portrays a GONK power droid in the background . 
+Ian McDiarmid as the Emperor : The evil founding supreme ruler of the Galactic Empire , and Vader 's Sith Master . +Frank Oz as Yoda : The wise , centuries - old Grand Master of the Jedi , who is Luke 's self - exiled Jedi Master living on Dagobah . After dying , he reappears to Luke as a Force - ghost . Yoda 's Puppetry was assisted by Mike Quinn . +David Prowse as Darth Vader / Anakin Skywalker : A powerful Sith lord and the second in command of the Galactic Empire ; Luke and Leia 's father . +<|text_end|> +Output with 4 QnAs: +<|separator|> + +[Q]: who played luke father in return of the jedi +[A]: David Prowse acted as Darth Vader, a.k.a Anakin Skywalker, which is Luke and Leia's father. +[Q]: Who is Han Solo's best friend? And what species is he? +[A]: Han Solo's best friend is Chewbacca, who is a Wookiee. +[Q]: Who played luke's teacher in the return of the jedi +[A]: Yoda, the wise, centuries-old Grand Master of the Jedi, who is Luke's self-exiled Jedi Master living on Dagobah, was played by Frank Oz. +Also, there is a mention of Obi-Wan Kenobi, who trained Luke Skywalker. +But I can't find who played Obi-Wan Kenobi in the given text. +[Q]: Where Yoda lives in the return of the jedi? +[A]: Yoda, the Jedi Master, lives on Dagobah. +<|separator|> + +Text: +<|text_start|> +{{text}} +<|text_end|> +Output with {{num_questions}} QnAs: +<|separator|> diff --git a/autorag-workspace/autorag/data/legacy/qacreation/ragas.py b/autorag-workspace/autorag/data/legacy/qacreation/ragas.py new file mode 100644 index 0000000..51395ec --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/ragas.py @@ -0,0 +1,75 @@ +import uuid +from typing import Optional + +import pandas as pd +from langchain_core.embeddings import Embeddings +from langchain_core.language_models import BaseChatModel +from langchain_openai import ChatOpenAI, OpenAIEmbeddings + +from autorag.data.utils.util import corpus_df_to_langchain_documents +from autorag.utils import cast_qa_dataset + + +def generate_qa_ragas( + corpus_df: pd.DataFrame, + test_size: int, + distributions: Optional[dict] = None, + generator_llm: Optional[BaseChatModel] = None, + critic_llm: Optional[BaseChatModel] = None, + embedding_model: Optional[Embeddings] = None, + **kwargs, +) -> pd.DataFrame: + """ + QA dataset generation using RAGAS. + Returns qa dataset dataframe. + + :param corpus_df: Corpus dataframe. + :param test_size: Number of queries to generate. + :param distributions: Distributions of different types of questions. + Default is "simple is 0.5, multi_context is 0.4, and reasoning is 0.1." + Each type of questions refers to Ragas evolution types. + :param generator_llm: Generator language model from Langchain. + :param critic_llm: Critic language model from Langchain. + :param embedding_model: Embedding model from Langchain. + :param kwargs: The additional option to pass to the 'generate_with_langchain_docs' method. + You can input 'with_debugging_logs', 'is_async', 'raise_exceptions', and 'run_config'. + :return: QA dataset dataframe. 
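+
+    Example (an illustrative sketch; the parquet paths are placeholders, and the
+    default generator/critic LLMs described above are used when none are given)::
+
+        import pandas as pd
+
+        corpus_df = pd.read_parquet("corpus.parquet")
+        qa_df = generate_qa_ragas(corpus_df, test_size=20)
+        qa_df.to_parquet("qa.parquet")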
+ """ + from ragas.testset import TestsetGenerator + from ragas.testset.evolutions import simple, reasoning, multi_context + + if generator_llm is None: + generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k") + if critic_llm is None: + critic_llm = ChatOpenAI(model="gpt-4-turbo") + if embedding_model is None: + embedding_model = OpenAIEmbeddings() + if distributions is None: + distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1} + + assert sum(list(distributions.values())) == 1.0, "Sum of distributions must be 1.0" + + generator = TestsetGenerator.from_langchain( + generator_llm, critic_llm, embedding_model + ) + + langchain_docs = corpus_df_to_langchain_documents(corpus_df) + + test_df = generator.generate_with_langchain_docs( + langchain_docs, test_size, distributions=distributions, **kwargs + ).to_pandas() + + result_df = pd.DataFrame( + { + "qid": [str(uuid.uuid4()) for _ in range(len(test_df))], + "query": test_df["question"].tolist(), + "generation_gt": list(map(lambda x: x, test_df["ground_truth"].tolist())), + } + ) + + result_df["retrieval_gt"] = test_df["metadata"].apply( + lambda x: list(map(lambda y: y["filename"], x)) + ) + result_df = cast_qa_dataset(result_df) + + return result_df diff --git a/autorag-workspace/autorag/data/legacy/qacreation/simple.py b/autorag-workspace/autorag/data/legacy/qacreation/simple.py new file mode 100644 index 0000000..13ea79f --- /dev/null +++ b/autorag-workspace/autorag/data/legacy/qacreation/simple.py @@ -0,0 +1,99 @@ +import os +import pathlib +import uuid +from typing import Callable + +import pandas as pd + + +def generate_qa_row(llm, corpus_data_row): + """ + this sample code to generate rag dataset using OpenAI chat model + + :param llm: guidance model + :param corpus_data_row: need "contents" column + :return: should to be dict which has "query", "generation_gt" columns at least. + """ + from guidance import gen + import guidance + + temp_llm = llm + with guidance.user(): + temp_llm += f""" + You have to found a passge to solve "the problem". + You need to build a clean and clear set of (problem, passage, answer) in json format + so that you don't have to ask about "the problem" again. + problem need to end with question mark("?"). + The process of approaching the answer based on the information of the given passage + must be clearly and neatly displayed in the answer.\n + \n + Here is set of (problem, passage, answer) in JSON format:\n + {{\n + "passage": {corpus_data_row["contents"]}\n + "problem": + """ + + with guidance.assistant(): + temp_llm += gen("query", stop="?") + with guidance.user(): + temp_llm += """ + "answer": + """ + with guidance.assistant(): + temp_llm += gen("generation_gt") + + corpus_data_row["metadata"]["qa_generation"] = "simple" + + response = {"query": temp_llm["query"], "generation_gt": temp_llm["generation_gt"]} + return response + + +def generate_simple_qa_dataset( + llm, + corpus_data: pd.DataFrame, + output_filepath: str, + generate_row_function: Callable, + **kwargs, +): + """ + corpus_data to qa_dataset + qa_dataset will be saved to filepath(file_dir/filename) + + :param llm: guidance.models.Model + :param corpus_data: pd.DataFrame. refer to the basic structure + :param output_filepath: file_dir must exist, filepath must not exist. 
file extension must be .parquet + :param generate_row_function: input(llm, corpus_data_row, kwargs) output(dict[columns contain "query" and "generation_gt"]) + :param kwargs: if generate_row_function requires more args, use kwargs + :return: qa_dataset as pd.DataFrame + """ + output_file_dir = pathlib.PurePath(output_filepath).parent + if not os.path.isdir(output_file_dir): + raise NotADirectoryError(f"directory {output_file_dir} not found.") + if not output_filepath.endswith("parquet"): + raise NameError( + f'file path: {output_filepath} filename extension need to be ".parquet"' + ) + if os.path.exists(output_filepath): + raise FileExistsError( + f"{output_filepath.split('/')[-1]} already exists in {output_file_dir}." + ) + + qa_data_lst = [] + for _, corpus_data_row in corpus_data.iterrows(): + response = generate_row_function( + llm=llm, corpus_data_row=corpus_data_row, **kwargs + ) + qa_data_lst.append( + { + "qid": str(uuid.uuid4()), + "query": response["query"], + "retrieval_gt": [[corpus_data_row["doc_id"]]], + "generation_gt": [response["generation_gt"]], + "metadata": corpus_data_row["metadata"], + } + ) + + qa_dataset = pd.DataFrame(qa_data_lst) + qa_dataset.to_parquet(output_filepath, index=False) + + return qa_dataset diff --git a/autorag-workspace/autorag/data/parse/__init__.py b/autorag-workspace/autorag/data/parse/__init__.py new file mode 100644 index 0000000..00a4b38 --- /dev/null +++ b/autorag-workspace/autorag/data/parse/__init__.py @@ -0,0 +1 @@ +from .langchain_parse import langchain_parse diff --git a/autorag-workspace/autorag/data/parse/base.py b/autorag-workspace/autorag/data/parse/base.py new file mode 100644 index 0000000..92f3f27 --- /dev/null +++ b/autorag-workspace/autorag/data/parse/base.py @@ -0,0 +1,79 @@ +import functools +import logging +from datetime import datetime +from glob import glob +from typing import Tuple, List, Optional +import os + +from autorag.utils import result_to_dataframe +from autorag.data.utils.util import get_file_metadata + +logger = logging.getLogger("AutoRAG") + + +def parser_node(func): + @functools.wraps(func) + @result_to_dataframe(["texts", "path", "page", "last_modified_datetime"]) + def wrapper( + data_path_glob: str, + file_type: str, + parse_method: Optional[str] = None, + **kwargs, + ) -> Tuple[List[str], List[str], List[int], List[datetime]]: + logger.info(f"Running parser - {func.__name__} module...") + + data_path_list = glob(data_path_glob) + if not data_path_list: + raise FileNotFoundError(f"data does not exits in {data_path_glob}") + + assert file_type in [ + "pdf", + "csv", + "json", + "md", + "html", + "xml", + "all_files", + ], f"search type {file_type} is not supported" + + # extract only files from data_path_list based on the file_type set in the YAML file + data_paths = ( + [ + data_path + for data_path in data_path_list + if os.path.basename(data_path).split(".")[-1] == file_type + ] + if file_type != "all_files" + else data_path_list + ) + + if func.__name__ == "langchain_parse": + parse_method = parse_method.lower() + if parse_method == "directory": + path_split_list = data_path_glob.split("/") + glob_path = path_split_list.pop() + folder_path = "/".join(path_split_list) + kwargs.update({"glob": glob_path, "path": folder_path}) + result = func( + data_path_list=data_paths, parse_method=parse_method, **kwargs + ) + else: + result = func( + data_path_list=data_paths, parse_method=parse_method, **kwargs + ) + elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]: + result = 
func(data_path_list=data_paths, **kwargs) + else: + raise ValueError(f"Unsupported module_type: {func.__name__}") + result = _add_last_modified_datetime(result) + return result + + return wrapper + + +def _add_last_modified_datetime(result): + last_modified_datetime_lst = list( + map(lambda x: get_file_metadata(x)["last_modified_datetime"], result[1]) + ) + result_with_dates = result + (last_modified_datetime_lst,) + return result_with_dates diff --git a/autorag-workspace/autorag/data/parse/clova.py b/autorag-workspace/autorag/data/parse/clova.py new file mode 100644 index 0000000..c82e68e --- /dev/null +++ b/autorag-workspace/autorag/data/parse/clova.py @@ -0,0 +1,194 @@ +import base64 +import itertools +import json +import os +from typing import List, Optional, Tuple + +import aiohttp +import fitz # PyMuPDF + +from autorag.data.parse.base import parser_node +from autorag.utils.util import process_batch, get_event_loop + + +@parser_node +def clova_ocr( + data_path_list: List[str], + url: Optional[str] = None, + api_key: Optional[str] = None, + batch: int = 5, + table_detection: bool = False, +) -> Tuple[List[str], List[str], List[int]]: + """ + Parse documents to use Naver Clova OCR. + + :param data_path_list: The list of data paths to parse. + :param url: The URL for Clova OCR. + You can get the URL with the guide at https://guide.ncloud-docs.com/docs/clovaocr-example01 + You can set the environment variable CLOVA_URL, or you can set it directly as a parameter. + :param api_key: The API key for Clova OCR. + You can get the API key with the guide at https://guide.ncloud-docs.com/docs/clovaocr-example01 + You can set the environment variable CLOVA_API_KEY, or you can set it directly as a parameter. + :param batch: The batch size for parse documents. Default is 8. + :param table_detection: Whether to enable table detection. Default is False. + :return: tuple of lists containing the parsed texts, path and pages. + """ + url = os.getenv("CLOVA_URL", None) if url is None else url + if url is None: + raise KeyError( + "Please set the URL for Clova OCR in the environment variable CLOVA_URL " + "or directly set it on the config YAML file." + ) + + api_key = os.getenv("CLOVA_API_KEY", None) if api_key is None else api_key + if api_key is None: + raise KeyError( + "Please set the API key for Clova OCR in the environment variable CLOVA_API_KEY " + "or directly set it on the config YAML file." 
+ ) + if batch > 5: + raise ValueError("The batch size should be less than or equal to 5.") + + image_data_lst = list( + map(lambda data_path: pdf_to_images(data_path), data_path_list) + ) + image_info_lst = [ + generate_image_info(pdf_path, len(image_data)) + for pdf_path, image_data in zip(data_path_list, image_data_lst) + ] + + image_data_list = list(itertools.chain(*image_data_lst)) + image_info_list = list(itertools.chain(*image_info_lst)) + + tasks = [ + clova_ocr_pure(image_data, image_info, url, api_key, table_detection) + for image_data, image_info in zip(image_data_list, image_info_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + texts, path, pages = zip(*results) + return list(texts), list(path), list(pages) + + +async def clova_ocr_pure( + image_data: bytes, + image_info: dict, + url: str, + api_key: str, + table_detection: bool = False, +) -> Tuple[str, str, int]: + session = aiohttp.ClientSession() + table_html = "" + headers = {"X-OCR-SECRET": api_key, "Content-Type": "application/json"} + + # Convert image data to base64 + image_base64 = base64.b64encode(image_data).decode("utf-8") + + # Set data + data = { + "version": "V2", + "requestId": "sample_id", + "timestamp": 0, + "images": [{"format": "png", "name": "sample_image", "data": image_base64}], + "enableTableDetection": table_detection, + } + + async with session.post(url, headers=headers, data=json.dumps(data)) as response: + resp_json = await response.json() + if "images" not in resp_json: + raise RuntimeError( + f"Invalid response from Clova API: {resp_json['detail']}" + ) + if "tables" in resp_json["images"][0].keys(): + table_html = json_to_html_table( + resp_json["images"][0]["tables"][0]["cells"] + ) + page_text = extract_text_from_fields(resp_json["images"][0]["fields"]) + + if table_html: + page_text += f"\n\ntable html:\n{table_html}" + + await session.close() + return page_text, image_info["pdf_path"], image_info["pdf_page"] + + +def pdf_to_images(pdf_path: str) -> List[bytes]: + """Convert each page of the PDF to an image and return the image data.""" + pdf_document = fitz.open(pdf_path) + image_data_lst = [] + for page_num in range(len(pdf_document)): + page = pdf_document.load_page(page_num) + pix = page.get_pixmap() + img_data = pix.tobytes("png") + image_data_lst.append(img_data) + return image_data_lst + + +def generate_image_info(pdf_path: str, num_pages: int) -> List[dict]: + """Generate image names based on the PDF file name and the number of pages.""" + image_info_lst = [ + {"pdf_path": pdf_path, "pdf_page": page_num + 1} + for page_num in range(num_pages) + ] + return image_info_lst + + +def extract_text_from_fields(fields): + text = "" + for field in fields: + text += field["inferText"] + if field["lineBreak"]: + text += "\n" + else: + text += " " + return text.strip() + + +def json_to_html_table(json_data): + # Initialize the HTML table + html = '\n' + # Determine the number of rows and columns + max_row = max(cell["rowIndex"] + cell["rowSpan"] for cell in json_data) + max_col = max(cell["columnIndex"] + cell["columnSpan"] for cell in json_data) + # Create a 2D array to keep track of merged cells + table = [["" for _ in range(max_col)] for _ in range(max_row)] + # Fill the table with cell data + for cell in json_data: + row = cell["rowIndex"] + col = cell["columnIndex"] + row_span = cell["rowSpan"] + col_span = cell["columnSpan"] + cell_text = ( + " ".join( + line["inferText"] for line in cell["cellTextLines"][0]["cellWords"] + ) + if 
cell["cellTextLines"] + else "" + ) + # Place the cell in the table + table[row][col] = {"text": cell_text, "rowSpan": row_span, "colSpan": col_span} + # Mark merged cells as occupied + for r in range(row, row + row_span): + for c in range(col, col + col_span): + if r != row or c != col: + table[r][c] = None + # Generate HTML from the table array + for row in table: + html += " \n" + for cell in row: + if cell is None: + continue + if cell == "": + html += " \n" + else: + row_span_attr = ( + f' rowspan="{cell["rowSpan"]}"' if cell["rowSpan"] > 1 else "" + ) + col_span_attr = ( + f' colspan="{cell["colSpan"]}"' if cell["colSpan"] > 1 else "" + ) + html += f' {cell["text"]}\n' + html += " \n" + html += "
" + return html diff --git a/autorag-workspace/autorag/data/parse/langchain_parse.py b/autorag-workspace/autorag/data/parse/langchain_parse.py new file mode 100644 index 0000000..f9b3784 --- /dev/null +++ b/autorag-workspace/autorag/data/parse/langchain_parse.py @@ -0,0 +1,87 @@ +import multiprocessing as mp +from itertools import chain +from typing import List, Tuple + +from autorag.data import parse_modules +from autorag.data.parse.base import parser_node + + +@parser_node +def langchain_parse( + data_path_list: List[str], parse_method: str, **kwargs +) -> Tuple[List[str], List[str], List[int]]: + """ + Parse documents to use langchain document_loaders(parse) method + + :param data_path_list: The list of data paths to parse. + :param parse_method: A langchain document_loaders(parse) method to use. + :param kwargs: The extra parameters for creating the langchain document_loaders(parse) instance. + :return: tuple of lists containing the parsed texts, path and pages. + """ + if parse_method in ["directory", "unstructured"]: + results = parse_all_files(data_path_list, parse_method, **kwargs) + texts, path = results[0], results[1] + pages = [-1] * len(texts) + + else: + num_workers = mp.cpu_count() + # Execute parallel processing + with mp.Pool(num_workers) as pool: + results = pool.starmap( + langchain_parse_pure, + [(data_path, parse_method, kwargs) for data_path in data_path_list], + ) + + texts, path, pages = (list(chain.from_iterable(item)) for item in zip(*results)) + + return texts, path, pages + + +def langchain_parse_pure( + data_path: str, parse_method: str, kwargs +) -> Tuple[List[str], List[str], List[int]]: + """ + Parses a single file using the specified parse method. + + Args: + data_path (str): The file path to parse. + parse_method (str): The parsing method to use. + kwargs (Dict): Additional keyword arguments for the parsing method. + + Returns: + Tuple[str, str]: A tuple containing the parsed text and the file path. 
+ """ + + parse_instance = parse_modules[parse_method](data_path, **kwargs) + + # Load the text from the file + documents = parse_instance.load() + + texts = list(map(lambda x: x.page_content, documents)) + path = [data_path] * len(texts) + if parse_method in ["pymupdf", "pdfplumber", "pypdf", "pypdfium2"]: + pages = list(range(1, len(documents) + 1)) + else: + pages = [-1] * len(texts) + + # Clean up the parse instance + del parse_instance + + return texts, path, pages + + +def parse_all_files( + data_path_list: List[str], parse_method: str, **kwargs +) -> Tuple[List[str], List[str]]: + if parse_method == "unstructured": + parse_instance = parse_modules[parse_method](data_path_list, **kwargs) + elif parse_method == "directory": + parse_instance = parse_modules[parse_method](**kwargs) + else: + raise ValueError(f"Unsupported parse method: {parse_method}") + docs = parse_instance.load() + texts = [doc.page_content for doc in docs] + file_names = [doc.metadata["source"] for doc in docs] + + del parse_instance + return texts, file_names diff --git a/autorag-workspace/autorag/data/parse/llamaparse.py b/autorag-workspace/autorag/data/parse/llamaparse.py new file mode 100644 index 0000000..9adcd23 --- /dev/null +++ b/autorag-workspace/autorag/data/parse/llamaparse.py @@ -0,0 +1,126 @@ +import os +from typing import List, Tuple +from itertools import chain + +from llama_parse import LlamaParse + +from autorag.data.parse.base import parser_node +from autorag.utils.util import process_batch, get_event_loop + + +@parser_node +def llama_parse( + data_path_list: List[str], + batch: int = 8, + use_vendor_multimodal_model: bool = False, + vendor_multimodal_model_name: str = "openai-gpt4o", + use_own_key: bool = False, + vendor_multimodal_api_key: str = None, + **kwargs, +) -> Tuple[List[str], List[str], List[int]]: + """ + Parse documents to use llama_parse. + LLAMA_CLOUD_API_KEY environment variable should be set. + You can get the key from https://cloud.llamaindex.ai/api-key + + :param data_path_list: The list of data paths to parse. + :param batch: The batch size for parse documents. Default is 8. + :param use_vendor_multimodal_model: Whether to use the vendor multimodal model. Default is False. + :param vendor_multimodal_model_name: The name of the vendor multimodal model. Default is "openai-gpt4o". + :param use_own_key: Whether to use the own API key. Default is False. + :param vendor_multimodal_api_key: The API key for the vendor multimodal model. + :param kwargs: The extra parameters for creating the llama_parse instance. + :return: tuple of lists containing the parsed texts, path and pages. 
+ """ + if use_vendor_multimodal_model: + kwargs = _add_multimodal_params( + kwargs, + use_vendor_multimodal_model, + vendor_multimodal_model_name, + use_own_key, + vendor_multimodal_api_key, + ) + + parse_instance = LlamaParse(**kwargs) + + tasks = [ + llama_parse_pure(data_path, parse_instance) for data_path in data_path_list + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + del parse_instance + + texts, path, pages = (list(chain.from_iterable(item)) for item in zip(*results)) + + return texts, path, pages + + +async def llama_parse_pure( + data_path: str, parse_instance +) -> Tuple[List[str], List[str], List[int]]: + documents = await parse_instance.aload_data(data_path) + + texts = list(map(lambda x: x.text, documents)) + path = [data_path] * len(texts) + pages = list(range(1, len(documents) + 1)) + + return texts, path, pages + + +def _add_multimodal_params( + kwargs, + use_vendor_multimodal_model, + vendor_multimodal_model_name, + use_own_key, + vendor_multimodal_api_key, +) -> dict: + kwargs["use_vendor_multimodal_model"] = use_vendor_multimodal_model + kwargs["vendor_multimodal_model_name"] = vendor_multimodal_model_name + + def set_multimodal_api_key( + multimodal_model_name: str = "openai-gpt4o", _api_key: str = None + ) -> str: + if multimodal_model_name in ["openai-gpt4o", "openai-gpt-4o-mini"]: + _api_key = ( + os.getenv("OPENAI_API_KEY", None) if _api_key is None else _api_key + ) + if _api_key is None: + raise KeyError( + "Please set the OPENAI_API_KEY in the environment variable OPENAI_API_KEY " + "or directly set it on the config YAML file." + ) + elif multimodal_model_name in ["anthropic-sonnet-3.5"]: + _api_key = ( + os.getenv("ANTHROPIC_API_KEY", None) if _api_key is None else _api_key + ) + if _api_key is None: + raise KeyError( + "Please set the ANTHROPIC_API_KEY in the environment variable ANTHROPIC_API_KEY " + "or directly set it on the config YAML file." + ) + elif multimodal_model_name in ["gemini-1.5-flash", "gemini-1.5-pro"]: + _api_key = ( + os.getenv("GEMINI_API_KEY", None) if _api_key is None else _api_key + ) + if _api_key is None: + raise KeyError( + "Please set the GEMINI_API_KEY in the environment variable GEMINI_API_KEY " + "or directly set it on the config YAML file." + ) + elif multimodal_model_name in ["custom-azure-model"]: + raise NotImplementedError( + "Custom Azure multimodal model is not supported yet." 
+ ) + else: + raise ValueError("Invalid multimodal model name.") + + return _api_key + + if use_own_key: + api_key = set_multimodal_api_key( + vendor_multimodal_model_name, vendor_multimodal_api_key + ) + kwargs["vendor_multimodal_api_key"] = api_key + + return kwargs diff --git a/autorag-workspace/autorag/data/parse/run.py b/autorag-workspace/autorag/data/parse/run.py new file mode 100644 index 0000000..e1951e1 --- /dev/null +++ b/autorag-workspace/autorag/data/parse/run.py @@ -0,0 +1,141 @@ +import os +from typing import List, Callable, Dict +import pandas as pd +from glob import glob + +from autorag.strategy import measure_speed +from autorag.data.utils.util import get_param_combinations + +default_map = { + "pdf": { + "file_type": "pdf", + "module_type": "langchain_parse", + "parse_method": "pdfminer", + }, + "csv": { + "file_type": "csv", + "module_type": "langchain_parse", + "parse_method": "csv", + }, + "md": { + "file_type": "md", + "module_type": "langchain_parse", + "parse_method": "unstructuredmarkdown", + }, + "html": { + "file_type": "html", + "module_type": "langchain_parse", + "parse_method": "bshtml", + }, + "xml": { + "file_type": "xml", + "module_type": "langchain_parse", + "parse_method": "unstructuredxml", + }, +} + + +def run_parser( + modules: List[Callable], + module_params: List[Dict], + data_path_glob: str, + project_dir: str, + all_files: bool, +): + if not all_files: + # Set the parsing module to default if it is a file type in paths but not set in YAML. + data_path_list = glob(data_path_glob) + if not data_path_list: + raise FileNotFoundError(f"data does not exits in {data_path_glob}") + + file_types = set( + [os.path.basename(data_path).split(".")[-1] for data_path in data_path_list] + ) + set_file_types = set([module["file_type"] for module in module_params]) + + # Calculate the set difference once + file_types_to_remove = set_file_types - file_types + + # Use list comprehension to filter out unwanted elements + module_params = [ + param + for param in module_params + if param["file_type"] not in file_types_to_remove + ] + modules = [ + module + for module, param in zip(modules, module_params) + if param["file_type"] not in file_types_to_remove + ] + + # create a list of only those file_types that are in file_types but not in set_file_types + missing_file_types = list(file_types - set_file_types) + + if missing_file_types: + add_modules_list = [] + for missing_file_type in missing_file_types: + if missing_file_type == "json": + raise ValueError( + "JSON file type must have a jq_schema so you must set it in the YAML file." + ) + + add_modules_list.append(default_map[missing_file_type]) + + add_modules, add_params = get_param_combinations(add_modules_list) + modules.extend(add_modules) + module_params.extend(add_params) + + results, execution_times = zip( + *map( + lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # save results to parquet files + if all_files: + if len(module_params) > 1: + raise ValueError( + "All files is set to True, You can only use one parsing module." 
+ ) + filepaths = [os.path.join(project_dir, "parsed_result.parquet")] + else: + filepaths = list( + map( + lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"), + module_params, + ) + ) + + _files = {} + for result, filepath in zip(results, filepaths): + _files[filepath].append(result) if filepath in _files.keys() else _files.update( + {filepath: [result]} + ) + # Save files with a specific file type as Parquet files. + for filepath, value in _files.items(): + pd.concat(value).to_parquet(filepath, index=False) + + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + } + ) + summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False) + + # concat all parquet files here if not all_files. + _filepaths = list(_files.keys()) + if not all_files: + dataframes = [pd.read_parquet(file) for file in _filepaths] + combined_df = pd.concat(dataframes, ignore_index=True) + combined_df.to_parquet( + os.path.join(project_dir, "parsed_result.parquet"), index=False + ) + + return summary_df diff --git a/autorag-workspace/autorag/data/parse/table_hybrid_parse.py b/autorag-workspace/autorag/data/parse/table_hybrid_parse.py new file mode 100644 index 0000000..51efa8e --- /dev/null +++ b/autorag-workspace/autorag/data/parse/table_hybrid_parse.py @@ -0,0 +1,134 @@ +import os +import tempfile +from glob import glob +from typing import List, Tuple, Dict + +from PyPDF2 import PdfFileReader, PdfFileWriter +import pdfplumber + +from autorag.support import get_support_modules +from autorag.data.parse.base import parser_node + + +@parser_node +def table_hybrid_parse( + data_path_list: List[str], + text_parse_module: str, + text_params: Dict, + table_parse_module: str, + table_params: Dict, +) -> Tuple[List[str], List[str], List[int]]: + """ + Parse documents to use table_hybrid_parse method. + The table_hybrid_parse method is a hybrid method that combines the parsing results of PDFs with and without tables. + It splits the PDF file into pages, separates pages with and without tables, and then parses and merges the results. + + :param data_path_list: The list of data paths to parse. + :param text_parse_module: The text parsing module to use. The type should be a string. + :param text_params: The extra parameters for the text parsing module. The type should be a dictionary. + :param table_parse_module: The table parsing module to use. The type should be a string. + :param table_params: The extra parameters for the table parsing module. The type should be a dictionary. + :return: tuple of lists containing the parsed texts, path and pages. 
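+
+    Example (an illustrative sketch; the glob pattern is a placeholder and the
+    parse methods assume the langchain_parse loaders referenced elsewhere in
+    this package are available)::
+
+        parsed_df = table_hybrid_parse(
+            data_path_glob="./raw_docs/*.pdf",
+            file_type="pdf",
+            text_parse_module="langchain_parse",
+            text_params={"parse_method": "pdfminer"},
+            table_parse_module="langchain_parse",
+            table_params={"parse_method": "pdfplumber"},
+        )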
+ """ + # make save folder directory + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir: + text_dir = os.path.join(save_dir, "text") + table_dir = os.path.join(save_dir, "table") + + os.makedirs(text_dir, exist_ok=True) + os.makedirs(table_dir, exist_ok=True) + + # Split PDF file into pages and Save PDFs with and without tables + path_map_dict_lst = [ + save_page_by_table(data_path, text_dir, table_dir) + for data_path in data_path_list + ] + path_map_dict = {k: v for d in path_map_dict_lst for k, v in d.items()} + + # Extract text pages + table_results, table_file_path = get_each_module_result( + table_parse_module, table_params, os.path.join(table_dir, "*") + ) + + # Extract table pages + text_results, text_file_path = get_each_module_result( + text_parse_module, text_params, os.path.join(text_dir, "*") + ) + + # Merge parsing results of PDFs with and without tables + texts = table_results + text_results + temp_path_lst = table_file_path + text_file_path + + # Sort by file names + temp_path_lst, texts = zip(*sorted(zip(temp_path_lst, texts))) + + # get original file path + path = list(map(lambda temp_path: path_map_dict[temp_path], temp_path_lst)) + + # get pages + pages = list(map(lambda x: get_page_from_path(x), temp_path_lst)) + + return list(texts), path, pages + + +# Save PDFs with and without tables +def save_page_by_table(data_path: str, text_dir: str, table_dir: str) -> Dict[str, str]: + file_name = os.path.basename(data_path).split(".pdf")[0] + + with open(data_path, "rb") as input_data: + pdf_reader = PdfFileReader(input_data) + num_pages = pdf_reader.getNumPages() + + path_map_dict = {} + for page_num in range(num_pages): + output_pdf_path = _get_output_path( + data_path, page_num, file_name, text_dir, table_dir + ) + _save_single_page(pdf_reader, page_num, output_pdf_path) + path_map_dict.update({output_pdf_path: data_path}) + + return path_map_dict + + +def _get_output_path( + data_path: str, page_num: int, file_name: str, text_dir: str, table_dir: str +) -> str: + with pdfplumber.open(data_path) as pdf: + page = pdf.pages[page_num] + tables = page.extract_tables() + directory = table_dir if tables else text_dir + return os.path.join(directory, f"{file_name}_page_{page_num + 1}.pdf") + + +def _save_single_page(pdf_reader: PdfFileReader, page_num: int, output_pdf_path: str): + pdf_writer = PdfFileWriter() + pdf_writer.addPage(pdf_reader.getPage(page_num)) + + with open(output_pdf_path, "wb") as output_file: + pdf_writer.write(output_file) + + +def get_each_module_result( + module: str, module_params: Dict, data_path_glob: str +) -> Tuple[List[str], List[str]]: + module_params["module_type"] = module + + data_path_list = glob(data_path_glob) + if not data_path_list: + return [], [] + + module_name = module_params.pop("module_type") + module_callable = get_support_modules(module_name) + module_original = module_callable.__wrapped__ + texts, path, _ = module_original(data_path_list, **module_params) + + return texts, path + + +def get_page_from_path(file_path: str) -> int: + file_name = os.path.basename(file_path) + split_result = file_name.rsplit("_page_", -1) + page_number_with_extension = split_result[1] + page_number, _ = page_number_with_extension.split(".") + + return int(page_number) diff --git a/autorag-workspace/autorag/data/qa/__init__.py b/autorag-workspace/autorag/data/qa/__init__.py new file mode 100644 index 0000000..cdf5475 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/__init__.py @@ -0,0 +1,3 @@ +# This is v2 version, the next 
version of data creation +# The legacy (v1) version will be deprecated on AutoRAG version 0.3 +# The legacy (v1) version and new v2 data creation is not compatible with each other diff --git a/autorag-workspace/autorag/data/qa/evolve/__init__.py b/autorag-workspace/autorag/data/qa/evolve/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/qa/evolve/llama_index_query_evolve.py b/autorag-workspace/autorag/data/qa/evolve/llama_index_query_evolve.py new file mode 100644 index 0000000..dcef9dc --- /dev/null +++ b/autorag-workspace/autorag/data/qa/evolve/llama_index_query_evolve.py @@ -0,0 +1,64 @@ +import itertools +from typing import Dict, List + +from llama_index.core.base.llms.base import BaseLLM +from llama_index.core.base.llms.types import ChatResponse, ChatMessage, MessageRole + +from autorag.data.qa.evolve.prompt import QUERY_EVOLVE_PROMPT + + +async def llama_index_generate_base( + row: Dict, + llm: BaseLLM, + messages: List[ChatMessage], +) -> Dict: + original_query = row["query"] + context = list(itertools.chain.from_iterable(row["retrieval_gt_contents"])) + context_str = "Text:\n" + "\n".join( + [f"{i + 1}. {c}" for i, c in enumerate(context)] + ) + user_prompt = f"Question: {original_query}\nContext: {context_str}\nOutput: " + messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt)) + + chat_response: ChatResponse = await llm.achat(messages=messages) + row["query"] = chat_response.message.content + return row + + +async def conditional_evolve_ragas( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + return await llama_index_generate_base( + row, + llm, + QUERY_EVOLVE_PROMPT["conditional_evolve_ragas"][lang], + ) + + +async def reasoning_evolve_ragas( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + return await llama_index_generate_base( + row, + llm, + QUERY_EVOLVE_PROMPT["reasoning_evolve_ragas"][lang], + ) + + +async def compress_ragas( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + original_query = row["query"] + user_prompt = f"Question: {original_query}\nOutput: " + messages = QUERY_EVOLVE_PROMPT["compress_ragas"][lang] + messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt)) + + chat_response: ChatResponse = await llm.achat(messages=messages) + row["query"] = chat_response.message.content + return row diff --git a/autorag-workspace/autorag/data/qa/evolve/openai_query_evolve.py b/autorag-workspace/autorag/data/qa/evolve/openai_query_evolve.py new file mode 100644 index 0000000..acef9eb --- /dev/null +++ b/autorag-workspace/autorag/data/qa/evolve/openai_query_evolve.py @@ -0,0 +1,81 @@ +import itertools +from typing import Dict, List + +from llama_index.core.base.llms.types import ChatMessage, MessageRole +from llama_index.llms.openai.utils import to_openai_message_dicts +from openai import AsyncClient +from pydantic import BaseModel + +from autorag.data.qa.evolve.prompt import QUERY_EVOLVE_PROMPT + + +class Response(BaseModel): + evolved_query: str + + +async def query_evolve_openai_base( + row: Dict, + client: AsyncClient, + messages: List[ChatMessage], + model_name: str = "gpt-4o-2024-08-06", +): + """ + Evolve the original query to a new evolved query using OpenAI structured outputs. + """ + original_query = row["query"] + context = list(itertools.chain.from_iterable(row["retrieval_gt_contents"])) + context_str = "Text:\n" + "\n".join( + [f"{i + 1}. 
{c}" for i, c in enumerate(context)] + ) + user_prompt = f"Question: {original_query}\nContext: {context_str}\nOutput: " + messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt)) + + completion = await client.beta.chat.completions.parse( + model=model_name, + messages=to_openai_message_dicts(messages), + response_format=Response, + ) + row["query"] = completion.choices[0].message.parsed.evolved_query + return row + + +async def conditional_evolve_ragas( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-2024-08-06", + lang: str = "en", +) -> Dict: + return await query_evolve_openai_base( + row, client, QUERY_EVOLVE_PROMPT["conditional_evolve_ragas"][lang], model_name + ) + + +async def reasoning_evolve_ragas( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-2024-08-06", + lang: str = "en", +) -> Dict: + return await query_evolve_openai_base( + row, client, QUERY_EVOLVE_PROMPT["reasoning_evolve_ragas"][lang], model_name + ) + + +async def compress_ragas( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-2024-08-06", + lang: str = "en", +) -> Dict: + original_query = row["query"] + messages = QUERY_EVOLVE_PROMPT["compress_ragas"][lang] + user_prompt = f"Question: {original_query}\nOutput: " + messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt)) + + completion = await client.beta.chat.completions.parse( + model=model_name, + messages=to_openai_message_dicts(messages), + response_format=Response, + ) + row["query"] = completion.choices[0].message.parsed.evolved_query + return row diff --git a/autorag-workspace/autorag/data/qa/evolve/prompt.py b/autorag-workspace/autorag/data/qa/evolve/prompt.py new file mode 100644 index 0000000..525a5e9 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/evolve/prompt.py @@ -0,0 +1,288 @@ +# The RAGAS prompts are coming from RAGAS under Apache-2.0 License. (English version) (the AutoRAG team translates Korean version prompt) +# You can see the original prompts at the RAGAS library at https://github.com/explodinggradients/ragas/blob/main/src/ragas/testset/prompts.py +from llama_index.core.base.llms.types import ChatMessage, MessageRole + +QUERY_EVOLVE_PROMPT = { + "conditional_evolve_ragas": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""Rewrite the provided question to increase its complexity by introducing a conditional element. +The goal is to make the question more intricate by incorporating a scenario or condition that affects the context of the question. +Follow the rules given below while rewriting the question. + 1. The rewritten question should not be longer than 25 words. Use abbreviation wherever possible. + 2. The rewritten question must be reasonable and must be understood and responded by humans. + 3. The rewritten question must be fully answerable from information present context. + 4. phrases like 'provided context','according to the context?',etc are not allowed to appear in the question. +""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question : What is the function of the roots of a plant? +Context : The roots of a plant absorb water and nutrients from the soil, anchor the plant in the ground, and store food. +Output : """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="What dual purpose do plant roots serve concerning soil nutrients and stability?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question : How do vaccines protect against diseases? 
+Context : Vaccines protect against diseases by stimulating the body's immune response to produce antibodies, which recognize and combat pathogens. +Output : """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="How do vaccines utilize the body's immune system to defend against pathogens?", + ), + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""제공된 질문에 조건에 관련한 내용을 추가하여 복잡성을 높이세요. +질문의 Context에 영향을 미치는 시나리오나 조건을 포함하여 질문을 더 복잡하게 만드는 것이 목표입니다. +질문을 다시 작성할 때 다음 규칙을 따르십시오. + 1. 다시 작성된 질문은 100자를 넘지 않아야 합니다. 가능한 경우 약어를 사용하십시오. + 2. 다시 작성된 질문은 합리적이어야 하며 사람이 이해하고 응답할 수 있어야 합니다. + 3. 다시 작성된 질문은 현재 Context에서 완전히 답변할 수 있어야 합니다. + 4. '제공된 글', '단락에 따르면?', 'Context에 의하면' 등의 문구는 질문에 나타날 수 없습니다. + 5. 한국어로 질문을 작성하세요. +""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 식물의 뿌리 기능이 뭐야? +Context: 식물의 뿌리는 토양에서 물과 영양분을 흡수하고, 식물을 땅에 고정하며, 영양분을 저장합니다. +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="식물의 뿌리는 토양 영양분과 안정성에 대해 어떤 역할을 하나요?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 백신은 질병을 어떻게 예방하나요? +Context: 백신은 신체의 면역 반응을 자극하여 병원체를 인식하고 싸우는 항체를 생성함으로써 질병으로부터 보호합니다. +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="백신은 신체의 면역 체계를 어떻게 활용해서 질병을 예방합니까?", + ), + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""提供された質問に条件に関する内容を追加して、複雑さを高めます。 +質問のContextに影響を与えるシナリオや条件を含めて、質問をより複雑にすることが目標です。 +質問を再作成するときは、次のルールに従います。 + 1. 再作成された質問は100文字を超えてはいけません。 可能であれば略語を使ってください + 2. 再作成された質問は合理的でなければならず、人が理解して回答できるものでなければなりません。 + 3. 再作成された質問は、現在のContextで完全に答えられる必要があります。 + 4. 「提供された文」、「段落によると?」、「Contextによると」などのフレーズは質問に表示されません。 + 5. 日本語で質問を書きましょう。 +""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 植物の根の機能は何ですか? +Context: 植物の根は土壌から水や栄養分を吸収し、植物を地面に固定し、栄養分を蓄えます。 +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="植物の根は土壌栄養分と安定性に対してどのような役割をしますか?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: ワクチンは病気をどのように予防しますか? +Context: ワクチンは、体の免疫反応を刺激して病原体を認識し、戦う抗体を生成することで病気から守ります。 +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="ワクチンは体の免疫システムをどのように活用して病気を予防しますか?", + ), + ], + }, + "reasoning_evolve_ragas": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""Complicate the given question by rewriting question into a multi-hop reasoning question based on the provided context. +Answering the question should require the reader to make multiple logical connections or inferences using the information available in given context. +Rules to follow when rewriting question: +1. Ensure that the rewritten question can be answered entirely from the information present in the contexts. +2. Do not frame questions that contains more than 15 words. Use abbreviation wherever possible. +3. Make sure the question is clear and unambiguous. +4. phrases like 'based on the provided context','according to the context',etc are not allowed to appear in the question.""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: What is the capital of France?, +Context: France is a country in Western Europe. It has several cities, including Paris, Lyon, and Marseille. Paris is not only known for its cultural landmarks like the Eiffel Tower and the Louvre Museum but also as the administrative center. 
+Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="Linking the Eiffel Tower and administrative center, which city stands as both?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: What does the append() method do in Python? +Context: In Python, lists are used to store multiple items in a single variable. Lists are one of 4 built-in data types used to store collections of data. The append() method adds a single item to the end of a list. +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="If a list represents a variable collection, what method extends it by one item?", + ), + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""주어진 Context를 기반으로 기존 질문을 복잡하게 만들어 여러 논리적인 사고가 필요한 질문으로 다시 작성하세요. +질문에 답하려면 주어진 Context의 정보를 사용해 여러 논리적 사고나 추론을 해야 합니다. +질문을 다시 작성할 때 따라야 할 규칙: +1. 다시 작성된 질문은 Context에 있는 정보만으로 완전히 답변할 수 있어야 합니다. +2. 100자를 초과하는 질문을 작성하지 마세요. 가능한 경우 약어를 사용하세요. +3. 질문이 명확하고 모호하지 않도록 하세요. +4. '제공된 Context에 기반하여', '해당 단락에 따르면' 등의 문구는 질문에 포함되지 않아야 합니다. +5. 한국어로 질문을 작성하세요.""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 프랑스의 수도는 어디인가요?, +Context: 프랑스는 서유럽에 있는 나라입니다. 파리, 리옹, 마르세유를 포함한 여러 도시가 있습니다. 파리는 에펠탑과 루브르 박물관 같은 문화적 랜드마크로 유명할 뿐만 아니라 행정 중심지로도 알려져 있습니다. +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="에펠탑과 행정 중심지, 두 단어는 어떤 도시를 가리키나요?", + ), + ChatMessage( + role=MessageRole.USER, + content="""질문: Python에서 append() 메서드는 무엇을 하나요? +컨텍스트: Python에서 리스트는 하나의 변수에 여러 항목을 저장하는 데 사용됩니다. 리스트는 데이터를 저장하는 데 사용되는 4가지 내장 데이터 유형 중 하나입니다. append() 메서드는 리스트의 끝에 새로운 항목을 추가합니다. +출력: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="리스트가 변수들을 모아 놓은 것을 나타낸다면, 어떤 메서드를 사용해야 항목을 하나 더 추가할 수 있습니까?", + ), + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""与えられたContextに基づいて既存の質問を複雑にして、様々な論理的思考が必要な質問として書き直しましょう。 +質問に答えるためには、与えられたContextの情報を使って様々な論理的思考や推論をしなければなりません。 +質問を再作成するときに従うべきルール: +1. 再作成された質問は、Contextにある情報だけで完全に答えられる必要があります。 +2. 100文字を超える質問を作成してはいけません。 可能であれば略語を使ってください。 +3. 質問が明確で曖昧にならないようにしましょう。 +4. 「提供されたContextに基づいて」、「当該段落によると」などのフレーズは、質問に含まれてはいけません。 +5. 日本語で質問を書きましょう。""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: フランスの首都はどこですか?, +Context: フランスは西ヨーロッパにある国です。 パリ、リヨン、マルセイユを含むいくつかの都市があります。 パリはエッフェル塔やルーブル博物館のような文化的ランドマークとして有名なだけでなく、行政の中心地としても知られています。 +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="エッフェル塔と行政の中心地、二つの単語はどんな都市を指していますか?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: Pythonでappend() メソッドは何をしますか? +Context: Pythonで、リストは 1 つの変数に複数の項目を保存するために使用されます。 リストは、データを保存するために使用される 4 つの組み込みデータ タイプの 1 つです。 append()メソッドは、リストの最後に新しい項目を追加します。 +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="リストが変数を集めたものである場合、どのメソッドを使えば項目を一つ追加することができますか?", + ), + ], + }, + "compress_ragas": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""Rewrite the following question to make it more indirect and shorter while retaining the essence of the original question. + The goal is to create a question that conveys the same meaning but in a less direct manner. The rewritten question should shorter so use abbreviation wherever possible.""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: What is the distance between the Earth and the Moon? 
+Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="How far is the Moon from Earth?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: What ingredients are required to bake a chocolate cake? +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="What's needed for a chocolate cake?", + ), + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""주어진 질문을 더 간접적이고 짧게 다시 작성하세요. + 목표는 질문을 원래 질문의 본질을 유지하면서 너무 직설적이지 않게 만드는 것입니다. + 약어 등을 사용하여 질문을 더 짧게 만드세요.""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 지구와 달 사이의 거리는 얼마입니까? +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="달은 지구에서 얼마나 떨어져 있나요?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 초콜릿 케이크를 굽기 위해 필요한 재료는 무엇입니까? +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="초콜릿 케이크에 필요한 것은 무엇인가요?", + ), + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""与えられた質問をより間接的かつ短く書き換えます。 +目標は、質問を元の質問の本質を保ちながら、あまりストレートにならないようにすることです。 +略語などを使用して、質問をより短くします。""", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: 地球と月の間の距離はどれくらいですか? +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="月は地球からどれくらい離れていますか?", + ), + ChatMessage( + role=MessageRole.USER, + content="""Question: チョコレートケーキを焼くために必要な材料は何ですか? +Output: """, + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="チョコレートケーキに必要なものは何ですか?", + ), + ], + }, +} diff --git a/autorag-workspace/autorag/data/qa/extract_evidence.py b/autorag-workspace/autorag/data/qa/extract_evidence.py new file mode 100644 index 0000000..e775297 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/extract_evidence.py @@ -0,0 +1 @@ +# This module is about extracting evidence from the given retrieval gt passage diff --git a/autorag-workspace/autorag/data/qa/filter/__init__.py b/autorag-workspace/autorag/data/qa/filter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/qa/filter/dontknow.py b/autorag-workspace/autorag/data/qa/filter/dontknow.py new file mode 100644 index 0000000..5f58eb9 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/filter/dontknow.py @@ -0,0 +1,117 @@ +from typing import Dict, List + +from llama_index.core.base.llms.base import BaseLLM +from llama_index.core.base.llms.types import ChatMessage, MessageRole, ChatResponse +from llama_index.llms.openai.utils import to_openai_message_dicts +from openai import AsyncClient +from pydantic import BaseModel + +from autorag.data.qa.filter.prompt import FILTER_PROMPT + +dont_know_phrases = { + "en": [ + "I don't know", + "I do not know", + "Don't know", + "Do not know", + ], + "ko": [ + "몰라요", + "모르겠습니다", + "모르겠어요", + "몰라", + "내가 어떻게 알아?", + "모르겠소", + "몰라유", + "모르것는디", + "모르겠어유", + "모르겠네유", + "모르겠네요", + ], + "ja": [ + "知りません", + "わかりません", + "分かりません", + "知らないです", + "よく分かってません", + "わかりかねます", + "存じません", + "お答えいたしかねます", + ], +} + + +def dontknow_filter_rule_based(row: Dict, lang: str = "en") -> bool: + assert ( + "generation_gt" in row.keys() + ), "generation_gt column is not in the DataFrame." + dont_know_phrase = dont_know_phrases[lang] + return not any( + phrase in s for phrase in dont_know_phrase for s in row["generation_gt"] + ) + + +class Response(BaseModel): + is_dont_know: bool + + +async def dontknow_filter_openai( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-mini-2024-07-18", + lang: str = "en", +) -> bool: + """ + This will drop rows that have a "don't know" answer. 
+    It will drop unanswerable questions from the QA dataset.
+    You can use this filter with the `batch_filter` function of the `QA` class.
+
+    :param row: The row dict from the QA dataset.
+    :param client: The OpenAI async client.
+    :param model_name: The model name.
+        You have to use gpt-4o-2024-08-06 or gpt-4o-mini-2024-07-18.
+    :param lang: The supported languages are en, ko, and ja.
+    :return: False if the row's generation_gt means "don't know".
+    """
+    assert "generation_gt" in row.keys(), "generation_gt column is not in the row."
+    system_prompt: List[ChatMessage] = FILTER_PROMPT["dontknow_filter"][lang]
+    result = []
+    for gen_gt in row["generation_gt"]:
+        completion = await client.beta.chat.completions.parse(
+            model=model_name,
+            messages=to_openai_message_dicts(
+                system_prompt + [ChatMessage(role=MessageRole.USER, content=gen_gt)]
+            ),
+            response_format=Response,
+        )
+        result.append(completion.choices[0].message.parsed.is_dont_know)
+    return not any(result)
+
+
+async def dontknow_filter_llama_index(
+    row: Dict,
+    llm: BaseLLM,
+    lang: str = "en",
+) -> bool:
+    """
+    This will drop rows that have a "don't know" answer.
+    It will drop unanswerable questions from the QA dataset.
+    You can use this filter with the `batch_filter` function of the `QA` class.
+
+    :param row: The row dict from the QA dataset.
+    :param llm: The LlamaIndex LLM instance.
+        Setting its max tokens low is recommended to save tokens.
+    :param lang: The supported languages are en, ko, and ja.
+    :return: False if the row's generation_gt means "don't know".
+    """
+    assert "generation_gt" in row.keys(), "generation_gt column is not in the row."
+    system_prompt: List[ChatMessage] = FILTER_PROMPT["dontknow_filter"][lang]
+    results = []
+    for gen_gt in row["generation_gt"]:
+        response: ChatResponse = await llm.achat(
+            messages=system_prompt
+            + [ChatMessage(role=MessageRole.USER, content=gen_gt)]
+        )
+        result_str = response.message.content
+        results.append("true" in result_str.lower().strip())
+    return not any(results)
diff --git a/autorag-workspace/autorag/data/qa/filter/passage_dependency.py b/autorag-workspace/autorag/data/qa/filter/passage_dependency.py
new file mode 100644
index 0000000..77927d3
--- /dev/null
+++ b/autorag-workspace/autorag/data/qa/filter/passage_dependency.py
@@ -0,0 +1,88 @@
+from typing import Dict, List
+
+from llama_index.core.base.llms.base import BaseLLM
+from llama_index.core.base.llms.types import ChatMessage, MessageRole, ChatResponse
+from llama_index.llms.openai.utils import to_openai_message_dicts
+from openai import AsyncClient
+from pydantic import BaseModel
+
+from autorag.data.qa.filter.prompt import FILTER_PROMPT
+
+
+class Response(BaseModel):
+    is_passage_dependent: bool
+
+
+async def passage_dependency_filter_openai(
+    row: Dict,
+    client: AsyncClient,
+    model_name: str = "gpt-4o-mini-2024-07-18",
+    lang: str = "en",
+) -> bool:
+    """
+    This will drop passage-dependent question rows.
+    Passage-dependent questions are questions whose answer changes depending on which passage is chosen.
+    Passage-dependent questions are not good for RAG evaluation, because no retrieval system can find the right passage for them.
+    For example, when someone asks "What is the highest score according to the table?", the answer will be different depending on the table.
+    Since the question never says which table, no retrieval system can find the right passage for it.
+    You can use this filter with the `batch_filter` function of the `QA` class.
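+
+    A minimal usage sketch (hypothetical names: an existing `QA` instance `qa`
+    and an `openai.AsyncClient` instance `openai_client`):
+
+        filtered_qa = qa.batch_filter(
+            passage_dependency_filter_openai,
+            client=openai_client,
+            lang="en",
+        )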
+ + :param row: The row dict from QA dataset. + :param client: The OpenAI client. + :param model_name: The model name. + You have to use gpt-4o-2024-08-06 or gpt-4o-mini-2024-07-18. + :param lang: The supported language is en, ko or ja. + :return: False if the row question is a passage-dependent question (to be filtered). + """ + assert "query" in row.keys(), "query column is not in the row." + system_prompt: List[ChatMessage] = FILTER_PROMPT["passage_dependency"][lang] + query = row["query"] + completion = await client.beta.chat.completions.parse( + model=model_name, + messages=to_openai_message_dicts( + system_prompt + + [ + ChatMessage( + role=MessageRole.USER, + content=f"Question: {query}\nIs this the question passage dependent?", + ) + ] + ), + response_format=Response, + ) + return not completion.choices[0].message.parsed.is_passage_dependent + + +async def passage_dependency_filter_llama_index( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> bool: + """ + This will drop passage-dependent question rows. + Passage-dependent questions are questions that the answer will change depending on what passage you choose. + The passage-dependent questions will not be good for RAG evaluation, because any retrieval system can't find the right passage with passage-dependent question. + For example, when someone asks "What is the highest score according to the table?" the answer will be different depending on the table. + And what is the table? The retrieval system can't find the right passage with this question. + You can use this filter with the ` batch_filter ` function at `QA` class. + + :param row: The row dict from QA dataset. + :param llm: The Llama index llm instance. + It will be good if you set max tokens to low for saving tokens. + :param lang: The supported language is en, ko or ja. + :return: False if the row question is a passage-dependent question (to be filtered). + """ + assert "query" in row.keys(), "query column is not in the row." + system_prompt: List[ChatMessage] = FILTER_PROMPT["passage_dependency"][lang] + query = row["query"] + response: ChatResponse = await llm.achat( + messages=system_prompt + + [ + ChatMessage( + role=MessageRole.USER, + content=f"Question: {query}\nIs this the question passage dependent?", + ) + ] + ) + result_str = response.message.content + return "true" not in result_str.lower().strip() diff --git a/autorag-workspace/autorag/data/qa/filter/prompt.py b/autorag-workspace/autorag/data/qa/filter/prompt.py new file mode 100644 index 0000000..ac8f594 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/filter/prompt.py @@ -0,0 +1,73 @@ +from llama_index.core.base.llms.types import ChatMessage, MessageRole + +FILTER_PROMPT = { + "dontknow_filter": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""The following sentence is an answer about a question. You have to decide the answer implies 'I don't know'. +If the answer implies 'I don't know', return True. If not, return False.""", + ), + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""다음 문장은 어떠한 질문에 대한 대답입니다. 해당 문장이 질문에 대해서 '모른다고' 답한 것인지 판단하십시오. +만약 해당 문장이 '모른다고' 답한 것이라면, True를 반환하세요. 그렇지 않다면 False를 반환하세요.""", + ) + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""次の文章はある質問に対する答えです。 該当文章が質問に対して「知らない」と答えたのか判断します。 +もし、その文章が「知らない」と答えたのであれば、Trueを返します。 そうでなければFalseを返します。""", + ) + ], + }, + "passage_dependency": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""You are a classifier that recognize 'passage dependent' questions. 
+The 'passage dependent' is the question that the answer will be change depending on what passage you choose. +For example) 'What is the highest score according to the table?' +This sentence is the passage dependent question because the answer will be different depending on the table. + +In contrast, the following sentence is not passage dependant. +'What is the highest score of the KBO baseball history in one game?' +'What is the capital of France?' +These sentences will have the same answer regardless of the passage. + +Please return True if the input question is passage dependent. Else return False.""", + ) + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""당신은 '단락 의존' 질문을 인식하는 분류기입니다. +'단락 의존'이란 어떤 단락이 선택 되는지 따라 답이 달라지는 질문을 의미합니다. +예를 들어, '주어진 표에 따르면 가장 높은 점수는 무엇인가요?'라는 질문은 단락 의존 질문입니다. 왜냐하면 표가 어떤 것인지에 따라 그 답이 달라지기 때문입니다. + +반면에, 다음 문장들은 단락 의존적이지 않습니다. +'KBO 야구 역사상 한 경기에서 가장 높은 점수는 무엇인가요?' 또는 '프랑스의 수도는 무엇인가요?' +이러한 문장은 단락에 관계 없이 동일한 답을 가집니다. + +입력된 질문이 단락 의존적이라면 True를 반환하고, 그렇지 않으면 False를 반환하세요.""", + ) + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""あなたは「段落依存」の質問を認識する分類器です。 +「段落依存」とは、どの段落が選択されるかによって答えが変わる質問を意味します。 +たとえば、「与えられた表によると、最も高い点数は何ですか?」という質問は、段落依存の質問です。 なぜなら、表がどんなものかによってその答えが変わるからです。 + +一方、次の文章は段落依存的ではありません。 +KBO野球史上1試合で最も高い点数は何ですか?またはフランスの首都は何ですか?' +このような文章は段落に関係なく同じ答えを持ちます。 + +入力された質問が段落依存的である場合はTrueを返し、そうでない場合はFalseを返します。""", + ) + ], + }, +} diff --git a/autorag-workspace/autorag/data/qa/generation_gt/__init__.py b/autorag-workspace/autorag/data/qa/generation_gt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/qa/generation_gt/base.py b/autorag-workspace/autorag/data/qa/generation_gt/base.py new file mode 100644 index 0000000..9690ea8 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/generation_gt/base.py @@ -0,0 +1,16 @@ +from typing import Dict + + +def add_gen_gt(row: Dict, new_gen_gt: str) -> Dict: + if "generation_gt" in list(row.keys()): + if isinstance(row["generation_gt"], list): + row["generation_gt"].append(new_gen_gt) + elif isinstance(row["generation_gt"], str): + row["generation_gt"] = [row["generation_gt"], new_gen_gt] + else: + raise ValueError( + "generation_gt should be either a string or a list of strings." 
+ ) + return row + row["generation_gt"] = [new_gen_gt] + return row diff --git a/autorag-workspace/autorag/data/qa/generation_gt/llama_index_gen_gt.py b/autorag-workspace/autorag/data/qa/generation_gt/llama_index_gen_gt.py new file mode 100644 index 0000000..a3c8785 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/generation_gt/llama_index_gen_gt.py @@ -0,0 +1,41 @@ +import itertools +from typing import Dict + + +from llama_index.core.base.llms.base import BaseLLM +from llama_index.core.base.llms.types import MessageRole, ChatMessage + +from autorag.data.qa.generation_gt.base import add_gen_gt +from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT + + +async def make_gen_gt_llama_index(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict: + retrieval_gt_contents = list( + itertools.chain.from_iterable(row["retrieval_gt_contents"]) + ) + query = row["query"] + passage_str = "\n".join(retrieval_gt_contents) + user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:" + + response = await llm.achat( + messages=[ + ChatMessage(role=MessageRole.SYSTEM, content=system_prompt), + ChatMessage(role=MessageRole.USER, content=user_prompt), + ], + temperature=0.0, + ) + return add_gen_gt(row, response.message.content) + + +async def make_concise_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict: + return await make_gen_gt_llama_index( + row, llm, GEN_GT_SYSTEM_PROMPT["concise"][lang] + ) + + +async def make_basic_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict: + return await make_gen_gt_llama_index(row, llm, GEN_GT_SYSTEM_PROMPT["basic"][lang]) + + +async def make_custom_gen_gt(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict: + return await make_gen_gt_llama_index(row, llm, system_prompt) diff --git a/autorag-workspace/autorag/data/qa/generation_gt/openai_gen_gt.py b/autorag-workspace/autorag/data/qa/generation_gt/openai_gen_gt.py new file mode 100644 index 0000000..4f4e2fd --- /dev/null +++ b/autorag-workspace/autorag/data/qa/generation_gt/openai_gen_gt.py @@ -0,0 +1,84 @@ +import itertools +from typing import Dict + +from openai import AsyncClient +from pydantic import BaseModel + +from autorag.data.qa.generation_gt.base import add_gen_gt +from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT + + +class Response(BaseModel): + answer: str + + +async def make_gen_gt_openai( + row: Dict, + client: AsyncClient, + system_prompt: str, + model_name: str = "gpt-4o-2024-08-06", +): + retrieval_gt_contents = list( + itertools.chain.from_iterable(row["retrieval_gt_contents"]) + ) + query = row["query"] + passage_str = "\n".join(retrieval_gt_contents) + user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:" + + completion = await client.beta.chat.completions.parse( + model=model_name, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + temperature=0.0, + response_format=Response, + ) + response: Response = completion.choices[0].message.parsed + return add_gen_gt(row, response.answer) + + +async def make_concise_gen_gt( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-2024-08-06", + lang: str = "en", +): + """ + Generate concise generation_gt using OpenAI Structured Output for preventing errors. + It generates a concise answer, so it is generally a word or just a phrase. + + :param row: The input row of the qa dataframe. + :param client: The OpenAI async client. 
+ :param model_name: The model name that supports structured output. + It has to be "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18". + :param lang: The language code of the prompt. + Default is "en". + :return: The output row of the qa dataframe with added "generation_gt" in it. + """ + return await make_gen_gt_openai( + row, client, GEN_GT_SYSTEM_PROMPT["concise"][lang], model_name + ) + + +async def make_basic_gen_gt( + row: Dict, + client: AsyncClient, + model_name: str = "gpt-4o-2024-08-06", + lang: str = "en", +): + """ + Generate basic generation_gt using OpenAI Structured Output for preventing errors. + It generates a "basic" answer, and its prompt is simple. + + :param row: The input row of the qa dataframe. + :param client: The OpenAI async client. + :param model_name: The model name that supports structured output. + It has to be "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18". + :param lang: The language code of the prompt. + Default is "en". + :return: The output row of the qa dataframe with added "generation_gt" in it. + """ + return await make_gen_gt_openai( + row, client, GEN_GT_SYSTEM_PROMPT["basic"][lang], model_name + ) diff --git a/autorag-workspace/autorag/data/qa/generation_gt/prompt.py b/autorag-workspace/autorag/data/qa/generation_gt/prompt.py new file mode 100644 index 0000000..ec6c275 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/generation_gt/prompt.py @@ -0,0 +1,27 @@ +GEN_GT_SYSTEM_PROMPT = { + "concise": { + "en": """You are an AI assistant to answer the given question in the provide evidence text. +You can find the evidence from the given text about question, and you have to write a proper answer to the given question. +Your answer have to be concise and relevant to the question. +Do not make a verbose answer and make it super clear. +It doesn't have to be an full sentence. It can be the answer is a word or a paraphrase.""", + "ko": """당신은 주어진 질문에 대해 제공된 Text 내에서 답을 찾는 AI 비서입니다. +질문에 대한 답을 Text에서 찾아 적절한 답변을 작성하세요. +답변은 간결하고 질문에 관련된 내용만 포함해야 합니다. +불필요하게 길게 답변하지 말고, 명확하게 작성하세요. +완전한 문장이 아니어도 되며, 답은 단어나 요약일 수 있습니다.""", + "ja": """ +あなたは与えられた質問に対して提供されたText内で答えを探すAI秘書です。 +質問に対する答えをTextで探して適切な答えを作成しましょう。 +回答は簡潔で、質問に関連する内容のみを含める必要があります。 +不必要に長く答えず、明確に作成しましょう。 +完全な文章でなくてもいいし、答えは単語や要約かもしれません。 +""", + }, + "basic": { + "en": """You are an AI assistant to answer the given question in the provide evidence text. +You can find the evidence from the given text about question, and you have to write a proper answer to the given question.""", + "ko": "당신은 주어진 질문에 대한 답을 제공된 Text 내에서 찾는 AI 비서입니다. 
질문과 관련된 증거를 Text에서 찾아 적절한 답변을 작성하세요.", + "ja": "あなたは与えられた質問に対する答えを提供されたText内で探すAI秘書です。 質問に関する証拠をTextで探して適切な回答を作成しましょう。", + }, +} diff --git a/autorag-workspace/autorag/data/qa/query/__init__.py b/autorag-workspace/autorag/data/qa/query/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/qa/query/llama_gen_query.py b/autorag-workspace/autorag/data/qa/query/llama_gen_query.py new file mode 100644 index 0000000..4471fe2 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/query/llama_gen_query.py @@ -0,0 +1,82 @@ +import itertools +from typing import Dict, List + +from llama_index.core.base.llms.base import BaseLLM +from llama_index.core.base.llms.types import ChatResponse, ChatMessage, MessageRole + +from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT, QUERY_GEN_PROMPT_EXTRA + + +async def llama_index_generate_base( + row: Dict, + llm: BaseLLM, + messages: List[ChatMessage], +) -> Dict: + context = list(itertools.chain.from_iterable(row["retrieval_gt_contents"])) + context_str = "\n".join([f"{i + 1}. {c}" for i, c in enumerate(context)]) + user_prompt = f"Text:\n{context_str}\n\nGenerated Question from the Text:\n" + user_message = ChatMessage(role=MessageRole.USER, content=user_prompt) + new_messages = [*messages, user_message] + chat_response: ChatResponse = await llm.achat(messages=new_messages) + row["query"] = chat_response.message.content + return row + + +async def factoid_query_gen( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + return await llama_index_generate_base( + row, llm, QUERY_GEN_PROMPT["factoid_single_hop"][lang] + ) + + +async def concept_completion_query_gen( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + return await llama_index_generate_base( + row, llm, QUERY_GEN_PROMPT["concept_completion"][lang] + ) + + +async def two_hop_incremental( + row: Dict, + llm: BaseLLM, + lang: str = "en", +) -> Dict: + messages = QUERY_GEN_PROMPT["two_hop_incremental"][lang] + passages = row["retrieval_gt_contents"] + assert ( + len(passages) >= 2 + ), "You have to sample more than two passages for making two-hop questions." 
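+    # Each element of retrieval_gt_contents is a list of chunk contents for one
+    # retrieval-gt group; only the first chunk of the first two groups is used
+    # to build the two-document prompt below.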
+    context_str = f"Document 1: {passages[0][0]}\nDocument 2: {passages[1][0]}"
+    user_prompt = f"{context_str}\n\nGenerated two-hop Question from two Documents:\n"
+    messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt))
+
+    chat_response: ChatResponse = await llm.achat(messages=messages)
+    response = chat_response.message.content
+    row["query"] = response.split(":")[-1].strip()
+    return row
+
+
+async def custom_query_gen(
+    row: Dict,
+    llm: BaseLLM,
+    messages: List[ChatMessage],
+) -> Dict:
+    return await llama_index_generate_base(row, llm, messages)
+
+
+# Experimental feature: can only use factoid_single_hop
+async def multiple_queries_gen(
+    row: Dict,
+    llm: BaseLLM,
+    lang: str = "en",
+    n: int = 3,
+) -> Dict:
+    _messages = QUERY_GEN_PROMPT["factoid_single_hop"][lang]
+    _messages[0].content += QUERY_GEN_PROMPT_EXTRA["multiple_queries"][lang].format(n=n)
+    return await llama_index_generate_base(row, llm, _messages)
diff --git a/autorag-workspace/autorag/data/qa/query/openai_gen_query.py b/autorag-workspace/autorag/data/qa/query/openai_gen_query.py
new file mode 100644
index 0000000..eb4b906
--- /dev/null
+++ b/autorag-workspace/autorag/data/qa/query/openai_gen_query.py
@@ -0,0 +1,95 @@
+import itertools
+from typing import Dict, List
+
+from llama_index.core.base.llms.types import ChatMessage, MessageRole
+from llama_index.llms.openai.utils import to_openai_message_dicts
+from openai import AsyncClient
+from pydantic import BaseModel
+
+from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT
+
+
+class Response(BaseModel):
+    query: str
+
+
+# Single hop QA generation OpenAI
+async def query_gen_openai_base(
+    row: Dict,
+    client: AsyncClient,
+    messages: List[ChatMessage],
+    model_name: str = "gpt-4o-2024-08-06",
+):
+    context = list(itertools.chain.from_iterable(row["retrieval_gt_contents"]))
+    context_str = "Text:\n" + "\n".join(
+        [f"{i + 1}. {c}" for i, c in enumerate(context)]
+    )
+    user_prompt = f"{context_str}\n\nGenerated Question from the Text:\n"
+    messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt))
+
+    completion = await client.beta.chat.completions.parse(
+        model=model_name,
+        messages=to_openai_message_dicts(messages),
+        response_format=Response,
+    )
+    row["query"] = completion.choices[0].message.parsed.query
+    return row
+
+
+async def factoid_query_gen(
+    row: Dict,
+    client: AsyncClient,
+    model_name: str = "gpt-4o-2024-08-06",
+    lang: str = "en",
+) -> Dict:
+    return await query_gen_openai_base(
+        row, client, QUERY_GEN_PROMPT["factoid_single_hop"][lang], model_name
+    )
+
+
+async def concept_completion_query_gen(
+    row: Dict,
+    client: AsyncClient,
+    model_name: str = "gpt-4o-2024-08-06",
+    lang: str = "en",
+) -> Dict:
+    return await query_gen_openai_base(
+        row, client, QUERY_GEN_PROMPT["concept_completion"][lang], model_name
+    )
+
+
+class TwoHopIncrementalResponse(BaseModel):
+    answer: str
+    one_hop_question: str
+    two_hop_question: str
+
+
+async def two_hop_incremental(
+    row: Dict,
+    client: AsyncClient,
+    model_name: str = "gpt-4o-2024-08-06",
+    lang: str = "en",
+) -> Dict:
+    """
+    Create a two-hop question using an incremental prompt.
+    Incremental prompting is more effective for creating multi-hop questions.
+    The input retrieval_gt has to include at least two passages.
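+
+    A minimal usage sketch (hypothetical names; assumes each QA row samples at
+    least two passages and `openai_client` is an `openai.AsyncClient`):
+
+        qa = qa.batch_apply(two_hop_incremental, client=openai_client, lang="en")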
+ + :return: The two-hop question using openai incremental prompt + """ + messages = QUERY_GEN_PROMPT["two_hop_incremental"][lang] + passages = row["retrieval_gt_contents"] + assert ( + len(passages) >= 2 + ), "You have to sample more than two passages for making two-hop questions." + context_str = f"Document 1: {passages[0][0]}\nDocument 2: {passages[1][0]}" + user_prompt = f"{context_str}\n\nGenerated two-hop Question from two Documents:\n" + messages.append(ChatMessage(role=MessageRole.USER, content=user_prompt)) + + completion = await client.beta.chat.completions.parse( + model=model_name, + messages=to_openai_message_dicts(messages), + response_format=TwoHopIncrementalResponse, + ) + row["query"] = completion.choices[0].message.parsed.two_hop_question + return row diff --git a/autorag-workspace/autorag/data/qa/query/prompt.py b/autorag-workspace/autorag/data/qa/query/prompt.py new file mode 100644 index 0000000..b465a6d --- /dev/null +++ b/autorag-workspace/autorag/data/qa/query/prompt.py @@ -0,0 +1,202 @@ +from llama_index.core.base.llms.types import ChatMessage, MessageRole + +QUERY_GEN_PROMPT = { + "factoid_single_hop": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""You're an AI tasked to convert Text into a factoid question. +Factoid questions are those seeking brief, factual information that can be easily verified. They typically require a yes or no answer or a brief explanation and often inquire about specific details such as dates, names, places, or events. + +Examples of factoid questions include: + +- What is the capital of France? +- Who invented the light bulb? +- When was Wikipedia founded? + +Instructions: +1. Questions MUST BE extracted from given Text +2. Questions should be as detailed as possible from Text +3. Create questions that ask about factual information from the Text +4. Do not mention any of these in the questions: "in the given text", "in the provided information", etc. +Users do not know the passage source of the question, so it should not be mentioned in the question. +5. Do not ask about the file name or the file title. Ask about the content of the file. +For example, avoid to write questions like `What is the file name of the document?`""", + ) + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""당신은 주어진 Text를 '사실 질문'으로 변환하는 AI입니다. + +사실 질문(factoid questions)이란 사실적인 정보를 요구하는 질문으로, 쉽게 검증할 수 있는 답변을 필요로 합니다. 일반적으로 예/아니오 답변이나 간단한 설명을 요구하며, 날짜, 이름, 장소 또는 사건과 같은 구체적인 세부사항에 대해 묻는 질문입니다. + +사실 질문의 예는 다음과 같습니다: + + • 프랑스의 수도는 어디입니까? + • 전구를 발명한 사람은 누구입니까? + • 위키피디아는 언제 설립되었습니까? + +지침: + 1. 질문은 반드시 주어진 Text를 기반으로 작성되어야 합니다. + 2. 질문은 Text를 기반으로 가능한 한 구체적으로 작성되어야 합니다. + 3. Text에서 사실적 정보를 요구하는 질문을 만들어야 합니다. 즉, Text를 기반으로 사실 질문을 만드세요. + 4. 질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다. +사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다. + 5. 파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요. +예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요. + 6. 질문을 한국어로 작성하세요.""", + ) + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""あなたは与えられたTextを「実は質問」に変換するAIです。 + +事実質問(factoid questions)とは、事実的な情報を求める質問であり、容易に検証できる回答を必要とします。 一般的に、「はい/いいえ」の返答や簡単な説明を要求し、日付、名前、場所、または事件のような具体的な詳細事項について尋ねる質問です。 + +実は質問の例は次の通りです: + + • フランスの首都はどこですか? + • 電球を発明したのは誰ですか? + • ウィキペディアはいつ設立されましたか? + +指針: + 1. 質問は、必ず与えられたTextに基づいて作成されなければなりません。 + 2. 質問は、Textに基づいて可能な限り具体的に作成されなければなりません。 + 3. Textで事実的情報を要求する質問を作らなければなりません。 つまり、Textに基づいて質問を作成します。 + 4. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。 +ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。 + 5. 
ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。 +例えば、「このドキュメントのファイル名は何ですか? + 6. 質問を日本語で作成しなさい。""", + ) + ], + }, + "concept_completion": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""You're an AI tasked to convert Text into a "Concept Completion" Question. +A “concept completion” question asks directly about the essence or identity of a concept. + +Follow the following instructions. +Instructions: +1. Questions MUST BE extracted from given Text +2. Questions should be as detailed as possible from Text +3. Create questions that ask about information from the Text +4. MUST include specific keywords from the Text. +5. Do not mention any of these in the questions: "in the given text", "in the provided information", etc. +Users do not know the passage source of the question, so it should not be mentioned in the question. +6. Do not ask about the file name or the file title. Ask about the content of the file. +For example, avoid to write questions like `What is the file name of the document?""", + ) + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""당신은 Text를 “개념 완성” 질문으로 변환하는 AI입니다. +"개념 완성" 질문은 개념의 본질이나 정체성에 대해 직접적으로 묻는 질문입니다. + +다음 지시사항을 따르세요. +지시사항: +1. 질문은 반드시 주어진 Text를 기반으로 작성되어야 합니다. +2. 질문은 Text를 기반으로 가능한 한 자세하게 작성되어야 합니다. +3. Text에서 제공된 정보를 묻는 질문을 생성하세요. +4. Text의 특정 키워드를 반드시 질문에 포함하세요. +5. 질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다. +사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다. +6. 파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요. +예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요. +7. 질문을 한국어로 작성하세요.""", + ) + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="""あなたはTextを「概念完成」の質問に変換するAIです。 +「概念完成」の質問は概念の本質やアイデンティティについて直接的に尋ねる質問です。 + +次の指示に従います。 +指示事項: +1. 質問は、必ず与えられたTextに基づいて作成されなければなりません。 +2. 質問は、Textに基づいてできるだけ詳しく作成されなければなりません。 +3. Textで提供された情報を尋ねる質問を作成します。 +4. Textの特定のキーワードを必ず質問に含みます。 +5. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。 +ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。 +6. ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。 +例えば、「このドキュメントのファイル名は何ですか? +7. 質問を日本語で書きましょう。""", + ) + ], + }, + "two_hop_incremental": { + "en": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="Generate a multi-hop question for the given answer which requires reference to all of the given documents.", + ), + ChatMessage( + role=MessageRole.USER, + content="""Document 1: The Municipality of Nuevo Laredo is located in the Mexican state of Tamaulipas. +Document 2: The Ciudad Deportiva (Sports City ¨ ¨) is a sports +complex in Nuevo Laredo, Mexico. It is home to the Tecolotes de +Nuevo Laredo Mexican Baseball League team and ...""", + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="""Answer: Tamaulipas +One-hop question (using Document 1): In which Mexican state is Nuevo Laredo located? +Two-hop question (using Document 2): In which Mexican state can one find the Ciudad Deportiva, home to the Tecolotes de Nuevo Laredo?""", + ), + ], + "ko": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="Generate a multi-hop question for the given answer which requires reference to all of the given documents.", + ), + ChatMessage( + role=MessageRole.USER, + content="""Document 1: The Municipality of Nuevo Laredo is located in the Mexican state of Tamaulipas. +Document 2: The Ciudad Deportiva (Sports City ¨ ¨) is a sports +complex in Nuevo Laredo, Mexico. 
It is home to the Tecolotes de +Nuevo Laredo Mexican Baseball League team and ...""", + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="""Answer: Tamaulipas +One-hop question (using Document 1): In which Mexican state is Nuevo Laredo located? +Two-hop question (using Document 2): In which Mexican state can one find the Ciudad Deportiva, home to the Tecolotes de Nuevo Laredo?""", + ), + ], + "ja": [ + ChatMessage( + role=MessageRole.SYSTEM, + content="与えられた答えに対するマルチホップ質問を生成し、与えられたすべての文書を参照する必要があります。", + ), + ChatMessage( + role=MessageRole.USER, + content="""Document 1: ヌエヴォ·ラレド自治体はメキシコのタマウリパス州にあります。 +Ciudad Deportiva(スポーツシティ ¨ ¨)はスポーツです +メキシコのヌエボ·ラレドにある複合施設です。 テコロテス·デ·テコロテスの故郷です +Nuevo Larredo メキシコ野球リーグのチームです···""", + ), + ChatMessage( + role=MessageRole.ASSISTANT, + content="""Answer: Tamaulipas +One-hop question (using Document 1): ヌエヴォ·ラレド自治体はどのメキシコの州にありますか? +Two-hop question (using Document 2): ヌエヴォ·ラレドのテコロテス·デ·テコロテスの故郷であるメキシコの州はどこですか?""", + ), + ], + }, +} + +# Experimental feature +QUERY_GEN_PROMPT_EXTRA = { + "multiple_queries": { + "en": "\nAdditional instructions:\n - Please make {n} questions.", + "ko": "\n추가 지침:\n - 질문은 {n}개를 만드세요.", + "ja": "\n追加指示:\n - 質問を{n}個作成してください。", + } +} diff --git a/autorag-workspace/autorag/data/qa/sample.py b/autorag-workspace/autorag/data/qa/sample.py new file mode 100644 index 0000000..b788dac --- /dev/null +++ b/autorag-workspace/autorag/data/qa/sample.py @@ -0,0 +1,26 @@ +import uuid +from typing import Iterable + +import pandas as pd + + +def random_single_hop( + corpus_df: pd.DataFrame, n: int, random_state: int = 42 +) -> pd.DataFrame: + sample_df = corpus_df.sample(n, random_state=random_state) + return pd.DataFrame( + { + "qid": [str(uuid.uuid4()) for _ in range(len(sample_df))], + "retrieval_gt": [[[id_]] for id_ in sample_df["doc_id"].tolist()], + } + ) + + +def range_single_hop(corpus_df: pd.DataFrame, idx_range: Iterable): + sample_df = corpus_df.iloc[idx_range] + return pd.DataFrame( + { + "qid": [str(uuid.uuid4()) for _ in range(len(sample_df))], + "retrieval_gt": [[[id_]] for id_ in sample_df["doc_id"].tolist()], + } + ) diff --git a/autorag-workspace/autorag/data/qa/schema.py b/autorag-workspace/autorag/data/qa/schema.py new file mode 100644 index 0000000..ed6b659 --- /dev/null +++ b/autorag-workspace/autorag/data/qa/schema.py @@ -0,0 +1,322 @@ +import logging +from typing import Callable, Optional, Dict, Awaitable, Any, Tuple, List +import uuid +import pandas as pd +from autorag.utils.util import process_batch, get_event_loop, fetch_contents + +from autorag.support import get_support_modules + +logger = logging.getLogger("AutoRAG") + + +class Raw: + """ + The Raw class that stored document parsing results. + It can do chunking. + It has two column names, 'raw_id' and 'contents'. 
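+
+    A minimal usage sketch (the chunk module name and parameters below are
+    illustrative, not a fixed API; use whatever chunking configuration your
+    project defines):
+
+        raw = Raw(parsed_df)
+        corpus = raw.chunk("llama_index_chunk", chunk_method="Token", chunk_size=512)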
+ """ + + def __init__(self, raw_df: Optional[pd.DataFrame] = None): + self.data = raw_df + + def batch_apply( + self, fn: Callable[[Dict, Any], Awaitable[Dict]], batch_size: int = 32, **kwargs + ) -> "Raw": + raw_dicts = self.data.to_dict(orient="records") + loop = get_event_loop() + tasks = [fn(raw_dict, **kwargs) for raw_dict in raw_dicts] + results = loop.run_until_complete(process_batch(tasks, batch_size)) + return Raw(pd.DataFrame(results)) + + def map(self, fn: Callable[[pd.DataFrame, Any], pd.DataFrame], **kwargs) -> "Raw": + return Raw(fn(self.data, **kwargs)) + + def flatmap(self, fn: Callable, **kwargs) -> "Raw": + return fn(self.data, **kwargs) + + def chunk(self, module_name: str, **module_params) -> "Corpus": + chunk_module = get_support_modules(module_name) + chunked_result = chunk_module(parsed_result=self.data, **module_params) + return Corpus(chunked_result, self) + + def __add__(self, other): + assert isinstance(other, Raw), "You can only add Raw instances." + self.data = pd.concat([self.data, other.data], ignore_index=True).reset_index( + drop=True + ) + return self + + +class Corpus: + """ + The Corpus class that stored chunked passages. + It can generate qa set, linked with Raw instance. + """ + + def __init__( + self, + corpus_df: Optional[pd.DataFrame] = None, + linked_raw: Optional[Raw] = None, + ): + self.data = corpus_df + self._linked_raw = linked_raw + + @property + def linked_raw(self) -> Raw: + return self._linked_raw + + @linked_raw.setter + def linked_raw(self, raw: Raw): + raise NotImplementedError("linked_raw is read-only.") + + def to_parquet(self, save_path: str): + """ + Save the corpus to the AutoRAG compatible parquet file. + It is not for the data creation, for running AutoRAG. + If you want to save it directly, use the below code. + `corpus.data.to_parquet(save_path)` + + :param save_path: The path to save the corpus. + """ + if not save_path.endswith(".parquet"): + raise ValueError("save_path must be ended with .parquet") + save_df = self.data.reset_index(drop=True) + save_df.to_parquet(save_path) + + def batch_apply( + self, fn: Callable[[Dict, Any], Awaitable[Dict]], batch_size: int = 32, **kwargs + ) -> "Corpus": + corpus_dicts = self.data.to_dict(orient="records") + loop = get_event_loop() + tasks = [fn(corpus_dict, **kwargs) for corpus_dict in corpus_dicts] + results = loop.run_until_complete(process_batch(tasks, batch_size)) + return Corpus(pd.DataFrame(results), self.linked_raw) + + def map( + self, fn: Callable[[pd.DataFrame, Any], pd.DataFrame], **kwargs + ) -> "Corpus": + return Corpus(fn(self.data, **kwargs), self.linked_raw) + + def sample(self, fn: Callable[[pd.DataFrame, Any], pd.DataFrame], **kwargs) -> "QA": + """ + Sample the corpus for making QA. + It selects the subset of the corpus and makes QA set from it. + You can generate questions from the created question. + It is the first step to make QA set from the corpus. + If you select just one passage from each passage, it will be a single-hop QA set. + If you select multiple passages from each passage, it will be a multi-hop QA set. + + :param fn: The select function to perform. + It returns QA dataframe. + :return: QA instance that is selected. + It contains qid and retrieval_gt columns. 
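+
+        A usage sketch with the bundled `random_single_hop` sampler (the sample
+        size here is only illustrative):
+
+            from autorag.data.qa.sample import random_single_hop
+
+            qa = corpus.sample(random_single_hop, n=50)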
+ """ + return QA(fn(self.data, **kwargs), self) + + +class QA: + def __init__( + self, + qa_df: Optional[pd.DataFrame] = None, + linked_corpus: Optional[Corpus] = None, + ): + self.data = qa_df + self._linked_corpus = linked_corpus + + @property + def linked_corpus(self) -> Corpus: + return self._linked_corpus + + @linked_corpus.setter + def linked_corpus(self, corpus: Corpus): + raise NotImplementedError("linked_corpus is read-only.") + + def batch_apply( + self, fn: Callable[[Dict, Any], Awaitable[Dict]], batch_size: int = 32, **kwargs + ) -> "QA": + qa_dicts = self.data.to_dict(orient="records") + loop = get_event_loop() + tasks = [fn(qa_dict, **kwargs) for qa_dict in qa_dicts] + results = loop.run_until_complete(process_batch(tasks, batch_size)) + + # Experimental feature + if fn.__name__ == "multiple_queries_gen": + return self._process_multiple_queries_gen(results) + + return QA(pd.DataFrame(results), self.linked_corpus) + + def batch_filter( + self, fn: Callable[[Dict, Any], Awaitable[bool]], batch_size: int = 32, **kwargs + ) -> "QA": + qa_dicts = self.data.to_dict(orient="records") + loop = get_event_loop() + tasks = [fn(qa_dict, **kwargs) for qa_dict in qa_dicts] + masks = loop.run_until_complete(process_batch(tasks, batch_size)) + return QA(self.data[masks], self.linked_corpus) + + def filter(self, fn: Callable[[Dict, Any], bool], **kwargs) -> "QA": + qa_dicts = self.data.to_dict(orient="records") + masks = [fn(qa_dict, **kwargs) for qa_dict in qa_dicts] + return QA(self.data[masks], self.linked_corpus) + + def map(self, fn: Callable[[pd.DataFrame, Any], pd.DataFrame], **kwargs) -> "QA": + return QA(fn(self.data, **kwargs), self.linked_corpus) + + def make_retrieval_gt_contents(self) -> "QA": + """ + Make retrieval_gt_contents column from retrieval_gt column. + :return: The QA instance that has a retrieval_gt_contents column. + """ + self.data["retrieval_gt_contents"] = self.data["retrieval_gt"].apply( + lambda x: fetch_contents(self.linked_corpus.data, x) + ) + return self + + def to_parquet(self, qa_save_path: str, corpus_save_path: str): + """ + Save the qa and corpus to the AutoRAG compatible parquet file. + It is not for the data creation, for running AutoRAG. + If you want to save it directly, use the below code. + `qa.data.to_parquet(save_path)` + + :param qa_save_path: The path to save the qa dataset. + :param corpus_save_path: The path to save the corpus. + """ + if not qa_save_path.endswith(".parquet"): + raise ValueError("save_path must be ended with .parquet") + if not corpus_save_path.endswith(".parquet"): + raise ValueError("save_path must be ended with .parquet") + save_df = self.data[ + ["qid", "query", "retrieval_gt", "generation_gt"] + ].reset_index(drop=True) + save_df.to_parquet(qa_save_path) + self.linked_corpus.to_parquet(corpus_save_path) + + def update_corpus(self, new_corpus: Corpus) -> "QA": + """ + Update linked corpus. + Not just replace linked_corpus to the new Corpus, + it replaces the whole `retrieval_gt` to the new corpus using `linked_raw`. + The QA data must have a `retrieval_gt` column. + + :param new_corpus: Corpus that you want to replace. + Must have valid `linked_raw` and `raw_id`, `raw_start_idx`, `raw_end_idx` columns. + :return: The QA instance that updated linked corpus. 
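+
+        A usage sketch (the chunking parameters are illustrative): re-chunk the
+        linked raw data and carry the existing QA set over to the new corpus.
+
+            new_corpus = qa.linked_corpus.linked_raw.chunk(
+                "llama_index_chunk", chunk_method="Token", chunk_size=256
+            )
+            new_qa = qa.update_corpus(new_corpus)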
+ """ + self.data["evidence_path"] = ( + self.data["retrieval_gt"] + .apply( + lambda x: fetch_contents( + self.linked_corpus.data, + x, + column_name="path", + ) + ) + .tolist() + ) + self.data["evidence_page"] = self.data["retrieval_gt"].apply( + lambda x: list( + map( + lambda lst: list(map(lambda x: x.get("page", -1), lst)), + fetch_contents(self.linked_corpus.data, x, column_name="metadata"), + ) + ) + ) + if "evidence_start_end_idx" not in self.data.columns: + # make evidence start_end_idx + self.data["evidence_start_end_idx"] = ( + self.data["retrieval_gt"] + .apply( + lambda x: fetch_contents( + self.linked_corpus.data, + x, + column_name="start_end_idx", + ) + ) + .tolist() + ) + + # matching the new corpus with the old corpus + path_corpus_dict = QA.__make_path_corpus_dict(new_corpus.data) + new_retrieval_gt = self.data.apply( + lambda row: QA.__match_index_row( + row["evidence_start_end_idx"], + row["evidence_path"], + row["evidence_page"], + path_corpus_dict, + ), + axis=1, + ).tolist() + new_qa = self.data.copy(deep=True)[["qid", "query", "generation_gt"]] + new_qa["retrieval_gt"] = new_retrieval_gt + return QA(new_qa, new_corpus) + + @staticmethod + def __match_index(target_idx: Tuple[int, int], dst_idx: Tuple[int, int]) -> bool: + """ + Check if the target_idx is overlap by the dst_idx. + """ + target_start, target_end = target_idx + dst_start, dst_end = dst_idx + return ( + dst_start <= target_start <= dst_end or dst_start <= target_end <= dst_end + ) + + @staticmethod + def __match_index_row( + evidence_indices: List[List[Tuple[int, int]]], + evidence_paths: List[List[str]], + evidence_pages: List[List[int]], + path_corpus_dict: Dict, + ) -> List[List[str]]: + """ + Find the matched passage from new_corpus. + + :param evidence_indices: The evidence indices at the corresponding Raw. + Its shape is the same as the retrieval_gt. + :param evidence_paths: The evidence paths at the corresponding Raw. + Its shape is the same as the retrieval_gt. + :param path_corpus_dict: The key is the path name, and the value is the corpus dataframe that only contains the path in the key. + You can make it using `QA.__make_path_corpus_dict`. 
+ :return: + """ + result = [] + for i, idx_list in enumerate(evidence_indices): + sub_result = [] + for j, idx in enumerate(idx_list): + path_corpus_df = path_corpus_dict[evidence_paths[i][j]] + if evidence_pages[i][j] >= 0: + path_corpus_df = path_corpus_df.loc[ + path_corpus_df["metadata"].apply(lambda x: x.get("page", -1)) + == evidence_pages[i][j] + ] + matched_corpus = path_corpus_df.loc[ + path_corpus_df["start_end_idx"].apply( + lambda x: QA.__match_index(idx, x) + ) + ] + sub_result.extend(matched_corpus["doc_id"].tolist()) + result.append(sub_result) + return result + + @staticmethod + def __make_path_corpus_dict(corpus_df: pd.DataFrame) -> Dict[str, pd.DataFrame]: + return { + path: corpus_df[corpus_df["path"] == path] + for path in corpus_df["path"].unique() + } + + # Experimental feature + def _process_multiple_queries_gen(self, results: List[Dict]) -> "QA": + data = [] + for result in results: + queries = result["query"].split("\n") + for query in queries: + new_result = { + key: (str(uuid.uuid4()) if key == "qid" else result[key]) + for key in result.keys() + } + new_result["query"] = query + data.append(new_result) + df = pd.DataFrame(data) + return QA(df, self.linked_corpus) diff --git a/autorag-workspace/autorag/data/utils/__init__.py b/autorag-workspace/autorag/data/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/data/utils/util.py b/autorag-workspace/autorag/data/utils/util.py new file mode 100644 index 0000000..c1fea52 --- /dev/null +++ b/autorag-workspace/autorag/data/utils/util.py @@ -0,0 +1,103 @@ +import mimetypes +import os +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Tuple, Callable + +import pandas as pd +import yaml +from langchain_core.documents import Document +from llama_index.core.schema import NodeRelationship + +from autorag.schema import Module +from autorag.utils.util import make_combinations, explode + + +def get_file_metadata(file_path: str) -> Dict: + """Get some handy metadate from filesystem. 
+ + Args: + file_path: str: file path in str + """ + return { + "file_path": file_path, + "file_name": os.path.basename(file_path), + "file_type": mimetypes.guess_type(file_path)[0], + "file_size": os.path.getsize(file_path), + "creation_datetime": datetime.fromtimestamp( + Path(file_path).stat().st_ctime + ).strftime("%Y-%m-%d"), + "last_modified_datetime": datetime.fromtimestamp( + Path(file_path).stat().st_mtime + ).strftime("%Y-%m-%d"), + "last_accessed_datetime": datetime.fromtimestamp( + Path(file_path).stat().st_atime + ).strftime("%Y-%m-%d"), + } + + +def add_essential_metadata(metadata: Dict) -> Dict: + if "last_modified_datetime" not in metadata: + metadata["last_modified_datetime"] = datetime.now() + return metadata + + +def corpus_df_to_langchain_documents(corpus_df: pd.DataFrame) -> List[Document]: + page_contents = corpus_df["contents"].tolist() + ids = corpus_df["doc_id"].tolist() + metadatas = corpus_df["metadata"].tolist() + return list( + map( + lambda x: Document(page_content=x[0], metadata={"filename": x[1], **x[2]}), + zip(page_contents, ids, metadatas), + ) + ) + + +def add_essential_metadata_llama_text_node(metadata: Dict, relationships: Dict) -> Dict: + if "last_modified_datetime" not in metadata: + metadata["last_modified_datetime"] = datetime.now() + + if "prev_id" not in metadata: + if NodeRelationship.PREVIOUS in relationships: + prev_node = relationships.get(NodeRelationship.PREVIOUS, None) + if prev_node: + metadata["prev_id"] = prev_node.node_id + + if "next_id" not in metadata: + if NodeRelationship.NEXT in relationships: + next_node = relationships.get(NodeRelationship.NEXT, None) + if next_node: + metadata["next_id"] = next_node.node_id + return metadata + + +def load_yaml(yaml_path: str): + if not os.path.exists(yaml_path): + raise ValueError(f"YAML file {yaml_path} does not exist.") + with open(yaml_path, "r", encoding="utf-8") as stream: + try: + yaml_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + raise ValueError(f"YAML file {yaml_path} could not be loaded.") from exc + return yaml_dict["modules"] + + +def get_param_combinations(modules: List[Dict]) -> Tuple[List[Callable], List[Dict]]: + module_callable_list, module_params_list = [], [] + for module in modules: + module_instance = Module.from_dict(module) + module_params_list.append(module_instance.module_param) + module_callable_list.append(module_instance.module) + + combinations = list(map(make_combinations, module_params_list)) + module_list, combination_list = explode(module_callable_list, combinations) + return module_list, combination_list + + +def get_start_end_idx(original_text: str, search_str: str) -> Tuple[int, int]: + start_idx = original_text.find(search_str) + if start_idx == -1: + return 0, 0 + end_idx = start_idx + len(search_str) + return start_idx, end_idx - 1 diff --git a/autorag-workspace/autorag/deploy/__init__.py b/autorag-workspace/autorag/deploy/__init__.py new file mode 100644 index 0000000..4d78e7d --- /dev/null +++ b/autorag-workspace/autorag/deploy/__init__.py @@ -0,0 +1,9 @@ +from .base import ( + extract_node_line_names, + extract_node_strategy, + summary_df_to_yaml, + extract_best_config, + Runner, +) +from .api import ApiRunner +from .gradio import GradioRunner diff --git a/autorag-workspace/autorag/deploy/api.py b/autorag-workspace/autorag/deploy/api.py new file mode 100644 index 0000000..ae7184d --- /dev/null +++ b/autorag-workspace/autorag/deploy/api.py @@ -0,0 +1,293 @@ +import logging +import os +import pathlib +import uuid +from typing import 
Dict, Optional, List, Union, Literal + +import pandas as pd +from quart import Quart, request, jsonify +from quart.helpers import stream_with_context +from pydantic import BaseModel, ValidationError + +from autorag.deploy.base import BaseRunner +from autorag.nodes.generator.base import BaseGenerator +from autorag.nodes.promptmaker.base import BasePromptMaker +from autorag.utils.util import fetch_contents, to_list + +logger = logging.getLogger("AutoRAG") + +deploy_dir = pathlib.Path(__file__).parent +root_dir = pathlib.Path(__file__).parent.parent + +VERSION_PATH = os.path.join(root_dir, "VERSION") + + +class QueryRequest(BaseModel): + query: str + result_column: Optional[str] = "generated_texts" + + +class RetrievedPassage(BaseModel): + content: str + doc_id: str + score: float + filepath: Optional[str] = None + file_page: Optional[int] = None + start_idx: Optional[int] = None + end_idx: Optional[int] = None + + +class RunResponse(BaseModel): + result: Union[str, List[str]] + retrieved_passage: List[RetrievedPassage] + + +class RetrievalResponse(BaseModel): + passages: List[RetrievedPassage] + + +class StreamResponse(BaseModel): + """ + When the type is generated_text, only generated_text is returned. The other fields are None. + When the type is retrieved_passage, only retrieved_passage and passage_index are returned. The other fields are None. + """ + + type: Literal["generated_text", "retrieved_passage"] + generated_text: Optional[str] + retrieved_passage: Optional[RetrievedPassage] + passage_index: Optional[int] + + +class VersionResponse(BaseModel): + version: str + + +class ApiRunner(BaseRunner): + def __init__(self, config: Dict, project_dir: Optional[str] = None): + super().__init__(config, project_dir) + self.app = Quart(__name__) + + data_dir = os.path.join(project_dir, "data") + self.corpus_df = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + self.__add_api_route() + + def __add_api_route(self): + @self.app.route("/v1/run", methods=["POST"]) + async def run_query(): + try: + data = await request.get_json() + data = QueryRequest(**data) + except ValidationError as e: + return jsonify(e.errors()), 400 + + previous_result = pd.DataFrame( + { + "qid": str(uuid.uuid4()), + "query": [data.query], + "retrieval_gt": [[]], + "generation_gt": [""], + } + ) # pseudo qa data for execution + for module_instance, module_param in zip( + self.module_instances, self.module_params + ): + new_result = module_instance.pure( + previous_result=previous_result, **module_param + ) + duplicated_columns = previous_result.columns.intersection( + new_result.columns + ) + drop_previous_result = previous_result.drop(columns=duplicated_columns) + previous_result = pd.concat([drop_previous_result, new_result], axis=1) + + # Simulate processing the query + generated_text = previous_result[data.result_column].tolist()[0] + retrieved_passage = self.extract_retrieve_passage(previous_result) + + response = RunResponse( + result=generated_text, retrieved_passage=retrieved_passage + ) + + return jsonify(response.model_dump()), 200 + + @self.app.route("/v1/retrieve", methods=["POST"]) + async def run_retrieve_only(): + data = await request.get_json() + query = data.get("query", None) + if query is None: + return jsonify( + { + "error": "Invalid request. You need to include 'query' in the request body." 
+ } + ), 400 + + previous_result = pd.DataFrame( + { + "qid": str(uuid.uuid4()), + "query": [query], + "retrieval_gt": [[]], + "generation_gt": [""], + } + ) # pseudo qa data for execution + for module_instance, module_param in zip( + self.module_instances, self.module_params + ): + if isinstance(module_instance, BasePromptMaker) or isinstance( + module_instance, BaseGenerator + ): + continue + new_result = module_instance.pure( + previous_result=previous_result, **module_param + ) + duplicated_columns = previous_result.columns.intersection( + new_result.columns + ) + drop_previous_result = previous_result.drop(columns=duplicated_columns) + previous_result = pd.concat([drop_previous_result, new_result], axis=1) + + # Simulate processing the query + retrieved_passages = self.extract_retrieve_passage(previous_result) + + retrieval_response = RetrievalResponse(passages=retrieved_passages) + return jsonify(retrieval_response.model_dump()), 200 + + @self.app.route("/v1/stream", methods=["POST"]) + async def stream_query(): + try: + data = await request.get_json() + data = QueryRequest(**data) + except ValidationError as e: + return jsonify(e.errors()), 400 + + @stream_with_context + async def generate(): + previous_result = pd.DataFrame( + { + "qid": str(uuid.uuid4()), + "query": [data.query], + "retrieval_gt": [[]], + "generation_gt": [""], + } + ) # pseudo qa data for execution + + for module_instance, module_param in zip( + self.module_instances, self.module_params + ): + if not isinstance(module_instance, BaseGenerator): + new_result = module_instance.pure( + previous_result=previous_result, **module_param + ) + duplicated_columns = previous_result.columns.intersection( + new_result.columns + ) + drop_previous_result = previous_result.drop( + columns=duplicated_columns + ) + previous_result = pd.concat( + [drop_previous_result, new_result], axis=1 + ) + else: + retrieved_passages = self.extract_retrieve_passage( + previous_result + ) + for i, retrieved_passage in enumerate(retrieved_passages): + yield ( + StreamResponse( + type="retrieved_passage", + generated_text=None, + retrieved_passage=retrieved_passage, + passage_index=i, + ) + .model_dump_json() + .encode("utf-8") + ) + # Start streaming of the result + assert len(previous_result) == 1 + prompt: str = previous_result["prompts"].tolist()[0] + async for delta in module_instance.astream( + prompt=prompt, **module_param + ): + response = StreamResponse( + type="generated_text", + generated_text=delta, + retrieved_passage=None, + passage_index=None, + ) + yield response.model_dump_json().encode("utf-8") + + return generate(), 200, {"X-Something": "value"} + + @self.app.route("/version", methods=["GET"]) + def get_version(): + with open(VERSION_PATH, "r") as f: + version = f.read().strip() + response = VersionResponse(version=version) + return jsonify(response.model_dump()), 200 + + def run_api_server( + self, host: str = "0.0.0.0", port: int = 8000, remote: bool = True, **kwargs + ): + """ + Run the pipeline as an api server. + Here is api endpoint documentation => https://docs.auto-rag.com/deploy/api_endpoint.html + + :param host: The host of the api server. + :param port: The port of the api server. + :param remote: Whether to expose the api server to the public internet using ngrok. + :param kwargs: Other arguments for Flask app.run. 
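+
+        A minimal usage sketch (the trial path is hypothetical):
+
+            runner = ApiRunner.from_trial_folder("./project_dir/0")
+            runner.run_api_server(host="0.0.0.0", port=8000, remote=False)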
+ """ + logger.info(f"Run api server at {host}:{port}") + if remote: + from pyngrok import ngrok + + http_tunnel = ngrok.connect(str(port), "http") + public_url = http_tunnel.public_url + logger.info(f"Public API URL: {public_url}") + self.app.run(host=host, port=port, **kwargs) + + def extract_retrieve_passage(self, df: pd.DataFrame) -> List[RetrievedPassage]: + retrieved_ids: List[str] = df["retrieved_ids"].tolist()[0] + contents = fetch_contents(self.corpus_df, [retrieved_ids])[0] + scores = df["retrieve_scores"].tolist()[0] + if "path" in self.corpus_df.columns: + paths = fetch_contents(self.corpus_df, [retrieved_ids], column_name="path")[ + 0 + ] + else: + paths = [None] * len(retrieved_ids) + metadatas = fetch_contents( + self.corpus_df, [retrieved_ids], column_name="metadata" + )[0] + if "start_end_idx" in self.corpus_df.columns: + start_end_indices = fetch_contents( + self.corpus_df, [retrieved_ids], column_name="start_end_idx" + )[0] + else: + start_end_indices = [None] * len(retrieved_ids) + start_end_indices = to_list(start_end_indices) + return list( + map( + lambda content, + doc_id, + score, + path, + metadata, + start_end_idx: RetrievedPassage( + content=content, + doc_id=doc_id, + score=score, + filepath=path, + file_page=metadata.get("page", None), + start_idx=start_end_idx[0] if start_end_idx else None, + end_idx=start_end_idx[1] if start_end_idx else None, + ), + contents, + retrieved_ids, + scores, + paths, + metadatas, + start_end_indices, + ) + ) diff --git a/autorag-workspace/autorag/deploy/base.py b/autorag-workspace/autorag/deploy/base.py new file mode 100644 index 0000000..ab065f1 --- /dev/null +++ b/autorag-workspace/autorag/deploy/base.py @@ -0,0 +1,235 @@ +import logging +import os +import pathlib +import uuid +from copy import deepcopy +from typing import Optional, Dict, List + +import pandas as pd +import yaml + +from autorag.support import get_support_modules +from autorag.utils.util import load_summary_file, load_yaml_config + +logger = logging.getLogger("AutoRAG") + + +def extract_node_line_names(config_dict: Dict) -> List[str]: + """ + Extract node line names with the given config dictionary order. + + :param config_dict: The YAML configuration dict for the pipeline. + You can load this to access trail_folder/config.yaml. + :return: The list of node line names. + It is the order of the node line names in the pipeline. + """ + return [node_line["node_line_name"] for node_line in config_dict["node_lines"]] + + +def extract_node_strategy(config_dict: Dict) -> Dict: + """ + Extract node strategies with the given config dictionary. + The return value is a dictionary of the node type and its strategy. + + :param config_dict: The YAML configuration dict for the pipeline. + You can load this to access trail_folder/config.yaml. + :return: Key is node_type and value is strategy dict. + """ + return { + node["node_type"]: node.get("strategy", {}) + for node_line in config_dict["node_lines"] + for node in node_line["nodes"] + } + + +def summary_df_to_yaml(summary_df: pd.DataFrame, config_dict: Dict) -> Dict: + """ + Convert trial summary dataframe to config yaml file. + + :param summary_df: The trial summary dataframe of the evaluated trial. + :param config_dict: The yaml configuration dict for the pipeline. + You can load this to access trail_folder/config.yaml. + :return: Dictionary of config yaml file. + You can save this dictionary to yaml file. 
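+
+    A usage sketch (hypothetical file names; `summary_df` is assumed to be an
+    already-loaded trial summary dataframe):
+
+        import yaml
+
+        with open("config.yaml") as f:
+            config_dict = yaml.safe_load(f)
+        yaml_dict = summary_df_to_yaml(summary_df, config_dict)
+        with open("extracted_config.yaml", "w") as f:
+            yaml.safe_dump(yaml_dict, f)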
+ """ + + # summary_df columns : 'node_line_name', 'node_type', 'best_module_filename', + # 'best_module_name', 'best_module_params', 'best_execution_time' + node_line_names = extract_node_line_names(config_dict) + node_strategies = extract_node_strategy(config_dict) + strategy_df = pd.DataFrame( + { + "node_type": list(node_strategies.keys()), + "strategy": list(node_strategies.values()), + } + ) + summary_df = summary_df.merge(strategy_df, on="node_type", how="left") + summary_df["categorical_node_line_name"] = pd.Categorical( + summary_df["node_line_name"], categories=node_line_names, ordered=True + ) + summary_df = summary_df.sort_values(by="categorical_node_line_name") + grouped = summary_df.groupby("categorical_node_line_name", observed=False) + + node_lines = [ + { + "node_line_name": node_line_name, + "nodes": [ + { + "node_type": row["node_type"], + "strategy": row["strategy"], + "modules": [ + { + "module_type": row["best_module_name"], + **row["best_module_params"], + } + ], + } + for _, row in node_line.iterrows() + ], + } + for node_line_name, node_line in grouped + ] + return {"node_lines": node_lines} + + +def extract_best_config(trial_path: str, output_path: Optional[str] = None) -> Dict: + """ + Extract the optimal pipeline from the evaluated trial. + + :param trial_path: The path to the trial directory that you want to extract the pipeline from. + Must already be evaluated. + :param output_path: Output path that pipeline yaml file will be saved. + Must be .yaml or .yml file. + If None, it does not save YAML file and just returns dict values. + Default is None. + :return: The dictionary of the extracted pipeline. + """ + summary_path = os.path.join(trial_path, "summary.csv") + if not os.path.exists(summary_path): + raise ValueError(f"summary.csv does not exist in {trial_path}.") + trial_summary_df = load_summary_file( + summary_path, dict_columns=["best_module_params"] + ) + config_yaml_path = os.path.join(trial_path, "config.yaml") + with open(config_yaml_path, "r") as f: + config_dict = yaml.safe_load(f) + yaml_dict = summary_df_to_yaml(trial_summary_df, config_dict) + yaml_dict["vectordb"] = extract_vectordb_config(trial_path) + if output_path is not None: + with open(output_path, "w") as f: + yaml.safe_dump(yaml_dict, f) + return yaml_dict + + +def extract_vectordb_config(trial_path: str) -> List[Dict]: + # get vectordb.yaml file + project_dir = pathlib.PurePath(os.path.realpath(trial_path)).parent + vectordb_config_path = os.path.join(project_dir, "resources", "vectordb.yaml") + if not os.path.exists(vectordb_config_path): + raise ValueError(f"vectordb.yaml does not exist in {vectordb_config_path}.") + with open(vectordb_config_path, "r") as f: + vectordb_dict = yaml.safe_load(f) + result = vectordb_dict.get("vectordb", []) + if len(result) != 0: + return result + # return default setting of chroma + return [ + { + "name": "default", + "db_type": "chroma", + "client_type": "persistent", + "embedding_model": "openai", + "collection_name": "openai", + "path": os.path.join(project_dir, "resources", "chroma"), + } + ] + + +class BaseRunner: + def __init__(self, config: Dict, project_dir: Optional[str] = None): + self.config = config + project_dir = os.getcwd() if project_dir is None else project_dir + os.environ["PROJECT_DIR"] = project_dir + + # init modules + node_lines = deepcopy(self.config["node_lines"]) + self.module_instances = [] + self.module_params = [] + for node_line in node_lines: + for node in node_line["nodes"]: + if len(node["modules"]) != 1: + raise 
ValueError( + "The number of modules in a node must be 1 for using runner." + "Please use extract_best_config method for extracting yaml file from evaluated trial." + ) + module = node["modules"][0] + module_type = module.pop("module_type") + module_params = module + module_instance = get_support_modules(module_type)( + project_dir=project_dir, + **module_params, + ) + self.module_instances.append(module_instance) + self.module_params.append(module_params) + + @classmethod + def from_yaml(cls, yaml_path: str, project_dir: Optional[str] = None): + """ + Load Runner from the YAML file. + Must be extracted YAML file from the evaluated trial using the extract_best_config method. + + :param yaml_path: The path of the YAML file. + :param project_dir: The path of the project directory. + Default is the current directory. + :return: Initialized Runner. + """ + config = load_yaml_config(yaml_path) + return cls(config, project_dir=project_dir) + + @classmethod + def from_trial_folder(cls, trial_path: str): + """ + Load Runner from the evaluated trial folder. + Must already be evaluated using Evaluator class. + It sets the project_dir as the parent directory of the trial folder. + + :param trial_path: The path of the trial folder. + :return: Initialized Runner. + """ + config = extract_best_config(trial_path) + return cls(config, project_dir=os.path.dirname(trial_path)) + + +class Runner(BaseRunner): + def run(self, query: str, result_column: str = "generated_texts"): + """ + Run the pipeline with query. + The loaded pipeline must start with a single query, + so the first module of the pipeline must be `query_expansion` or `retrieval` module. + + :param query: The query of the user. + :param result_column: The result column name for the answer. + Default is `generated_texts`, which is the output of the `generation` module. + :return: The result of the pipeline. + """ + previous_result = pd.DataFrame( + { + "qid": str(uuid.uuid4()), + "query": [query], + "retrieval_gt": [[]], + "generation_gt": [""], + } + ) # pseudo qa data for execution + for module_instance, module_param in zip( + self.module_instances, self.module_params + ): + new_result = module_instance.pure( + previous_result=previous_result, **module_param + ) + duplicated_columns = previous_result.columns.intersection( + new_result.columns + ) + drop_previous_result = previous_result.drop(columns=duplicated_columns) + previous_result = pd.concat([drop_previous_result, new_result], axis=1) + + return previous_result[result_column].tolist()[0] diff --git a/autorag-workspace/autorag/deploy/gradio.py b/autorag-workspace/autorag/deploy/gradio.py new file mode 100644 index 0000000..a56332b --- /dev/null +++ b/autorag-workspace/autorag/deploy/gradio.py @@ -0,0 +1,74 @@ +import logging +import uuid + +import pandas as pd + +from autorag.deploy.base import BaseRunner + +import gradio as gr + + +logger = logging.getLogger("AutoRAG") + + +class GradioRunner(BaseRunner): + def run_web( + self, + server_name: str = "0.0.0.0", + server_port: int = 7680, + share: bool = False, + **kwargs, + ): + """ + Run web interface to interact pipeline. + You can access the web interface at `http://server_name:server_port` in your browser + + :param server_name: The host of the web. Default is 0.0.0.0. + :param server_port: The port of the web. Default is 7680. + :param share: Whether to create a publicly shareable link. Default is False. + :param kwargs: Other arguments for gr.ChatInterface.launch. 
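+
+        A minimal usage sketch (the trial folder path below is hypothetical):
+
+            runner = GradioRunner.from_trial_folder("./benchmark/0")
+            runner.run_web(server_port=7680)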
+ """ + + logger.info(f"Run web interface at http://{server_name}:{server_port}") + + def get_response(message, _): + return self.run(message) + + gr.ChatInterface( + get_response, title="📚 AutoRAG", retry_btn=None, undo_btn=None + ).launch( + server_name=server_name, server_port=server_port, share=share, **kwargs + ) + + def run(self, query: str, result_column: str = "generated_texts"): + """ + Run the pipeline with query. + The loaded pipeline must start with a single query, + so the first module of the pipeline must be `query_expansion` or `retrieval` module. + + :param query: The query of the user. + :param result_column: The result column name for the answer. + Default is `generated_texts`, which is the output of the `generation` module. + :return: The result of the pipeline. + """ + previous_result = pd.DataFrame( + { + "qid": str(uuid.uuid4()), + "query": [query], + "retrieval_gt": [[]], + "generation_gt": [""], + } + ) # pseudo qa data for execution + for module_instance, module_param in zip( + self.module_instances, self.module_params + ): + new_result = module_instance.pure( + previous_result=previous_result, **module_param + ) + duplicated_columns = previous_result.columns.intersection( + new_result.columns + ) + drop_previous_result = previous_result.drop(columns=duplicated_columns) + previous_result = pd.concat([drop_previous_result, new_result], axis=1) + + return previous_result[result_column].tolist()[0] diff --git a/autorag-workspace/autorag/deploy/swagger.yml b/autorag-workspace/autorag/deploy/swagger.yml new file mode 100644 index 0000000..390422c --- /dev/null +++ b/autorag-workspace/autorag/deploy/swagger.yml @@ -0,0 +1,202 @@ +openapi: 3.0.0 +info: + title: Example API + version: 1.0.1 +paths: + /v1/run: + post: + summary: Run a query and get generated text with retrieved passages + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + query: + type: string + description: The query string + result_column: + type: string + description: The result column name + default: generated_texts + required: + - query + responses: + '200': + description: Successful response + content: + application/json: + schema: + type: object + properties: + type: + type: string + enum: + - generated_text + - retrieved_passage + description: | + When the type is "generated_text", only "generated_text" is returned. + The other fields are None. When the type is "retrieved_passage", only + "retrieved_passage" and "passage_index" are returned. The other fields are None. + generated_text: + type: string + nullable: true + description: | + The generated text, only present when "type" is "generated_text". + retrieved_passage: + type: object + nullable: true + properties: + content: + type: string + doc_id: + type: string + filepath: + type: string + nullable: true + file_page: + type: integer + nullable: true + start_idx: + type: integer + nullable: true + end_idx: + type: integer + nullable: true + passage_index: + type: integer + nullable: true + description: | + The index of the retrieved passage, only present when "type" is "retrieved_passage". + required: + - type + oneOf: + - required: + - generated_text + - required: + - retrieved_passage + - passage_index + /v1/retrieve: + post: + summary: Retrieve documents based on a query + operationId: runRetrieveOnly + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + query: + type: string + description: The query string to retrieve documents. 
+ required: + - query + responses: + '200': + description: Successful retrieval of documents + content: + application/json: + schema: + type: object + properties: + passages: + type: array + items: + type: object + properties: + doc_id: + type: string + description: The unique identifier for the document. + content: + type: string + description: The content of the retrieved document. + score: + type: number + format: float + description: The score of the retrieved document. + '400': + description: Invalid request due to missing query parameter + content: + application/json: + schema: + type: object + properties: + error: + type: string + description: Error message explaining the issue. + /v1/stream: + post: + summary: Stream generated text with retrieved passages + description: > + This endpoint streams the generated text line by line. The `retrieved_passage` + is sent first, followed by the `result` streamed incrementally. + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + query: + type: string + description: The query string + result_column: + type: string + description: The result column name + default: generated_texts + required: + - query + responses: + '200': + description: Successful response with streaming + content: + text/event-stream: + schema: + type: object + properties: + result: + oneOf: + - type: string + - type: array + items: + type: string + description: The result text or list of texts (streamed line by line) + retrieved_passage: + type: array + items: + type: object + properties: + content: + type: string + doc_id: + type: string + filepath: + type: string + nullable: true + file_page: + type: integer + nullable: true + start_idx: + type: integer + nullable: true + end_idx: + type: integer + nullable: true + + /version: + get: + summary: Get the API version + description: Returns the current version of the API as a string. 
+ responses: + '200': + description: Successful response + content: + application/json: + schema: + type: object + properties: + version: + type: string + description: The version of the API diff --git a/autorag-workspace/autorag/embedding/__init__.py b/autorag-workspace/autorag/embedding/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/embedding/base.py b/autorag-workspace/autorag/embedding/base.py new file mode 100644 index 0000000..e103fd9 --- /dev/null +++ b/autorag-workspace/autorag/embedding/base.py @@ -0,0 +1,141 @@ +import logging +import sys + +from random import random +from typing import List, Union, Dict + +from llama_index.core.embeddings.mock_embed_model import MockEmbedding +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.embeddings.openai import OpenAIEmbeddingModelType +from llama_index.embeddings.ollama import OllamaEmbedding +from langchain_openai.embeddings import OpenAIEmbeddings + +from autorag import LazyInit + +logger = logging.getLogger("AutoRAG") + + +class MockEmbeddingRandom(MockEmbedding): + """Mock embedding with random vectors.""" + + def _get_vector(self) -> List[float]: + return [random() for _ in range(self.embed_dim)] + + +embedding_models = { + # llama index + "openai": LazyInit( + OpenAIEmbedding + ), # default model is OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002 + "openai_embed_3_large": LazyInit( + OpenAIEmbedding, model_name=OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE + ), + "openai_embed_3_small": LazyInit( + OpenAIEmbedding, model_name=OpenAIEmbeddingModelType.TEXT_EMBED_3_SMALL + ), + "mock": LazyInit(MockEmbeddingRandom, embed_dim=768), + # langchain + "openai_langchain": LazyInit(OpenAIEmbeddings), + "ollama": LazyInit(OllamaEmbedding), +} + +try: + # you can use your own model in this way. + from llama_index.embeddings.huggingface import HuggingFaceEmbedding + + embedding_models["huggingface_baai_bge_small"] = LazyInit( + HuggingFaceEmbedding, model_name="BAAI/bge-small-en-v1.5" + ) + embedding_models["huggingface_cointegrated_rubert_tiny2"] = LazyInit( + HuggingFaceEmbedding, model_name="cointegrated/rubert-tiny2" + ) + embedding_models["huggingface_all_mpnet_base_v2"] = LazyInit( + HuggingFaceEmbedding, + model_name="sentence-transformers/all-mpnet-base-v2", + max_length=512, + ) + embedding_models["huggingface_bge_m3"] = LazyInit( + HuggingFaceEmbedding, model_name="BAAI/bge-m3" + ) + embedding_models["huggingface_multilingual_e5_large"] = LazyInit( + HuggingFaceEmbedding, model_name="intfloat/multilingual-e5-large-instruct" + ) + embedding_models["huggingface_all_mpnet_base_v2"] = LazyInit( + HuggingFaceEmbedding, model_name="sentence-transformers/all-mpnet-base-v2" + ) # 230313 추가 - 김용연 + embedding_models["huggingface_KURE-v1"] = LazyInit( + HuggingFaceEmbedding, model_name="nlpai-lab/KURE-v1" + ) # 230313 추가 - 김용연 + embedding_models["huggingface_drangonku-v2-ko"] = LazyInit( + HuggingFaceEmbedding, model_name="dragonkue/snowflake-arctic-embed-l-v2.0-ko" + ) # 230313 추가 - 김용연 + +except ImportError: + logger.info( + "You are using API version of AutoRAG." 
+ "To use local version, run pip install 'AutoRAG[gpu]'" + ) + + +class EmbeddingModel: + @staticmethod + def load(config: Union[str, Dict, List[Dict]]): + if isinstance(config, str): + return EmbeddingModel.load_from_str(config) + elif isinstance(config, dict): + return EmbeddingModel.load_from_dict(config) + elif isinstance(config, list): + return EmbeddingModel.load_from_list(config) + else: + raise ValueError("Invalid type of config") + + @staticmethod + def load_from_str(name: str): + try: + return embedding_models[name] + except KeyError: + raise ValueError(f"Embedding model '{name}' is not supported") + + @staticmethod + def load_from_list(option: List[dict]): + if len(option) != 1: + raise ValueError("Only one embedding model is supported") + return EmbeddingModel.load_from_dict(option[0]) + + @staticmethod + def load_from_dict(option: dict): + def _check_keys(target: dict): + if "type" not in target or "model_name" not in target: + raise ValueError("Both 'type' and 'model_name' must be provided") + if target["type"] not in ["openai", "huggingface", "mock", "ollama"]: + raise ValueError( + f"Embedding model type '{target['type']}' is not supported" + ) + + def _get_huggingface_class(): + module = sys.modules.get("llama_index.embeddings.huggingface") + if not module: + logger.info( + "You are using API version of AutoRAG. " + "To use local version, run `pip install 'AutoRAG[gpu]'`." + ) + return None + return getattr(module, "HuggingFaceEmbedding", None) + + _check_keys(option) + + model_options = option + model_type = model_options.pop("type") + + embedding_map = { + "openai": OpenAIEmbedding, + "mock": MockEmbeddingRandom, + "huggingface": _get_huggingface_class(), + "ollama": OllamaEmbedding, + } + + embedding_class = embedding_map.get(model_type) + if not embedding_class: + raise ValueError(f"Embedding model type '{model_type}' is not supported") + + return LazyInit(embedding_class, **model_options) diff --git a/autorag-workspace/autorag/evaluation/__init__.py b/autorag-workspace/autorag/evaluation/__init__.py new file mode 100644 index 0000000..e25eb07 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/__init__.py @@ -0,0 +1,3 @@ +from .retrieval import evaluate_retrieval +from .generation import evaluate_generation +from .retrieval_contents import evaluate_retrieval_contents diff --git a/autorag-workspace/autorag/evaluation/generation.py b/autorag-workspace/autorag/evaluation/generation.py new file mode 100644 index 0000000..0e1f95f --- /dev/null +++ b/autorag-workspace/autorag/evaluation/generation.py @@ -0,0 +1,86 @@ +import functools +import warnings +from typing import List, Callable, Union, Dict + +import pandas as pd + +from autorag.evaluation.metric.generation import ( + bleu, + meteor, + rouge, + sem_score, + g_eval, + bert_score, + deepeval_faithfulness, +) +from autorag.evaluation.util import cast_metrics +from autorag.schema.metricinput import MetricInput + +GENERATION_METRIC_FUNC_DICT = { + func.__name__: func + for func in [ + bleu, + meteor, + rouge, + sem_score, + g_eval, + bert_score, + deepeval_faithfulness, + ] +} + + +def evaluate_generation( + metric_inputs: List[MetricInput], metrics: Union[List[str], List[Dict]] +): + def decorator_evaluate_generation(func: Callable): + @functools.wraps(func) + def wrapper(*args, **kwargs) -> pd.DataFrame: + generation_result = func(*args, **kwargs) + if type(generation_result) is tuple: + assert ( + type(generation_result[0]) is list + and type(generation_result[0][0]) is str + ), "Input func must return string 
list as generated answer at the first return value." + generated_str = generation_result[0] + elif type(generation_result) is list: + assert ( + type(generation_result[0]) is str + ), "Input func must return string list as generated answer at the first return value." + generated_str = generation_result + else: + raise ValueError( + "Input func must return string list as generated answer at the first return value." + ) + for metric_input, generated_text in zip(metric_inputs, generated_str): + metric_input.generated_texts = generated_text + + metric_scores = {} + metric_names, metric_params = cast_metrics(metrics) + + for metric_name, metric_param in zip(metric_names, metric_params): + if metric_name not in GENERATION_METRIC_FUNC_DICT: + warnings.warn( + f"metric {metric_name} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()}" + f"{metric_name} will be ignored." + ) + else: + metric_scores[metric_name] = GENERATION_METRIC_FUNC_DICT[ + metric_name + ]( + metric_inputs=metric_inputs, + **metric_param, + ) + + metric_result_df = pd.DataFrame(metric_scores) + execution_result_df = pd.DataFrame({"generated_texts": generated_str}) + if type(generation_result) is tuple: + execution_result_df["generated_tokens"] = generation_result[1] + execution_result_df["generated_log_probs"] = generation_result[2] + + result_df = pd.concat([execution_result_df, metric_result_df], axis=1) + return result_df + + return wrapper + + return decorator_evaluate_generation diff --git a/autorag-workspace/autorag/evaluation/metric/__init__.py b/autorag-workspace/autorag/evaluation/metric/__init__.py new file mode 100644 index 0000000..e9f99b7 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/__init__.py @@ -0,0 +1,22 @@ +from .generation import ( + bleu, + meteor, + rouge, + sem_score, + g_eval, + bert_score, + deepeval_faithfulness, +) +from .retrieval import ( + retrieval_f1, + retrieval_recall, + retrieval_precision, + retrieval_mrr, + retrieval_ndcg, + retrieval_map, +) +from .retrieval_contents import ( + retrieval_token_f1, + retrieval_token_precision, + retrieval_token_recall, +) diff --git a/autorag-workspace/autorag/evaluation/metric/deepeval_prompt.py b/autorag-workspace/autorag/evaluation/metric/deepeval_prompt.py new file mode 100644 index 0000000..e0a89de --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/deepeval_prompt.py @@ -0,0 +1,322 @@ +class FaithfulnessTemplate: + @staticmethod + def generate_claims(text, lang: str = "en"): + if lang == "en": + return f"""Based on the given text, please generate a comprehensive list of FACTUAL claims that can inferred from the provided text. + + Example: + Example Text: + "Einstein won the noble prize in 1968 for his discovery of the photoelectric effect." + + Example JSON: + {{ + "claims": [ + "Einstein won the noble prize for his discovery of the photoelectric effect.", + "Einstein won the noble prize in 1968." + ] + }} + ===== END OF EXAMPLE ====== + + ** + IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed. + Only include claims that are factual, and the claims you extract should include the full context it was presented in, NOT cherry picked facts. + You should NOT include any prior knowledge, and take the text at face value when extracting claims. + ** + + Text: + {text} + + JSON: + """ + elif lang == "ko": + return f"""주어진 텍스트에서 찾을 수 있는 사실적 정보들의 목록을 생성하세요. 
+ +예시: +예시 텍스트: +“아인슈타인은 1968년에 광전 효과 발견으로 노벨상을 수상했다.” + +예시 JSON: +{{ +“claims”: [ +“아인슈타인은 광전 효과 발견으로 노벨상을 수상했다.”, +“아인슈타인은 1968년에 노벨상을 수상했다.” +] +}} +===== 예시 끝 ====== + +** +중요: 오직 JSON 형식으로 “claims” 키가 문자열 목록으로 반환되도록 해야 합니다. 다른 단어나 설명은 필요하지 않습니다. +사실에 기반한 주장만 포함하며, 추출한 주장은 전체 맥락을 유지해야 하며, 부분적으로 선택된 사실을 포함하지 않아야 합니다. +사전 지식은 포함하지 말고, 텍스트에만 기초해 주장들을 추출해야 합니다. +** + +텍스트: +{text} + +JSON: +""" + elif lang == "ja": + return f"""与えられたテキストに基づいて、そこから推測できる事実に基づく主張のリストを生成してください。 + +例: +例のテキスト: +「アインシュタインは1968年に光電効果の発見でノーベル賞を受賞しました。」 + +例のJSON: +{{ + "claims": [ + "アインシュタインは光電効果の発見でノーベル賞を受賞しました。", + "アインシュタインは1968年にノーベル賞を受賞しました。" + ] +}} +===== 例の終わり ====== + +** +重要: 必ずJSON形式で"claims"キーが文字列のリストとして返されるようにしてください。説明や余計な言葉は不要です。 +事実に基づく主張のみを含め、抽出された主張は提示された文脈全体を含むものでなければなりません。一部の事実のみを抜粋することは避けてください。 +事前知識を使用せず、テキストに基づいて主張を抽出してください。 +** + +テキスト: +{text} + +JSON: +""" + else: + raise ValueError(f"Language {lang} is not supported.") + + @staticmethod + def generate_truths(text, lang: str = "en"): + if lang == "en": + return f"""Based on the given text, please generate a comprehensive list of FACTUAL, undisputed truths that can inferred from the provided text. + + Example: + Example Text: + "Einstein won the noble prize in 1968 for his discovery of the photoelectric effect." + + Example JSON: + {{ + "truths": [ + "Einstein won the noble prize for his discovery of the photoelectric effect.", + "Einstein won the noble prize in 1968." + ] + }} + ===== END OF EXAMPLE ====== + + ** + IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed. + Only include truths that are factual. + ** + + Text: + {text} + + JSON: + """ + elif lang == "ko": + return f"""주어진 텍스트에서 추출할 수 있는 사실적이고 논란이 없는 진실들의 목록을 생성하세요. + +예시: +예시 텍스트: +"아인슈타인은 1968년에 광전 효과 발견으로 노벨상을 수상했다." + +예시 JSON: +{{ + "truths": [ + "아인슈타인은 광전 효과 발견으로 노벨상을 수상했다.", + "아인슈타인은 1968년에 노벨상을 수상했다." + ] +}} +===== 예시 끝 ====== + +** +중요: 오직 JSON 형식으로 "truths" 키가 문자열 목록으로 반환되도록 해야 합니다. 다른 단어나 설명은 필요하지 않습니다. +사실에 기반한 진실만 포함해야 합니다. +** + +텍스트: +{text} + +JSON: +""" + elif lang == "ja": + return f"""与えられたテキストに基づいて、そこから推測できる事実で議論の余地のない真実のリストを生成してください。 + +例: +例のテキスト: +「アインシュタインは1968年に光電効果の発見でノーベル賞を受賞しました。」 + +例のJSON: +{{ + "truths": [ + "アインシュタインは光電効果の発見でノーベル賞を受賞しました。", + "アインシュタインは1968年にノーベル賞を受賞しました。" + ] +}} +===== 例の終わり ====== + +** +重要: 必ずJSON形式で"truths"キーが文字列のリストとして返されるようにしてください。説明や余計な言葉は不要です。 +事実に基づく真実のみを含めてください。 +** + +テキスト: +{text} + +JSON: +""" + else: + raise ValueError(f"Language {lang} is not supported.") + + @staticmethod + def generate_verdicts(claims, retrieval_context, lang: str = "en"): + if lang == "en": + return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'. + The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context. + Provide a 'reason' ONLY if the answer is 'no'. + The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context. + + ** + IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects. + Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist." 
+ Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a Germen chef."] + + Example: + {{ + "verdicts": [ + {{ + "verdict": "idk" + }}, + {{ + "verdict": "idk" + }}, + {{ + "verdict": "yes" + }}, + {{ + "verdict": "no", + "reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead." + }}, + {{ + "verdict": "no", + "reason": "The actual output claims Einstein is a Germen chef, which is not correct as the retrieval context states he was a German scientist instead." + }}, + ] + }} + ===== END OF EXAMPLE ====== + + The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims. + You DON'T have to provide a reason if the answer is 'yes' or 'idk'. + ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT. + Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction. + Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE. + ** + + Retrieval Contexts: + {retrieval_context} + + Claims: + {claims} + + JSON: + """ + elif lang == "ko": + return f"""주어진 주장에 대해, 각 주장이 주어진 문맥의 사실들과 모순되는지를 나타내는 JSON 객체 목록을 생성하세요. JSON은 두 개의 필드인 'verdict'와 'reason'으로 구성됩니다. +'verdict'는 'yes', 'no', 또는 'idk' 중 하나여야 하며, 주어진 주장이 문맥과 일치하는지를 나타냅니다. +'verdict'가 'no'인 경우에만 'reason'을 제공하세요. 'reason'에는 문맥에 따라 주장을 수정하는 내용이 포함되어야 합니다. + +** +중요: 오직 JSON 형식으로 'verdicts' 키가 JSON 객체 목록으로 반환되도록 해야 합니다. +예시 문맥: "아인슈타인은 광전 효과 발견으로 노벨상을 수상했다. 아인슈타인은 1968년에 노벨상을 수상했다. 아인슈타인은 독일 과학자이다." +예시 주장: ["버락 오바마는 백인 남성이다.", "취리히는 런던에 있는 도시이다.", "아인슈타인은 광전 효과 발견으로 노벨상을 수상했으며, 이는 그의 명성에 기여했을 것이다.", "아인슈타인은 1969년에 광전 효과 발견으로 노벨상을 수상했다.", "아인슈타인은 독일 요리사였다."] + +예시: +{{ + "verdicts": [ + {{ + "verdict": "idk" + }}, + {{ + "verdict": "idk" + }}, + {{ + "verdict": "yes" + }}, + {{ + "verdict": "no", + "reason": "실제 출력은 아인슈타인이 1969년에 노벨상을 수상했다고 주장하지만, 문맥에서는 1968년이라고 명시되어 있습니다." + }}, + {{ + "verdict": "no", + "reason": "실제 출력은 아인슈타인이 독일 요리사라고 주장하지만, 문맥에서는 그가 독일 과학자라고 명시되어 있습니다." + }}, + ] +}} +===== 예시 끝 ====== + +'verdicts' 리스트의 길이는 반드시 주장들의 길이와 같아야 합니다. +'yes' 또는 'idk'일 경우 'reason'을 제공할 필요가 없습니다. +검색된 문맥과 직접적으로 모순되는 경우에만 'no' 답변을 제공하세요. 절대로 선험적인 지식을 사용하지 마세요. +'~일 수 있다', '가능성이 있다'와 같은 모호한 표현은 모순으로 간주하지 마세요. +문맥에 대한 정보 부족으로 뒷받침되지 않거나 언급되지 않은 주장은 반드시 'idk'로 답변하세요, 그렇지 않으면 내가 죽습니다. 
+** + +주어진 문맥: +{retrieval_context} + +주장: +{claims} + +JSON: +""" + elif lang == "ja": + return f"""与えられた主張について、それぞれの主張が取得された文脈の事実と矛盾しているかどうかを示すJSONオブジェクトのリストを生成してください。JSONには2つのフィールド、'verdict'と'reason'があります。 +'verdict'フィールドは、主張が文脈に一致するかどうかを示すため、厳密に'yes', 'no', 'idk'のいずれかを使用します。 +'verdict'が'no'の場合にのみ、'reason'を提供してください。'reason'には、文脈に基づいて主張を修正する内容が含まれている必要があります。 + +** +重要: 必ずJSON形式で'verdicts'キーがJSONオブジェクトのリストとして返されるようにしてください。 +例の文脈:「アインシュタインは光電効果の発見でノーベル賞を受賞しました。アインシュタインは1968年にノーベル賞を受賞しました。アインシュタインはドイツの科学者です。」 +例の主張: ["バラク・オバマは白人男性です。", "チューリッヒはロンドンにある都市です。", "アインシュタインは光電効果の発見でノーベル賞を受賞し、これが彼の名声に貢献したかもしれません。", "アインシュタインは1969年に光電効果の発見でノーベル賞を受賞しました。", "アインシュタインはドイツのシェフでした。"] + +例のJSON: +{{ + "verdicts": [ + {{ + "verdict": "idk" + }}, + {{ + "verdict": "idk" + }}, + {{ + "verdict": "yes" + }}, + {{ + "verdict": "no", + "reason": "実際の出力は、アインシュタインが1969年にノーベル賞を受賞したと主張していますが、文脈では1968年と述べられています。" + }}, + {{ + "verdict": "no", + "reason": "実際の出力は、アインシュタインがドイツのシェフだと主張していますが、文脈では彼がドイツの科学者であると述べられています。" + }}, + ] +}} +===== 例の終わり ====== + +'verdicts'のリストの長さは、主張のリストの長さと必ず等しくなければなりません。 +'yes'または'idk'の場合、'reason'を提供する必要はありません。 +文脈と直接矛盾する場合にのみ、'no'を提供してください。決して事前知識を使用しないでください。 +「〜かもしれない」や「〜の可能性がある」といった曖昧な表現は矛盾とは見なされません。 +情報が不足している、または文脈で言及されていない主張には必ず'idk'で答えてください。さもないと私は死んでしまいます。 +** + +文脈: +{retrieval_context} + +主張: +{claims} + +JSON: +""" + else: + raise ValueError(f"Language {lang} is not supported.") diff --git a/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/coh_detailed.txt b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/coh_detailed.txt new file mode 100644 index 0000000..6aa2df4 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/coh_detailed.txt @@ -0,0 +1,32 @@ +You will be given one summary written for a news article. + +Your task is to rate the summary on one metric. + +Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed. + +Evaluation Criteria: + +Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby "the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information about a topic." + +Evaluation Steps: + +1. Read the news article carefully and identify the main topic and key points. +2. Read the summary and compare it to the news article. Check if the summary covers the main topic and key points of the news article, and if it presents them in a clear and logical order. +3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria. + + +Example: + + +Source Text: + +{{Document}} + +Summary: + +{{Summary}} + + +Evaluation Form (scores ONLY): + +- Coherence: diff --git a/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/con_detailed.txt b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/con_detailed.txt new file mode 100644 index 0000000..104153b --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/con_detailed.txt @@ -0,0 +1,33 @@ +You will be given a news article. You will then be given one summary written for this article. + +Your task is to rate the summary on one metric. + +Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed. 
+ + +Evaluation Criteria: + +Consistency (1-5) - the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts. + +Evaluation Steps: + +1. Read the news article carefully and identify the main facts and details it presents. +2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article. +3. Assign a score for consistency based on the Evaluation Criteria. + + +Example: + + +Source Text: + +{{Document}} + +Summary: + +{{Summary}} + + +Evaluation Form (scores ONLY): + +- Consistency: diff --git a/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/flu_detailed.txt b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/flu_detailed.txt new file mode 100644 index 0000000..8ed51a3 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/flu_detailed.txt @@ -0,0 +1,26 @@ +You will be given one summary written for a news article. + +Your task is to rate the summary on one metric. + +Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed. + + +Evaluation Criteria: + +Fluency (1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure. + +- 1: Poor. The summary has many errors that make it hard to understand or sound unnatural. +- 2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible. +- 3: Good. The summary has few or no errors and is easy to read and follow. + + +Example: + +Summary: + +{{Summary}} + + +Evaluation Form (scores ONLY): + +- Fluency (1-3): diff --git a/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/rel_detailed.txt b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/rel_detailed.txt new file mode 100644 index 0000000..b7b4330 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/g_eval_prompts/rel_detailed.txt @@ -0,0 +1,33 @@ +You will be given one summary written for a news article. + +Your task is to rate the summary on one metric. + +Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed. + +Evaluation Criteria: + +Relevance (1-5) - selection of important content from the source. The summary should include only important information from the source document. Annotators were instructed to penalize summaries which contained redundancies and excess information. + +Evaluation Steps: + +1. Read the summary and the source document carefully. +2. Compare the summary to the source document and identify the main points of the article. +3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains. +4. Assign a relevance score from 1 to 5. 
+ + +Example: + + +Source Text: + +{{Document}} + +Summary: + +{{Summary}} + + +Evaluation Form (scores ONLY): + +- Relevance: diff --git a/autorag-workspace/autorag/evaluation/metric/generation.py b/autorag-workspace/autorag/evaluation/metric/generation.py new file mode 100644 index 0000000..38bb0f6 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/generation.py @@ -0,0 +1,504 @@ +import asyncio +import itertools +import os +from typing import List, Optional + +import evaluate +import nltk +import pandas as pd +from llama_index.core.embeddings import BaseEmbedding +from llama_index.embeddings.openai import OpenAIEmbedding +from openai import AsyncOpenAI +from pydantic import BaseModel +from rouge_score import tokenizers +from rouge_score.rouge_scorer import RougeScorer +from sacrebleu.metrics.bleu import BLEU + +from autorag.embedding.base import embedding_models +from autorag.evaluation.metric.deepeval_prompt import FaithfulnessTemplate +from autorag.evaluation.metric.util import ( + autorag_metric_loop, + calculate_cosine_similarity, +) +from autorag.nodes.generator import OpenAILLM +from autorag.nodes.generator.base import BaseGenerator +from autorag.schema.metricinput import MetricInput +from autorag.support import get_support_modules +from autorag.utils.util import ( + get_event_loop, + process_batch, + openai_truncate_by_token, + convert_inputs_to_list, + pop_params, + empty_cuda_cache, +) + + +@convert_inputs_to_list +def huggingface_evaluate( + instance, key: str, metric_inputs: List[MetricInput], **kwargs +) -> List[float]: + """ + Compute huggingface evaluate metric. + + :param instance: The instance of huggingface evaluates metric. + :param key: The key to retrieve result score from huggingface evaluate result. + :param metric_inputs: A list of MetricInput schema + :param kwargs: The additional arguments for metric function. + :return: The list of scores. + """ + + def compute_score(gt: List[str], pred: str) -> float: + return max( + list( + map( + lambda x: instance.compute( + predictions=[pred], references=[x], **kwargs + )[key], + gt, + ) + ) + ) + + result = list( + map(lambda x: compute_score(x.generation_gt, x.generated_texts), metric_inputs) + ) + return result + + +def make_generator_instance(generator_module_type: str, llm: str, **kwargs): + llm_class = get_support_modules(generator_module_type) + init_params = pop_params(llm_class.__init__, kwargs) + return llm_class(project_dir="", llm=llm, **init_params) + + +@autorag_metric_loop(fields_to_check=["retrieval_gt_contents", "generated_texts"]) +def deepeval_faithfulness( + metric_inputs: List[MetricInput], + generator_module_type: str = "openai_llm", + lang: str = "en", + llm: str = "gpt-4o-2024-08-06", + batch: int = 16, + **kwargs, +) -> List[float]: + """ + Compute deepeval faithfulness metric. + Its default model is gpt-4o-2024-08-06. + Since it uses OpenAI model, please be aware of the expensive cost. + + :param metric_inputs: The list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param generator_module_type: Generator module type. + The default is "openai_llm". + You can use like "llama_index_llm" or "vllm". + :param lang: The prompt language that you want to use. + "en", "ko" and "ja" are supported. + Korean prompt is not officially supported by DeepEval, but it can be translated by AutoRAG developers. + Default is "en". + :param llm: The model name to use for generation. + Or llm if using llama_index_llm. + The default is "gpt-4o-2024-08-06". 
+ :param batch: The batch size for processing. + Default is 16. + :param kwargs: The extra parameters for initializing the llm instance. + :return: The metric scores. + """ + + class Truth(BaseModel): + truths: List[str] + + class Claim(BaseModel): + claims: List[str] + + class Verdict(BaseModel): + verdict: str + reason: Optional[str] + + class FaithfulnessVerdicts(BaseModel): + verdicts: List[Verdict] + + def calculate_score(verdicts: List[Verdict]) -> float: + number_of_verdicts = len(verdicts) + if number_of_verdicts == 0: + return 1 + + faithfulness_count = 0 + for verdict in verdicts: + if verdict.verdict.strip().lower() != "no": + faithfulness_count += 1 + + score = faithfulness_count / number_of_verdicts + return score + + retrieval_contexts = list(map(lambda x: x.retrieval_gt_contents, metric_inputs)) + truth_prompts = list( + map(lambda x: FaithfulnessTemplate.generate_truths(x, lang), retrieval_contexts) + ) + + generated_texts = list(map(lambda x: x.generated_texts, metric_inputs)) + claim_prompts = list( + map(lambda x: FaithfulnessTemplate.generate_claims(x, lang), generated_texts) + ) + + generator: BaseGenerator = make_generator_instance( + generator_module_type, llm=llm, batch=batch, **kwargs + ) + if isinstance(generator, OpenAILLM): # Because of the event loop error at the httpx + # TODO: Fix the httpx APIConnectionError at the many repetitive request to the OpenAILLM on the same instance + truth_responses: List[Truth] = generator.structured_output(truth_prompts, Truth) + claim_responses: List[Claim] = make_generator_instance( + generator_module_type, llm=llm, batch=batch, **kwargs + ).structured_output(claim_prompts, Claim) + verdict_prompts = list( + map( + lambda claim, truth: FaithfulnessTemplate.generate_verdicts( + "\n\n".join(claim.claims), "\n\n".join(truth.truths), lang + ), + claim_responses, + truth_responses, + ) + ) + verdict_responses: List[FaithfulnessVerdicts] = make_generator_instance( + generator_module_type, llm=llm, batch=batch, **kwargs + ).structured_output(verdict_prompts, FaithfulnessVerdicts) + else: + truth_responses: List[Truth] = generator.structured_output(truth_prompts, Truth) + claim_responses: List[Claim] = generator.structured_output(claim_prompts, Claim) + verdict_prompts = list( + map( + lambda claim, truth: FaithfulnessTemplate.generate_verdicts( + "\n\n".join(claim.claims), "\n\n".join(truth.truths), lang + ), + claim_responses, + truth_responses, + ) + ) + verdict_responses: List[FaithfulnessVerdicts] = generator.structured_output( + verdict_prompts, FaithfulnessVerdicts + ) + + result = list(map(lambda x: calculate_score(x.verdicts), verdict_responses)) + return result + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def bleu( + metric_inputs: List[MetricInput], + tokenize: Optional[str] = None, + smooth_method: str = "exp", + smooth_value: Optional[float] = None, + max_ngram_order: int = 4, + trg_lang: str = "", + effective_order: bool = True, + **kwargs, +) -> List[float]: + """ + Computes the BLEU metric given pred and ground-truth. + + :param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param tokenize: The tokenizer to use. If None, defaults to language-specific tokenizers with '13a' as the fallback default. check #https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/bleu.py + :param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none'). 
+ :param smooth_value: The smoothing value for `floor` and `add-k` methods. `None` falls back to default value. + :param max_ngram_order: If given, it overrides the maximum n-gram order (default: 4) when computing precisions. + :param trg_lang: An optional language code to raise potential tokenizer warnings. + :param effective_order: If `True`, stop including n-gram orders for which precision is 0. This should be + `True`, if sentence-level BLEU will be computed. + """ + bleu_instance = BLEU( + tokenize=tokenize, + smooth_method=smooth_method, + smooth_value=smooth_value, + max_ngram_order=max_ngram_order, + trg_lang=trg_lang, + effective_order=effective_order, + **kwargs, + ) + + result = list( + map( + lambda x: bleu_instance.sentence_score( + x.generated_texts, x.generation_gt + ).score, + metric_inputs, + ) + ) + return result + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def meteor( + metric_inputs: List[MetricInput], + alpha: float = 0.9, + beta: float = 3.0, + gamma: float = 0.5, +) -> List[float]: + """ + Compute meteor score for generation. + + :param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param alpha: Parameter for controlling relative weights of precision and recall. + Default is 0.9. + :param beta: Parameter for controlling shape of penalty as a + function of as a function of fragmentation. + Default is 3.0. + :param gamma: Relative weight assigned to fragmentation penalty. + Default is 0.5. + :return: A list of computed metric scores. + """ + nltk.download("punkt", quiet=True) + meteor_instance = evaluate.load("meteor") + result = huggingface_evaluate( + meteor_instance, + "meteor", + metric_inputs, + alpha=alpha, + beta=beta, + gamma=gamma, + ) + del meteor_instance + return result + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def rouge( + metric_inputs: List[MetricInput], + rouge_type: Optional[str] = "rougeL", + use_stemmer: bool = False, + split_summaries: bool = False, + batch: int = os.cpu_count(), +) -> List[float]: + """ + Compute rouge score for generation. + + :param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param rouge_type: A rouge type to use for evaluation. + Default is 'RougeL'. + Choose between rouge1, rouge2, rougeL, and rougeLSum. + - rouge1: unigram (1-gram) based scoring. + - rouge2: bigram (2-gram) based scoring. + - rougeL: Longest Common Subsequence based scoring. + - rougeLSum: splits text using "\n" + :param use_stemmer: Bool indicating whether Porter stemmer should be used to + strip word suffixes to improve matching. This arg is used in the + DefaultTokenizer, but other tokenizers might or might not choose to + use this. Default is False. + :param split_summaries: Whether to add newlines between sentences for rougeLsum. + Default is False. + :param batch: The batch size for processing. + Default is your cpu count. + :return: A list of computed metric scores. 
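+
+    A minimal call sketch (illustrative; assumes MetricInput can be built directly
+    with these keyword fields):
+
+        inputs = [
+            MetricInput(
+                generation_gt=["The cat sat on the mat."],
+                generated_texts="A cat sat on the mat.",
+            )
+        ]
+        scores = rouge(metric_inputs=inputs, rouge_type="rougeL")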
+ """ + rouge_instance = RougeScorer( + rouge_types=[rouge_type], + use_stemmer=use_stemmer, + split_summaries=split_summaries, + tokenizer=tokenizers.DefaultTokenizer(use_stemmer), + ) + + async def compute(gt: List[str], pred: str) -> float: + return rouge_instance.score_multi(targets=gt, prediction=pred)[ + rouge_type + ].fmeasure + + tasks = [ + compute(metric_input.generation_gt, metric_input.generated_texts) + for metric_input in metric_inputs + ] + loop = get_event_loop() + result = loop.run_until_complete(process_batch(tasks, batch_size=batch)) + + del rouge_instance + return result + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def sem_score( + metric_inputs: List[MetricInput], + embedding_model: Optional[BaseEmbedding] = None, + batch: int = 128, +) -> List[float]: + """ + Compute sem score between generation gt and pred with cosine similarity. + + :param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param embedding_model: Embedding model to use for compute cosine similarity. + Default is all-mpnet-base-v2 embedding model. + The paper used this embedding model. + :param batch: The batch size for processing. + Default is 128 + :return: A list of computed metric scores. + """ + generations = [metric_input.generated_texts for metric_input in metric_inputs] + generation_gt = [metric_input.generation_gt for metric_input in metric_inputs] + if embedding_model is None: + embedding_model = embedding_models.get("huggingface_all_mpnet_base_v2")() + + embedding_model.embed_batch_size = batch + + openai_embedding_max_length = 8000 + if isinstance(embedding_model, OpenAIEmbedding): + generations = openai_truncate_by_token( + generations, openai_embedding_max_length, embedding_model.model_name + ) + + embedded_pred: List[List[float]] = embedding_model.get_text_embedding_batch( + generations, show_progress=True + ) + gt_lengths = list(map(len, generation_gt)) + flatten_gt = list(itertools.chain.from_iterable(generation_gt)) + if isinstance(embedding_model, OpenAIEmbedding): + flatten_gt = openai_truncate_by_token( + flatten_gt, openai_embedding_max_length, embedding_model.model_name + ) + embedded_gt_flatten = embedding_model.get_text_embedding_batch( + flatten_gt, show_progress=True + ) + # re-group embedded_gt_flatten with gt_lengths + iterator = iter(embedded_gt_flatten) + embedded_gt: List[List[List[float]]] = [ + list(itertools.islice(iterator, length)) for length in gt_lengths + ] + + result = [] + for gt, pred in zip(embedded_gt, embedded_pred): + similarity_scores: List[float] = list( + map(lambda x: calculate_cosine_similarity(x, pred), gt) + ) + result.append(max(similarity_scores)) + + del embedding_model + empty_cuda_cache() + + return result + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def g_eval( + metric_inputs: List[MetricInput], + metrics: Optional[List[str]] = None, + model: str = "gpt-4-0125-preview", + batch_size: int = 8, +) -> List[float]: + """ + Calculate G-Eval score. + G-eval is a metric that uses high-performance LLM model to evaluate generation performance. + It evaluates the generation result by coherence, consistency, fluency, and relevance. + It uses only 'openai' model, and we recommend to use gpt-4 for evaluation accuracy. + + :param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts") + :param metrics: A list of metrics to use for evaluation. 
+ Default is all metrics, which is ['coherence', 'consistency', 'fluency', 'relevance']. + :param model: OpenAI model name. + Default is 'gpt-4-0125-preview'. + :param batch_size: The batch size for processing. + Default is 8. + :return: G-Eval score. + """ + generations = [metric_input.generated_texts for metric_input in metric_inputs] + generation_gt = [metric_input.generation_gt for metric_input in metric_inputs] + loop = get_event_loop() + tasks = [ + async_g_eval(gt, pred, metrics, model) + for gt, pred in zip(generation_gt, generations) + ] + result = loop.run_until_complete(process_batch(tasks, batch_size=batch_size)) + return result + + +async def async_g_eval( + generation_gt: List[str], + pred: str, + metrics: Optional[List[str]] = None, + model: str = "gpt-4-0125-preview", +) -> float: + available_metrics = ["coherence", "consistency", "fluency", "relevance"] + if metrics is None: + metrics = available_metrics + else: + assert len(metrics) > 0, "metrics must be a list of string" + metrics = [metric for metric in metrics if metric in available_metrics] + + current_path = os.path.dirname(os.path.realpath(__file__)) + prompt_path = os.path.join(current_path, "g_eval_prompts") + g_eval_prompts = { + "coherence": open(os.path.join(prompt_path, "coh_detailed.txt")).read(), + "consistency": open(os.path.join(prompt_path, "con_detailed.txt")).read(), + "fluency": open(os.path.join(prompt_path, "flu_detailed.txt")).read(), + "relevance": open(os.path.join(prompt_path, "rel_detailed.txt")).read(), + } + + client = AsyncOpenAI() + + async def g_eval_score(prompt: str, gen_gt: List[str], pred: str): + scores = [] + for gt in gen_gt: + input_prompt = prompt.replace("{{Document}}", gt).replace( + "{{Summary}}", pred + ) + response = await client.chat.completions.create( + model=model, + messages=[{"role": "system", "content": input_prompt}], + logprobs=True, + top_logprobs=5, + temperature=0, + max_tokens=2, + frequency_penalty=0, + presence_penalty=0, + stop=None, + n=20, + ) + if "(1-3):" in prompt: + scores.append(get_g_eval_score(response, max_score=3)) + else: + scores.append(get_g_eval_score(response)) + return max(scores) + + def get_g_eval_score(responses, max_score: int = 5) -> int: + target_tokens = {str(i): 0 for i in range(1, max_score + 1)} + for choice in responses.choices: + first_top_log_probs = choice.logprobs.content[0].top_logprobs + for i, top_log_prob in enumerate( + list(map(lambda x: x.token, first_top_log_probs)) + ): + if top_log_prob in target_tokens: + target_tokens[top_log_prob] += 5 - i + + return int(max(target_tokens, key=target_tokens.get)) + + g_eval_scores = await asyncio.gather( + *(g_eval_score(g_eval_prompts[x], generation_gt, pred) for x in metrics) + ) + return sum(g_eval_scores) / len(g_eval_scores) + + +@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"]) +def bert_score( + metric_inputs: List[MetricInput], + lang: str = "en", + batch: int = 128, + n_threads: int = os.cpu_count(), +) -> List[float]: + generations = [metric_input.generated_texts for metric_input in metric_inputs] + generation_gt = [metric_input.generation_gt for metric_input in metric_inputs] + evaluator = evaluate.load("bertscore") + + df = pd.DataFrame( + { + "reference": generation_gt, + "prediction": generations, + "lang": lang, + } + ) + + df = df.explode("reference", ignore_index=False) + df["bert_score"] = evaluator.compute( + predictions=df["prediction"].tolist(), + references=df["reference"].tolist(), + lang=lang, + nthreads=n_threads, + batch_size=batch, 
+ )["f1"] + + del evaluator + empty_cuda_cache() + + return df.groupby(level=0)["bert_score"].max().tolist() diff --git a/autorag-workspace/autorag/evaluation/metric/retrieval.py b/autorag-workspace/autorag/evaluation/metric/retrieval.py new file mode 100644 index 0000000..1327b99 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/retrieval.py @@ -0,0 +1,115 @@ +import itertools +import math + +from autorag.evaluation.metric.util import autorag_metric +from autorag.schema.metricinput import MetricInput + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_f1(metric_input: MetricInput): + """ + Compute f1 score for retrieval. + + :param metric_input: The MetricInput schema for AutoRAG metric. + :return: The f1 score. + """ + recall_score = retrieval_recall.__wrapped__(metric_input) + precision_score = retrieval_precision.__wrapped__(metric_input) + if recall_score + precision_score == 0: + return 0 + else: + return 2 * (recall_score * precision_score) / (recall_score + precision_score) + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_recall(metric_input: MetricInput) -> float: + gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids + + gt_sets = [frozenset(g) for g in gt] + pred_set = set(pred) + hits = sum(any(pred_id in gt_set for pred_id in pred_set) for gt_set in gt_sets) + recall = hits / len(gt) if len(gt) > 0 else 0.0 + return recall + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_precision(metric_input: MetricInput) -> float: + gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids + + gt_sets = [frozenset(g) for g in gt] + pred_set = set(pred) + hits = sum(any(pred_id in gt_set for gt_set in gt_sets) for pred_id in pred_set) + precision = hits / len(pred) if len(pred) > 0 else 0.0 + return precision + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_ndcg(metric_input: MetricInput) -> float: + gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids + + gt_sets = [frozenset(g) for g in gt] + pred_set = set(pred) + relevance_scores = { + pred_id: 1 if any(pred_id in gt_set for gt_set in gt_sets) else 0 + for pred_id in pred_set + } + + dcg = sum( + (2 ** relevance_scores[doc_id] - 1) / math.log2(i + 2) + for i, doc_id in enumerate(pred) + ) + + len_flatten_gt = len(list(itertools.chain.from_iterable(gt))) + len_pred = len(pred) + ideal_pred = [1] * min(len_flatten_gt, len_pred) + [0] * max( + 0, len_pred - len_flatten_gt + ) + idcg = sum(relevance / math.log2(i + 2) for i, relevance in enumerate(ideal_pred)) + + ndcg = dcg / idcg if idcg > 0 else 0 + return ndcg + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_mrr(metric_input: MetricInput) -> float: + """ + Reciprocal Rank (RR) is the reciprocal of the rank of the first relevant item. + Mean of RR in whole queries is MRR. 
+ """ + gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids + + # Flatten the ground truth list of lists into a single set of relevant documents + gt_sets = [frozenset(g) for g in gt] + + rr_list = [] + for gt_set in gt_sets: + for i, pred_id in enumerate(pred): + if pred_id in gt_set: + rr_list.append(1.0 / (i + 1)) + break + return sum(rr_list) / len(gt_sets) if rr_list else 0.0 + + +@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"]) +def retrieval_map(metric_input: MetricInput) -> float: + """ + Mean Average Precision (MAP) is the mean of Average Precision (AP) for all queries. + """ + gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids + + gt_sets = [frozenset(g) for g in gt] + + ap_list = [] + + for gt_set in gt_sets: + pred_hits = [1 if pred_id in gt_set else 0 for pred_id in pred] + precision_list = [ + sum(pred_hits[: i + 1]) / (i + 1) + for i, hit in enumerate(pred_hits) + if hit == 1 + ] + ap_list.append( + sum(precision_list) / len(precision_list) if precision_list else 0.0 + ) + + return sum(ap_list) / len(gt_sets) if ap_list else 0.0 diff --git a/autorag-workspace/autorag/evaluation/metric/retrieval_contents.py b/autorag-workspace/autorag/evaluation/metric/retrieval_contents.py new file mode 100644 index 0000000..ef9ec60 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/retrieval_contents.py @@ -0,0 +1,65 @@ +""" +This file contains the retrieval contents metric, +which means calculate the metric based on the contents of the retrieved items. +""" + +import itertools +from collections import Counter + +import numpy as np + +from autorag.evaluation.metric.util import autorag_metric +from autorag.schema.metricinput import MetricInput +from autorag.utils.util import normalize_string + + +def single_token_f1(ground_truth: str, prediction: str): + prediction_tokens = normalize_string(prediction).split() + ground_truth_tokens = normalize_string(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0, 0, 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return precision, recall, f1 + + +@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"]) +def retrieval_token_f1(metric_input: MetricInput): + pred = metric_input.retrieved_contents + gt = itertools.chain.from_iterable(metric_input.retrieval_gt_contents) + + calculated_results = list( + map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt))) + ) + _, _, result = zip(*calculated_results) + result_np = np.array(list(result)).reshape(len(pred), -1) + return result_np.max(axis=1).mean() + + +@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"]) +def retrieval_token_precision(metric_input: MetricInput): + pred = metric_input.retrieved_contents + gt = itertools.chain.from_iterable(metric_input.retrieval_gt_contents) + + calculated_results = list( + map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt))) + ) + result, _, _ = zip(*calculated_results) + result_np = np.array(list(result)).reshape(len(pred), -1) + return result_np.max(axis=1).mean() + + +@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"]) +def retrieval_token_recall(metric_input: MetricInput): + pred = metric_input.retrieved_contents + gt = 
itertools.chain.from_iterable(metric_input.retrieval_gt_contents) + + calculated_results = list( + map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt))) + ) + _, result, _ = zip(*calculated_results) + result_np = np.array(list(result)).reshape(len(pred), -1) + return result_np.max(axis=1).mean() diff --git a/autorag-workspace/autorag/evaluation/metric/util.py b/autorag-workspace/autorag/evaluation/metric/util.py new file mode 100644 index 0000000..0c53b65 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/metric/util.py @@ -0,0 +1,88 @@ +import functools +from typing import List + +import numpy as np + +from autorag.schema.metricinput import MetricInput +from autorag.utils.util import convert_inputs_to_list + + +def calculate_cosine_similarity(a, b): + dot_product = np.dot(a, b) + norm_a = np.linalg.norm(a) + norm_b = np.linalg.norm(b) + similarity = dot_product / (norm_a * norm_b) + return similarity + + +def calculate_l2_distance(a, b): + return np.linalg.norm(a - b) + + +def calculate_inner_product(a, b): + return np.dot(a, b) + + +def autorag_metric(fields_to_check: List[str]): + def decorator_autorag_metric(func): + @functools.wraps(func) + @convert_inputs_to_list + def wrapper(metric_inputs: List[MetricInput], **kwargs) -> List[float]: + """ + Use 'for loop' to run each metric input. + Put the single metric input into the metric function. + + :param metric_inputs: A list MetricInput schema for AutoRAG metric. + :param kwargs: The additional arguments for metric function. + :return: A list of computed metric scores. + """ + results = [] + for metric_input in metric_inputs: + if metric_input.is_fields_notnone(fields_to_check=fields_to_check): + results.append(func(metric_input, **kwargs)) + else: + results.append(None) + return results + + return wrapper + + return decorator_autorag_metric + + +def autorag_metric_loop(fields_to_check: List[str]): + def decorator_autorag_generation_metric(func): + @functools.wraps(func) + @convert_inputs_to_list + def wrapper(metric_inputs: List[MetricInput], **kwargs) -> List[float]: + """ + Put the list of metric inputs into the metric function. + + :param metric_inputs: A list MetricInput schema for AutoRAG metric. + :param kwargs: The additional arguments for metric function. + :return: A list of computed metric scores. 
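+
+            For example, if the second of three MetricInput items is missing one of the
+            required fields, the wrapped metric is only computed for the first and third
+            items and the returned list looks like [score_0, None, score_2].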
+ """ + bool_list = [ + metric_input.is_fields_notnone(fields_to_check=fields_to_check) + for metric_input in metric_inputs + ] + valid_inputs = [ + metric_input + for metric_input, is_valid in zip(metric_inputs, bool_list) + if is_valid + ] + + results = [None] * len(metric_inputs) + if valid_inputs: + processed_valid = func(valid_inputs, **kwargs) + + valid_index = 0 + for i, is_valid in enumerate(bool_list): + if is_valid: + results[i] = processed_valid[valid_index] + valid_index += 1 + + return results + + return wrapper + + return decorator_autorag_generation_metric diff --git a/autorag-workspace/autorag/evaluation/retrieval.py b/autorag-workspace/autorag/evaluation/retrieval.py new file mode 100644 index 0000000..417e6e8 --- /dev/null +++ b/autorag-workspace/autorag/evaluation/retrieval.py @@ -0,0 +1,83 @@ +import functools +import warnings +from typing import List, Callable, Any, Tuple, Union, Dict + +import pandas as pd + +from autorag.evaluation.metric import ( + retrieval_recall, + retrieval_precision, + retrieval_f1, + retrieval_ndcg, + retrieval_mrr, + retrieval_map, +) +from autorag.evaluation.util import cast_metrics +from autorag.schema.metricinput import MetricInput + +RETRIEVAL_METRIC_FUNC_DICT = { + func.__name__: func + for func in [ + retrieval_recall, + retrieval_precision, + retrieval_f1, + retrieval_ndcg, + retrieval_mrr, + retrieval_map, + ] +} + + +def evaluate_retrieval( + metric_inputs: List[MetricInput], + metrics: Union[List[str], List[Dict]], +): + def decorator_evaluate_retrieval( + func: Callable[ + [Any], Tuple[List[List[str]], List[List[str]], List[List[float]]] + ], + ): + """ + Decorator for evaluating retrieval results. + You can use this decorator to any method that returns (contents, scores, ids), + which is the output of conventional retrieval modules. + + :param func: Must return (contents, scores, ids) + :return: wrapper function that returns pd.DataFrame, which is the evaluation result. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs) -> pd.DataFrame: + contents, pred_ids, scores = func(*args, **kwargs) + for metric_input, pred_id in zip(metric_inputs, pred_ids): + metric_input.retrieved_ids = pred_id + + metric_scores = {} + metric_names, metric_params = cast_metrics(metrics) + + for metric_name, metric_param in zip(metric_names, metric_params): + if metric_name in RETRIEVAL_METRIC_FUNC_DICT: + metric_func = RETRIEVAL_METRIC_FUNC_DICT[metric_name] + metric_scores[metric_name] = metric_func( + metric_inputs=metric_inputs, **metric_param + ) + else: + warnings.warn( + f"metric {metric_name} is not in supported metrics: {RETRIEVAL_METRIC_FUNC_DICT.keys()}" + f"{metric_name} will be ignored." 
+ ) + + metric_result_df = pd.DataFrame(metric_scores) + execution_result_df = pd.DataFrame( + { + "retrieved_contents": contents, + "retrieved_ids": pred_ids, + "retrieve_scores": scores, + } + ) + result_df = pd.concat([execution_result_df, metric_result_df], axis=1) + return result_df + + return wrapper + + return decorator_evaluate_retrieval diff --git a/autorag-workspace/autorag/evaluation/retrieval_contents.py b/autorag-workspace/autorag/evaluation/retrieval_contents.py new file mode 100644 index 0000000..5afae2a --- /dev/null +++ b/autorag-workspace/autorag/evaluation/retrieval_contents.py @@ -0,0 +1,65 @@ +import functools +from typing import List, Callable, Any, Tuple + +import pandas as pd + +from autorag.evaluation.metric import ( + retrieval_token_f1, + retrieval_token_precision, + retrieval_token_recall, +) +from autorag.schema.metricinput import MetricInput + + +def evaluate_retrieval_contents(metric_inputs: List[MetricInput], metrics: List[str]): + def decorator_evaluate_retrieval_contents( + func: Callable[ + [Any], Tuple[List[List[str]], List[List[str]], List[List[float]]] + ], + ): + """ + Decorator for evaluating retrieval contents. + You can use this decorator to any method that returns (contents, scores, ids), + which is the output of conventional retrieval modules. + + :param func: Must return (contents, scores, ids) + :return: pd.DataFrame, which is the evaluation result and function result. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs) -> pd.DataFrame: + contents, pred_ids, scores = func(*args, **kwargs) + metric_funcs = { + retrieval_token_recall.__name__: retrieval_token_recall, + retrieval_token_precision.__name__: retrieval_token_precision, + retrieval_token_f1.__name__: retrieval_token_f1, + } + for metric_input, content in zip(metric_inputs, contents): + metric_input.retrieved_contents = content + + metrics_scores = {} + for metric in metrics: + if metric not in metric_funcs: + raise ValueError( + f"metric {metric} is not in supported metrics: {metric_funcs.keys()}" + ) + else: + metric_func = metric_funcs[metric] + # Extract each required field from all payloads + metric_scores = metric_func(metric_inputs=metric_inputs) + metrics_scores[metric] = metric_scores + + metric_result_df = pd.DataFrame(metrics_scores) + execution_result_df = pd.DataFrame( + { + "retrieved_contents": contents, + "retrieved_ids": pred_ids, + "retrieve_scores": scores, + } + ) + result_df = pd.concat([execution_result_df, metric_result_df], axis=1) + return result_df + + return wrapper + + return decorator_evaluate_retrieval_contents diff --git a/autorag-workspace/autorag/evaluation/util.py b/autorag-workspace/autorag/evaluation/util.py new file mode 100644 index 0000000..feecb9d --- /dev/null +++ b/autorag-workspace/autorag/evaluation/util.py @@ -0,0 +1,43 @@ +from copy import deepcopy +from typing import Union, List, Dict, Tuple, Any + +from autorag.embedding.base import EmbeddingModel + + +def cast_metrics( + metrics: Union[List[str], List[Dict]], +) -> Tuple[List[str], List[Dict[str, Any]]]: + """ + Turn metrics to list of metric names and parameter list. + + :param metrics: List of string or dictionary. + :return: The list of metric names and dictionary list of metric parameters. 
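+    Example (illustrative):
+        cast_metrics(["retrieval_f1", "retrieval_recall"])
+        # -> (["retrieval_f1", "retrieval_recall"], [{}, {}])
+        cast_metrics([{"metric_name": "retrieval_f1"}, {"metric_name": "retrieval_recall"}])
+        # -> (["retrieval_f1", "retrieval_recall"], [{}, {}])
+    An "embedding_model" value, if present, is replaced with a loaded embedding model instance.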
+ """ + metrics_copy = deepcopy(metrics) + if not isinstance(metrics_copy, list): + raise ValueError("metrics must be a list of string or dictionary.") + if isinstance(metrics_copy[0], str): + return metrics_copy, [{} for _ in metrics_copy] + elif isinstance(metrics_copy[0], dict): + # pop 'metric_name' key from dictionary + metric_names = list(map(lambda x: x.pop("metric_name"), metrics_copy)) + metric_params = [ + dict( + map( + lambda x, y: cast_embedding_model(x, y), + metric.keys(), + metric.values(), + ) + ) + for metric in metrics_copy + ] + return metric_names, metric_params + else: + raise ValueError("metrics must be a list of string or dictionary.") + + +def cast_embedding_model(key, value): + if key == "embedding_model": + return key, EmbeddingModel.load(value)() + else: + return key, value diff --git a/autorag-workspace/autorag/evaluator.py b/autorag-workspace/autorag/evaluator.py new file mode 100644 index 0000000..027c0a6 --- /dev/null +++ b/autorag-workspace/autorag/evaluator.py @@ -0,0 +1,560 @@ +import glob +import json +import logging +import os +import shutil +from datetime import datetime +from itertools import chain +from typing import List, Dict, Optional +from rich.progress import Progress, BarColumn, TimeElapsedColumn + +import pandas as pd +import yaml + +from autorag.node_line import run_node_line +from autorag.nodes.retrieval.base import get_bm25_pkl_name +from autorag.nodes.retrieval.bm25 import bm25_ingest +from autorag.nodes.retrieval.vectordb import ( + vectordb_ingest, + filter_exist_ids, + filter_exist_ids_from_retrieval_gt, +) +from autorag.schema import Node +from autorag.schema.node import ( + module_type_exists, + extract_values_from_nodes, + extract_values_from_nodes_strategy, +) +from autorag.utils import ( + cast_qa_dataset, + cast_corpus_dataset, + validate_qa_from_corpus_dataset, +) +from autorag.utils.util import ( + load_summary_file, + explode, + load_yaml_config, + get_event_loop, +) +from autorag.vectordb import load_all_vectordb_from_yaml + +logger = logging.getLogger("AutoRAG") + +ascii_art = """ + _ _____ _____ + /\ | | | __ \ /\ / ____| + / \ _ _| |_ ___ | |__) | / \ | | __ + / /\ \| | | | __/ _ \| _ / / /\ \| | |_ | + / ____ \ |_| | || (_) | | \ \ / ____ \ |__| | + /_/ \_\__,_|\__\___/|_| \_\/_/ \_\_____| + +""" + + +class Evaluator: + def __init__( + self, + qa_data_path: str, + corpus_data_path: str, + project_dir: Optional[str] = None, + ): + """ + Initialize an Evaluator object. + + :param qa_data_path: The path to the QA dataset. + Must be parquet file. + :param corpus_data_path: The path to the corpus dataset. + Must be parquet file. + :param project_dir: The path to the project directory. + Default is the current directory. + """ + # validate data paths + if not os.path.exists(qa_data_path): + raise ValueError(f"QA data path {qa_data_path} does not exist.") + if not os.path.exists(corpus_data_path): + raise ValueError(f"Corpus data path {corpus_data_path} does not exist.") + if not qa_data_path.endswith(".parquet"): + raise ValueError(f"QA data path {qa_data_path} is not a parquet file.") + if not corpus_data_path.endswith(".parquet"): + raise ValueError( + f"Corpus data path {corpus_data_path} is not a parquet file." 
+ ) + self.qa_data_path = qa_data_path + self.corpus_data_path = corpus_data_path + self.qa_data = pd.read_parquet(qa_data_path, engine="pyarrow") + self.corpus_data = pd.read_parquet(corpus_data_path, engine="pyarrow") + self.qa_data = cast_qa_dataset(self.qa_data) + self.corpus_data = cast_corpus_dataset(self.corpus_data) + self.project_dir = project_dir if project_dir is not None else os.getcwd() + if not os.path.exists(self.project_dir): + os.makedirs(self.project_dir) + + validate_qa_from_corpus_dataset(self.qa_data, self.corpus_data) + + # copy dataset to the project directory + if not os.path.exists(os.path.join(self.project_dir, "data")): + os.makedirs(os.path.join(self.project_dir, "data")) + qa_path_in_project = os.path.join(self.project_dir, "data", "qa.parquet") + if not os.path.exists(qa_path_in_project): + self.qa_data.to_parquet(qa_path_in_project, index=False) + corpus_path_in_project = os.path.join( + self.project_dir, "data", "corpus.parquet" + ) + if not os.path.exists(corpus_path_in_project): + self.corpus_data.to_parquet(corpus_path_in_project, index=False) + + def start_trial( + self, yaml_path: str, skip_validation: bool = False, full_ingest: bool = True + ): + """ + Start AutoRAG trial. + The trial means one experiment to optimize the RAG pipeline. + It consists of ingesting corpus data, running all nodes and modules, evaluating and finding the optimal modules. + + :param yaml_path: The config YAML path + :param skip_validation: If True, it skips the validation step. + The validation step checks the input config YAML file is well formatted, + and there is any problem with the system settings. + Default is False. + :param full_ingest: If True, it checks the whole corpus data from corpus.parquet that exists in the Vector DB. + If your corpus is huge and don't want to check the whole vector DB, please set it to False. + :return: None + """ + # Make Resources directory + os.makedirs(os.path.join(self.project_dir, "resources"), exist_ok=True) + + if not skip_validation: + logger.info(ascii_art) + logger.info( + "Start Validation input data and config YAML file first. " + "If you want to skip this, put the --skip_validation flag or " + "`skip_validation` at the start_trial function." 
+ ) + from autorag.validator import Validator # resolve circular import + + validator = Validator( + qa_data_path=self.qa_data_path, corpus_data_path=self.corpus_data_path + ) + validator.validate(yaml_path) + + os.environ["PROJECT_DIR"] = self.project_dir + + trial_name = self.__get_new_trial_name() + self.__make_trial_dir(trial_name) + + # copy YAML file to the trial directory + shutil.copy( + yaml_path, os.path.join(self.project_dir, trial_name, "config.yaml") + ) + yaml_dict = load_yaml_config(yaml_path) + vectordb = yaml_dict.get("vectordb", []) + + vectordb_config_path = os.path.join( + self.project_dir, "resources", "vectordb.yaml" + ) + with open(vectordb_config_path, "w") as f: + yaml.safe_dump({"vectordb": vectordb}, f) + + node_lines = self._load_node_lines(yaml_path) + self.__ingest_bm25_full(node_lines) + + with Progress( + "[progress.description]{task.description}", + BarColumn(), + "[progress.percentage]{task.percentage:>3.0f}%", + "[progress.bar]{task.completed}/{task.total}", + TimeElapsedColumn(), + ) as progress: + # Ingest VectorDB corpus + if any( + list( + map( + lambda nodes: module_type_exists(nodes, "vectordb"), + node_lines.values(), + ) + ) + ): + task_ingest = progress.add_task("[cyan]Ingesting VectorDB...", total=1) + + loop = get_event_loop() + loop.run_until_complete(self.__ingest_vectordb(yaml_path, full_ingest)) + + progress.update(task_ingest, completed=1) + + trial_summary_df = pd.DataFrame( + columns=[ + "node_line_name", + "node_type", + "best_module_filename", + "best_module_name", + "best_module_params", + "best_execution_time", + ] + ) + task_eval = progress.add_task( + "[cyan]Evaluating...", total=sum(map(len, node_lines.values())) + ) + + for i, (node_line_name, node_line) in enumerate(node_lines.items()): + node_line_dir = os.path.join( + self.project_dir, trial_name, node_line_name + ) + os.makedirs(node_line_dir, exist_ok=False) + if i == 0: + previous_result = self.qa_data + logger.info(f"Running node line {node_line_name}...") + previous_result = run_node_line( + node_line, node_line_dir, previous_result, progress, task_eval + ) + + trial_summary_df = self._append_node_line_summary( + node_line_name, node_line_dir, trial_summary_df + ) + + trial_summary_df.to_csv( + os.path.join(self.project_dir, trial_name, "summary.csv"), index=False + ) + + logger.info("Evaluation complete.") + + def __ingest_bm25_full(self, node_lines: Dict[str, List[Node]]): + if any( + list( + map( + lambda nodes: module_type_exists(nodes, "bm25"), node_lines.values() + ) + ) + ): + logger.info("Embedding BM25 corpus...") + bm25_tokenizer_list = list( + chain.from_iterable( + map( + lambda nodes: self._find_bm25_tokenizer(nodes), + node_lines.values(), + ) + ) + ) + + if len(bm25_tokenizer_list) == 0: + bm25_tokenizer_list = ["porter_stemmer"] + for bm25_tokenizer in bm25_tokenizer_list: + bm25_dir = os.path.join( + self.project_dir, "resources", get_bm25_pkl_name(bm25_tokenizer) + ) + if not os.path.exists(os.path.dirname(bm25_dir)): + os.makedirs(os.path.dirname(bm25_dir)) + # ingest because bm25 supports update new corpus data + bm25_ingest(bm25_dir, self.corpus_data, bm25_tokenizer=bm25_tokenizer) + logger.info("BM25 corpus embedding complete.") + + def __get_new_trial_name(self) -> str: + trial_json_path = os.path.join(self.project_dir, "trial.json") + if not os.path.exists(trial_json_path): + return "0" + with open(trial_json_path, "r") as f: + trial_json = json.load(f) + return str(int(trial_json[-1]["trial_name"]) + 1) + + def __make_trial_dir(self, trial_name: 
str): + trial_json_path = os.path.join(self.project_dir, "trial.json") + if os.path.exists(trial_json_path): + with open(trial_json_path, "r") as f: + trial_json = json.load(f) + else: + trial_json = [] + + trial_json.append( + { + "trial_name": trial_name, + "start_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + ) + os.makedirs(os.path.join(self.project_dir, trial_name)) + with open(trial_json_path, "w") as f: + json.dump(trial_json, f, indent=4) + + @staticmethod + def _load_node_lines(yaml_path: str) -> Dict[str, List[Node]]: + yaml_dict = load_yaml_config(yaml_path) + node_lines = yaml_dict["node_lines"] + node_line_dict = {} + for node_line in node_lines: + node_line_dict[node_line["node_line_name"]] = list( + map(lambda node: Node.from_dict(node), node_line["nodes"]) + ) + return node_line_dict + + def restart_trial(self, trial_path: str): + logger.info(ascii_art) + os.environ["PROJECT_DIR"] = self.project_dir + # Check if trial_path exists + if not os.path.exists(trial_path): + raise ValueError(f"Trial path {trial_path} does not exist.") + # Check if trial is completed + if os.path.exists(os.path.join(trial_path, "summary.csv")): + raise ValueError(f"Trial path {trial_path} is already completed.") + + # Extract node lines from config.yaml + yaml_path = os.path.join(trial_path, "config.yaml") + node_lines = self._load_node_lines(yaml_path) + + node_line_names = list(node_lines.keys()) + nodes = list(node_lines.values()) + node_names = list( + map(lambda node: list(map(lambda n: n.node_type, node)), nodes) + ) + + # If the First Node Line folder hasn't even been created, proceed to start_trial + if not os.path.exists(os.path.join(trial_path, node_line_names[0])): + self.start_trial(yaml_path) + return None + + # Find conflict node line and node + conflict_line_name, conflict_node_name = self.__find_conflict_point( + trial_path, node_line_names, node_lines + ) + node_dir = os.path.join(trial_path, conflict_line_name, conflict_node_name) + if os.path.exists(node_dir): + shutil.rmtree(node_dir) + + # Set remain_nodes and remain_lines + remain_nodes, completed_node_names, remain_lines, remain_line_names = ( + self._set_remain_nodes_and_lines( + node_line_names, + nodes, + node_names, + conflict_node_name, + conflict_line_name, + ) + ) + # Set previous_result + previous_result = self.__set_previous_result( + node_line_names, node_names, trial_path, conflict_node_name + ) + + # Run Node + if remain_nodes: + conflict_line_dir = os.path.join(trial_path, conflict_line_name) + summary_lst = [] + # Get already run node summary and append to summary_lst + for completed_node_name in completed_node_names: + summary_lst = self._append_node_summary( + conflict_line_dir, completed_node_name, summary_lst + ) + for node in remain_nodes: + previous_result = node.run(previous_result, conflict_line_dir) + summary_lst = self._append_node_summary( + conflict_line_dir, node.node_type, summary_lst + ) + pd.DataFrame(summary_lst).to_csv( + os.path.join(conflict_line_dir, "summary.csv"), index=False + ) + + # Run node line + trial_summary_df = pd.DataFrame( + columns=[ + "node_line_name", + "node_type", + "best_module_filename", + "best_module_name", + "best_module_params", + "best_execution_time", + ] + ) + completed_line_names = node_line_names[ + : node_line_names.index(conflict_line_name) + ] + # Get already run node line's summary and append to trial_summary_df + if completed_line_names: + for line_name in completed_line_names: + node_line_dir = os.path.join(trial_path, line_name) + 
trial_summary_df = self._append_node_line_summary( + line_name, node_line_dir, trial_summary_df + ) + if remain_lines: + for node_line_name, node_line in zip(remain_line_names, remain_lines): + node_line_dir = os.path.join(trial_path, node_line_name) + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + logger.info(f"Running node line {node_line_name}...") + previous_result = run_node_line( + node_line, node_line_dir, previous_result + ) + trial_summary_df = self._append_node_line_summary( + node_line_name, node_line_dir, trial_summary_df + ) + trial_summary_df.to_csv(os.path.join(trial_path, "summary.csv"), index=False) + + logger.info("Evaluation complete.") + + def __find_conflict_point( + self, + trial_path: str, + node_line_names: List[str], + node_lines: Dict[str, List[Node]], + ) -> tuple[str, str]: + for node_line_name in node_line_names: + node_line_dir = os.path.join(trial_path, node_line_name) + if not os.path.exists(node_line_dir): + return node_line_name, node_lines[node_line_name][0].node_type + + if not os.path.exists(os.path.join(node_line_dir, "summary.csv")): + conflict_node_name = self._find_conflict_node_name( + node_line_dir, node_lines[node_line_name] + ) + return node_line_name, conflict_node_name + + raise ValueError(f"No error node line found in {trial_path}.") + + @staticmethod + def _find_conflict_node_name(node_line_dir: str, node_line: List[Node]) -> str: + for node in node_line: + node_dir = os.path.join(node_line_dir, node.node_type) + if not os.path.exists(node_dir) or not os.path.exists( + os.path.join(node_dir, "summary.csv") + ): + return node.node_type + raise TypeError("No conflict node name found.") + + def __set_previous_result( + self, + node_line_names: List[str], + node_names: List[List[str]], + trial_path: str, + conflict_node_name: str, + ): + exploded_node_line, exploded_node = explode(node_line_names, node_names) + conflict_node_index = exploded_node.index(conflict_node_name) + # Set previous_result + if conflict_node_index == 0: + previous_result = self.qa_data + else: + previous_node_line = exploded_node_line[conflict_node_index - 1] + previous_node = exploded_node[conflict_node_index - 1] + + previous_node_dir = os.path.join( + trial_path, previous_node_line, previous_node + ) + best_file_pattern = f"{previous_node_dir}/best_*.parquet" + previous_result = pd.read_parquet( + glob.glob(best_file_pattern)[0], engine="pyarrow" + ) + return previous_result + + @staticmethod + def _set_remain_nodes_and_lines( + node_line_names: List[str], + nodes: List[List[Node]], + node_names: List[List[str]], + conflict_node_name: str, + conflict_node_line_name: str, + ): + conflict_node_line_index = node_line_names.index(conflict_node_line_name) + full_conflict_node_line_nodes = nodes[conflict_node_line_index] + full_conflict_node_line_node_names = node_names[conflict_node_line_index] + + if conflict_node_name == full_conflict_node_line_node_names[0]: + remain_nodes = None + completed_node_names = None + remain_node_lines = nodes[conflict_node_line_index:] + remain_node_line_names = node_line_names[conflict_node_line_index:] + else: + conflict_node_index = full_conflict_node_line_node_names.index( + conflict_node_name + ) + remain_nodes = full_conflict_node_line_nodes[conflict_node_index:] + completed_node_names = full_conflict_node_line_node_names[ + :conflict_node_index + ] + if conflict_node_line_index + 1 >= len(node_line_names): + remain_node_lines = None + remain_node_line_names = None + else: + remain_node_lines = 
nodes[conflict_node_line_index + 1 :] + remain_node_line_names = node_line_names[conflict_node_line_index + 1 :] + return ( + remain_nodes, + completed_node_names, + remain_node_lines, + remain_node_line_names, + ) + + @staticmethod + def _append_node_line_summary( + node_line_name: str, node_line_dir: str, trial_summary_df: pd.DataFrame + ): + summary_df = load_summary_file( + os.path.join(node_line_dir, "summary.csv"), + dict_columns=["best_module_params"], + ) + summary_df = summary_df.assign(node_line_name=node_line_name) + summary_df = summary_df[list(trial_summary_df.columns)] + if len(trial_summary_df) <= 0: + trial_summary_df = summary_df + else: + trial_summary_df = pd.concat( + [trial_summary_df, summary_df], ignore_index=True + ) + return trial_summary_df + + @staticmethod + def _append_node_summary( + node_line_dir: str, node_name: str, summary_lst: List[Dict] + ): + node_summary_df = load_summary_file( + os.path.join(node_line_dir, node_name, "summary.csv") + ) + best_node_row = node_summary_df.loc[node_summary_df["is_best"]] + summary_lst.append( + { + "node_type": node_name, + "best_module_filename": best_node_row["filename"].values[0], + "best_module_name": best_node_row["module_name"].values[0], + "best_module_params": best_node_row["module_params"].values[0], + "best_execution_time": best_node_row["execution_time"].values[0], + } + ) + return summary_lst + + @staticmethod + def _find_bm25_tokenizer(nodes: List[Node]): + bm25_tokenizer_list = extract_values_from_nodes(nodes, "bm25_tokenizer") + strategy_tokenizer_list = list( + chain.from_iterable( + extract_values_from_nodes_strategy(nodes, "bm25_tokenizer") + ) + ) + return list(set(bm25_tokenizer_list + strategy_tokenizer_list)) + + @staticmethod + def _find_embedding_model(nodes: List[Node]): + embedding_models_list = extract_values_from_nodes(nodes, "embedding_model") + retrieval_module_dicts = extract_values_from_nodes_strategy( + nodes, "retrieval_modules" + ) + for retrieval_modules in retrieval_module_dicts: + vectordb_modules = list( + filter(lambda x: x["module_type"] == "vectordb", retrieval_modules) + ) + embedding_models_list.extend( + list(map(lambda x: x.get("embedding_model", None), vectordb_modules)) + ) + embedding_models_list = list( + filter(lambda x: x is not None, embedding_models_list) + ) + return list(set(embedding_models_list)) + + async def __ingest_vectordb(self, yaml_path, full_ingest: bool): + vectordb_list = load_all_vectordb_from_yaml(yaml_path, self.project_dir) + if full_ingest is True: + # get the target ingest corpus from the whole corpus + for vectordb in vectordb_list: + target_corpus = await filter_exist_ids(vectordb, self.corpus_data) + await vectordb_ingest(vectordb, target_corpus) + else: + # get the target ingest corpus from the retrieval gt only + for vectordb in vectordb_list: + target_corpus = await filter_exist_ids_from_retrieval_gt( + vectordb, self.qa_data, self.corpus_data + ) + await vectordb_ingest(vectordb, target_corpus) diff --git a/autorag-workspace/autorag/generator_models.py b/autorag-workspace/autorag/generator_models.py new file mode 100644 index 0000000..d8b80d4 --- /dev/null +++ b/autorag-workspace/autorag/generator_models.py @@ -0,0 +1,4 @@ +import autorag +from llama_index.llms.ollama import Ollama + +autorag.generator_models["ollama"] = Ollama \ No newline at end of file diff --git a/autorag-workspace/autorag/node_line.py b/autorag-workspace/autorag/node_line.py new file mode 100644 index 0000000..a829046 --- /dev/null +++ 
b/autorag-workspace/autorag/node_line.py @@ -0,0 +1,73 @@ +import os +import pathlib +from typing import Dict, List, Optional +from rich.progress import Progress + +import pandas as pd + +from autorag.schema import Node +from autorag.utils.util import load_summary_file + + +def make_node_lines(node_line_dict: Dict) -> List[Node]: + """ + This method makes a list of nodes from node line dictionary. + :param node_line_dict: Node_line_dict loaded from yaml file, or get from user input. + :return: List of Nodes inside this node line. + """ + nodes = node_line_dict.get("nodes") + if nodes is None: + raise ValueError("Node line must have 'nodes' key.") + node_objects = list(map(lambda x: Node.from_dict(x), nodes)) + return node_objects + + +def run_node_line( + nodes: List[Node], + node_line_dir: str, + previous_result: Optional[pd.DataFrame] = None, + progress: Progress = None, + task_eval: Progress.tasks = None, +): + """ + Run the whole node line by running each node. + + :param nodes: A list of nodes. + :param node_line_dir: This node line's directory. + :param previous_result: A result of the previous node line. + If None, it loads qa data from data/qa.parquet. + :param progress: Rich Progress object. + :param task_eval: Progress task object + :return: The final result of the node line. + """ + if previous_result is None: + project_dir = pathlib.PurePath(node_line_dir).parent.parent + qa_path = os.path.join(project_dir, "data", "qa.parquet") + if not os.path.exists(qa_path): + raise ValueError(f"qa.parquet does not exist in {qa_path}.") + previous_result = pd.read_parquet(qa_path, engine="pyarrow") + + summary_lst = [] + for node in nodes: + previous_result = node.run(previous_result, node_line_dir) + node_summary_df = load_summary_file( + os.path.join(node_line_dir, node.node_type, "summary.csv") + ) + best_node_row = node_summary_df.loc[node_summary_df["is_best"]] + summary_lst.append( + { + "node_type": node.node_type, + "best_module_filename": best_node_row["filename"].values[0], + "best_module_name": best_node_row["module_name"].values[0], + "best_module_params": best_node_row["module_params"].values[0], + "best_execution_time": best_node_row["execution_time"].values[0], + } + ) + # Update progress for each node + if progress: + progress.update(task_eval, advance=1) + + pd.DataFrame(summary_lst).to_csv( + os.path.join(node_line_dir, "summary.csv"), index=False + ) + return previous_result diff --git a/autorag-workspace/autorag/nodes/__init__.py b/autorag-workspace/autorag/nodes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autorag-workspace/autorag/nodes/generator/__init__.py b/autorag-workspace/autorag/nodes/generator/__init__.py new file mode 100644 index 0000000..9fce695 --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/__init__.py @@ -0,0 +1,4 @@ +from .llama_index_llm import LlamaIndexLLM +from .openai_llm import OpenAILLM +from .vllm import Vllm +from .vllm_api import VllmAPI diff --git a/autorag-workspace/autorag/nodes/generator/base.py b/autorag-workspace/autorag/nodes/generator/base.py new file mode 100644 index 0000000..94f1833 --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/base.py @@ -0,0 +1,103 @@ +import abc +import functools +import logging +from pathlib import Path +from typing import Union, Tuple, List + +import pandas as pd +from llama_index.core.output_parsers import PydanticOutputParser + +from autorag import generator_models +from autorag.schema import BaseModule +from autorag.utils import result_to_dataframe + 
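+# BaseGenerator is the abstract parent of the generator modules exported from this package
+# (LlamaIndexLLM, OpenAILLM, Vllm, VllmAPI); `generator_node` wraps function-style generator
+# modules so they can run as a node.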
+logger = logging.getLogger("AutoRAG") + + +class BaseGenerator(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: str, llm: str, *args, **kwargs): + logger.info(f"Initialize generator node - {self.__class__.__name__}") + self.llm = llm + + def __del__(self): + logger.info(f"Deleting generator module - {self.__class__.__name__}") + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info(f"Running generator node - {self.__class__.__name__} module...") + assert ( + "prompts" in previous_result.columns + ), "previous_result must contain prompts column." + prompts = previous_result["prompts"].tolist() + return prompts + + def structured_output(self, prompts: List[str], output_cls): + response, _, _ = self._pure(prompts) + parser = PydanticOutputParser(output_cls) + result = [] + for res in response: + try: + result.append(parser.parse(res)) + except Exception as e: + logger.warning( + f"Error parsing response: {e} \nSo returning None instead in this case." + ) + result.append(None) + return result + + @abc.abstractmethod + async def astream(self, prompt: str, **kwargs): + pass + + @abc.abstractmethod + def stream(self, prompt: str, **kwargs): + pass + + +def generator_node(func): + @functools.wraps(func) + @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"]) + def wrapper( + project_dir: Union[str, Path], previous_result: pd.DataFrame, llm: str, **kwargs + ) -> Tuple[List[str], List[List[int]], List[List[float]]]: + """ + This decorator makes a generator module to be a node. + It automatically extracts prompts from previous_result and runs the generator function. + Plus, it retrieves the llm instance from autorag.generator_models. + + :param project_dir: The project directory. + :param previous_result: The previous result that contains prompts, + :param llm: The llm name that you want to use. + :param kwargs: The extra parameters for initializing the llm instance. + :return: Pandas dataframe that contains generated texts, generated tokens, and generated log probs. + Each column is "generated_texts", "generated_tokens", and "generated_log_probs". + """ + logger.info(f"Running generator node - {func.__name__} module...") + assert ( + "prompts" in previous_result.columns + ), "previous_result must contain prompts column." + prompts = previous_result["prompts"].tolist() + if func.__name__ == "llama_index_llm": + if llm not in generator_models: + raise ValueError( + f"{llm} is not a valid llm name. Please check the llm name." + "You can check valid llm names from autorag.generator_models." + ) + batch = kwargs.pop("batch", 16) + if llm == "huggingfacellm": + model_name = kwargs.pop("model", None) + if model_name is not None: + kwargs["model_name"] = model_name + else: + if "model_name" not in kwargs.keys(): + raise ValueError( + "`model` or `model_name` parameter must be provided for using huggingfacellm." 
+ ) + kwargs["tokenizer_name"] = kwargs["model_name"] + llm_instance = generator_models[llm](**kwargs) + result = func(prompts=prompts, llm=llm_instance, batch=batch) + del llm_instance + return result + else: + return func(prompts=prompts, llm=llm, **kwargs) + + return wrapper diff --git a/autorag-workspace/autorag/nodes/generator/llama_index_llm.py b/autorag-workspace/autorag/nodes/generator/llama_index_llm.py new file mode 100644 index 0000000..a3a99f8 --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/llama_index_llm.py @@ -0,0 +1,97 @@ +from typing import List, Tuple + +import pandas as pd +from llama_index.core.base.llms.base import BaseLLM +from transformers import AutoTokenizer + +from autorag import generator_models +from autorag.nodes.generator.base import BaseGenerator +from autorag.utils.util import ( + get_event_loop, + process_batch, + result_to_dataframe, + pop_params, +) + + +class LlamaIndexLLM(BaseGenerator): + def __init__(self, project_dir: str, llm: str, batch: int = 16, *args, **kwargs): + """ + Initialize the Llama Index LLM module. + + :param project_dir: The project directory. + :param llm: A llama index LLM instance. + :param batch: The batch size for llm. + Set low if you face some errors. + Default is 16. + :param kwargs: The extra parameters for initializing the llm instance. + """ + super().__init__(project_dir=project_dir, llm=llm) + if self.llm not in generator_models.keys(): + raise ValueError( + f"{self.llm} is not a valid llm name. Please check the llm name." + "You can check valid llm names from autorag.generator_models." + ) + self.batch = batch + llm_class = generator_models[self.llm] + + if llm_class.class_name() in [ + "HuggingFace_LLM", + "HuggingFaceInferenceAPI", + "TextGenerationInference", + ]: + model_name = kwargs.pop("model", None) + if model_name is not None: + kwargs["model_name"] = model_name + else: + if "model_name" not in kwargs.keys(): + raise ValueError( + "`model` or `model_name` parameter must be provided for using huggingfacellm." + ) + kwargs["tokenizer_name"] = kwargs["model_name"] + self.llm_instance: BaseLLM = llm_class(**pop_params(llm_class.__init__, kwargs)) + + def __del__(self): + super().__del__() + del self.llm_instance + + @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + prompts = self.cast_to_run(previous_result=previous_result) + return self._pure(prompts) + + def _pure( + self, + prompts: List[str], + ) -> Tuple[List[str], List[List[int]], List[List[float]]]: + """ + Llama Index LLM module. + It gets the LLM instance from llama index, and returns generated text by the input prompt. + It does not generate the right log probs, but it returns the pseudo log probs, + which are not meant to be used for other modules. + + :param prompts: A list of prompts. + :return: A tuple of three elements. + The first element is a list of a generated text. + The second element is a list of generated text's token ids, used tokenizer is GPT2Tokenizer. + The third element is a list of generated text's pseudo log probs. 
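+        The pseudo log probs are a constant 0.5 per token; they only keep the output schema
+        consistent with generators that return real log probs.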
+        """
+        tasks = [self.llm_instance.acomplete(prompt) for prompt in prompts]
+        loop = get_event_loop()
+        results = loop.run_until_complete(process_batch(tasks, batch_size=self.batch))
+
+        generated_texts = list(map(lambda x: x.text, results))
+        tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
+        tokenized_ids = tokenizer(generated_texts).data["input_ids"]
+        pseudo_log_probs = list(map(lambda x: [0.5] * len(x), tokenized_ids))
+        return generated_texts, tokenized_ids, pseudo_log_probs
+
+    async def astream(self, prompt: str, **kwargs):
+        async for completion_response in await self.llm_instance.astream_complete(
+            prompt
+        ):
+            yield completion_response.text
+
+    def stream(self, prompt: str, **kwargs):
+        for completion_response in self.llm_instance.stream_complete(prompt):
+            yield completion_response.text
diff --git a/autorag-workspace/autorag/nodes/generator/openai_llm.py b/autorag-workspace/autorag/nodes/generator/openai_llm.py
new file mode 100644
index 0000000..cdf2fc4
--- /dev/null
+++ b/autorag-workspace/autorag/nodes/generator/openai_llm.py
@@ -0,0 +1,296 @@
+import logging
+from typing import List, Tuple
+
+import pandas as pd
+import tiktoken
+from openai import AsyncOpenAI
+from tiktoken import Encoding
+
+from autorag.nodes.generator.base import BaseGenerator
+from autorag.utils.util import (
+    get_event_loop,
+    process_batch,
+    pop_params,
+    result_to_dataframe,
+)
+
+logger = logging.getLogger("AutoRAG")
+
+MAX_TOKEN_DICT = {  # model name : token limit
+    "gpt-4.5-preview": 128_000,
+    "gpt-4.5-preview-2025-02-27": 128_000,
+    "o1": 200_000,
+    "o1-preview": 128_000,
+    "o1-preview-2024-09-12": 128_000,
+    "o1-mini": 128_000,
+    "o1-mini-2024-09-12": 128_000,
+    "o3-mini": 200_000,
+    "gpt-4o-mini": 128_000,
+    "gpt-4o-mini-2024-07-18": 128_000,
+    "gpt-4o": 128_000,
+    "gpt-4o-2024-08-06": 128_000,
+    "gpt-4o-2024-05-13": 128_000,
+    "chatgpt-4o-latest": 128_000,
+    "gpt-4-turbo": 128_000,
+    "gpt-4-turbo-2024-04-09": 128_000,
+    "gpt-4-turbo-preview": 128_000,
+    "gpt-4-0125-preview": 128_000,
+    "gpt-4-1106-preview": 128_000,
+    "gpt-4-vision-preview": 128_000,
+    "gpt-4-1106-vision-preview": 128_000,
+    "gpt-4": 8_192,
+    "gpt-4-0613": 8_192,
+    "gpt-4-32k": 32_768,
+    "gpt-4-32k-0613": 32_768,
+    "gpt-3.5-turbo-0125": 16_385,
+    "gpt-3.5-turbo": 16_385,
+    "gpt-3.5-turbo-1106": 16_385,
+    "gpt-3.5-turbo-instruct": 4_096,
+    "gpt-3.5-turbo-16k": 16_385,
+    "gpt-3.5-turbo-0613": 4_096,
+    "gpt-3.5-turbo-16k-0613": 16_385,
+}
+
+
+class OpenAILLM(BaseGenerator):
+    def __init__(self, project_dir, llm: str, batch: int = 16, *args, **kwargs):
+        super().__init__(project_dir, llm, *args, **kwargs)
+        assert batch > 0, "batch size must be greater than 0."
+        self.batch = batch
+
+        client_init_params = pop_params(AsyncOpenAI.__init__, kwargs)
+        self.client = AsyncOpenAI(**client_init_params)
+
+        if self.llm.startswith("gpt-4.5"):
+            self.tokenizer = tiktoken.get_encoding("o200k_base")
+        else:
+            self.tokenizer = tiktoken.encoding_for_model(self.llm)
+
+        # 7 tokens are reserved for chat message formatting overhead.
+        # Validate the model name before the limit is used so an unknown model
+        # raises a clear ValueError.
+        self.max_token_size = MAX_TOKEN_DICT.get(self.llm, 0) - 7
+        if self.llm not in MAX_TOKEN_DICT:
+            raise ValueError(
+                f"Model {self.llm} is not supported. 
" + f"Please select the model between {list(MAX_TOKEN_DICT.keys())}" + ) + + @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + prompts = self.cast_to_run(previous_result) + return self._pure(prompts, **kwargs) + + def _pure( + self, + prompts: List[str], + truncate: bool = True, + **kwargs, + ) -> Tuple[List[str], List[List[int]], List[List[float]]]: + """ + OpenAI generator module. + Uses an official openai library for generating answer from the given prompt. + It returns real token ids and log probs, so you must use this for using token ids and log probs. + + :param prompts: A list of prompts. + :param llm: A model name for openai. + Default is gpt-3.5-turbo. + :param batch: Batch size for openai api call. + If you get API limit errors, you should lower the batch size. + Default is 16. + :param truncate: Whether to truncate the input prompt. + Default is True. + :param api_key: OpenAI API key. You can set this by passing env variable `OPENAI_API_KEY` + :param kwargs: The optional parameter for openai api call `openai.chat.completion` + See https://platform.openai.com/docs/api-reference/chat/create for more details. + :return: A tuple of three elements. + The first element is a list of generated text. + The second element is a list of generated text's token ids. + The third element is a list of generated text's log probs. + """ + if kwargs.get("logprobs") is not None: + kwargs.pop("logprobs") + logger.warning( + "parameter logprob does not effective. It always set to True." + ) + if kwargs.get("n") is not None: + kwargs.pop("n") + logger.warning("parameter n does not effective. It always set to 1.") + + # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not yet supported yet. + if truncate: + prompts = list( + map( + lambda prompt: truncate_by_token( + prompt, self.tokenizer, self.max_token_size + ), + prompts, + ) + ) + + openai_chat_params = pop_params(self.client.chat.completions.create, kwargs) + loop = get_event_loop() + if self.llm.startswith("o1") or self.llm.startswith("o3"): + tasks = [ + self.get_result_o1(prompt, **openai_chat_params) for prompt in prompts + ] + else: + tasks = [ + self.get_result(prompt, **openai_chat_params) for prompt in prompts + ] + result = loop.run_until_complete(process_batch(tasks, self.batch)) + answer_result = list(map(lambda x: x[0], result)) + token_result = list(map(lambda x: x[1], result)) + logprob_result = list(map(lambda x: x[2], result)) + return answer_result, token_result, logprob_result + + def structured_output(self, prompts: List[str], output_cls, **kwargs): + supported_models = [ + "gpt-4o-mini-2024-07-18", + "gpt-4o-2024-08-06", + ] + if self.llm not in supported_models: + raise ValueError( + f"{self.llm} is not a valid model name for structured output. " + f"Please select the model between {supported_models}" + ) + + if kwargs.get("logprobs") is not None: + kwargs.pop("logprobs") + logger.warning( + "parameter logprob does not effective. It always set to False." + ) + if kwargs.get("n") is not None: + kwargs.pop("n") + logger.warning("parameter n does not effective. It always set to 1.") + + # TODO: fix this after updating tiktoken for the gpt-4.5 model. It is not yet supported yet. 
+ prompts = list( + map( + lambda prompt: truncate_by_token( + prompt, self.tokenizer, self.max_token_size + ), + prompts, + ) + ) + + openai_chat_params = pop_params(self.client.beta.chat.completions.parse, kwargs) + loop = get_event_loop() + tasks = [ + self.get_structured_result(prompt, output_cls, **openai_chat_params) + for prompt in prompts + ] + result = loop.run_until_complete(process_batch(tasks, self.batch)) + return result + + async def astream(self, prompt: str, **kwargs): + # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update. + if kwargs.get("logprobs") is not None: + kwargs.pop("logprobs") + logger.warning( + "parameter logprob does not effective. It always set to False." + ) + if kwargs.get("n") is not None: + kwargs.pop("n") + logger.warning("parameter n does not effective. It always set to 1.") + + prompt = truncate_by_token(prompt, self.tokenizer, self.max_token_size) + + openai_chat_params = pop_params(self.client.chat.completions.create, kwargs) + + stream = await self.client.chat.completions.create( + model=self.llm, + messages=[ + {"role": "user", "content": prompt}, + ], + logprobs=False, + n=1, + stream=True, + **openai_chat_params, + ) + result = "" + async for chunk in stream: + if chunk.choices[0].delta.content is not None: + result += chunk.choices[0].delta.content + yield result + + def stream(self, prompt: str, **kwargs): + raise NotImplementedError("stream method is not implemented yet.") + + async def get_structured_result(self, prompt: str, output_cls, **kwargs): + logprobs = True + if self.llm.startswith("gpt-4.5"): + logprobs = False + response = await self.client.beta.chat.completions.parse( + model=self.llm, + messages=[ + {"role": "user", "content": prompt}, + ], + response_format=output_cls, + logprobs=logprobs, + n=1, + **kwargs, + ) + return response.choices[0].message.parsed + + async def get_result(self, prompt: str, **kwargs): + # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update. + logprobs = True + if self.llm.startswith("gpt-4.5"): + logprobs = False + response = await self.client.chat.completions.create( + model=self.llm, + messages=[ + {"role": "user", "content": prompt}, + ], + logprobs=logprobs, + n=1, + **kwargs, + ) + choice = response.choices[0] + answer = choice.message.content + # TODO: gpt-4.5-preview does not support logprobs. It should be fixed after the openai update. + if self.llm.startswith("gpt-4.5"): + tokens = self.tokenizer.encode(answer, allowed_special="all") + logprobs = [0.5] * len(tokens) + logger.warning("gpt-4.5-preview does not support logprobs yet.") + else: + logprobs = list(map(lambda x: x.logprob, choice.logprobs.content)) + tokens = list( + map( + lambda x: self.tokenizer.encode(x.token, allowed_special="all")[0], + choice.logprobs.content, + ) + ) + assert len(tokens) == len( + logprobs + ), "tokens and logprobs size is different." + return answer, tokens, logprobs + + async def get_result_o1(self, prompt: str, **kwargs): + assert self.llm.startswith("o1") or self.llm.startswith( + "o3" + ), "This function only supports o1 or o3 model." + # The default temperature for the o1 model is 1. 1 is only supported. + # See https://platform.openai.com/docs/guides/reasoning about beta limitation of o1 models. 
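+        # logprobs is disabled for o1/o3 requests, so pseudo log probs (0.5 per token)
+        # are returned instead to keep the output schema consistent.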
+ kwargs["temperature"] = 1 + kwargs["top_p"] = 1 + kwargs["presence_penalty"] = 0 + kwargs["frequency_penalty"] = 0 + response = await self.client.chat.completions.create( + model=self.llm, + messages=[ + {"role": "user", "content": prompt}, + ], + logprobs=False, + n=1, + **kwargs, + ) + answer = response.choices[0].message.content + tokens = self.tokenizer.encode(answer, allowed_special="all") + pseudo_log_probs = [0.5] * len(tokens) + return answer, tokens, pseudo_log_probs + + +def truncate_by_token(prompt: str, tokenizer: Encoding, max_token_size: int): + tokens = tokenizer.encode(prompt, allowed_special="all") + return tokenizer.decode(tokens[:max_token_size]) diff --git a/autorag-workspace/autorag/nodes/generator/run.py b/autorag-workspace/autorag/nodes/generator/run.py new file mode 100644 index 0000000..aee3238 --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/run.py @@ -0,0 +1,144 @@ +import os +import pathlib +from typing import List, Dict, Union + +import pandas as pd + +from autorag.evaluation import evaluate_generation +from autorag.evaluation.util import cast_metrics +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.utils.util import to_list + + +def run_generator_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among generator node results. + And save the results and summary to generator node directory. + + :param modules: Generator modules to run. + :param module_params: Generator module parameters. + Including node parameters, which is used for every module in this node. + :param previous_result: Previous result dataframe. + Could be prompt maker node's result. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for generator node. + :return: The best result dataframe. + It contains previous result columns and generator node's result columns. 
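+    Supported strategy keys are "metrics" (required), "speed_threshold", "token_threshold",
+    and "strategy" (how the best module is selected, default "mean").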
+ """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + node_dir = os.path.join(node_line_dir, "generator") # node name + if not os.path.exists(node_dir): + os.makedirs(node_dir) + qa_data = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + if "generation_gt" not in qa_data.columns: + raise ValueError("You must have 'generation_gt' column in qa.parquet.") + + results, execution_times = zip( + *map( + lambda x: measure_speed( + x[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **x[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # get average token usage + token_usages = list(map(lambda x: x["generated_tokens"].apply(len).mean(), results)) + + # make rows to metric_inputs + generation_gt = to_list(qa_data["generation_gt"].tolist()) + + metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt] + + metric_names, metric_params = cast_metrics(strategies.get("metrics")) + if metric_names is None or len(metric_names) <= 0: + raise ValueError("You must at least one metrics for generator evaluation.") + results = list( + map( + lambda result: evaluate_generator_node( + result, metric_inputs, strategies.get("metrics") + ), + results, + ) + ) + + # save results to folder + filepaths = list( + map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + "average_output_token": token_usages, + **{ + metric: list(map(lambda x: x[metric].mean(), results)) + for metric in metric_names + }, + } + ) + + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + if strategies.get("token_threshold") is not None: + results, filenames = filter_by_threshold( + results, token_usages, strategies["token_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, metric_names, filenames, strategies.get("strategy", "mean") + ) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add 'is_best' column at summary file + summary_df["is_best"] = summary_df["filename"] == selected_filename + + # save files + summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join( + node_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + index=False, + ) + return best_result + + +def evaluate_generator_node( + result_df: pd.DataFrame, + metric_inputs: List[MetricInput], + metrics: Union[List[str], List[Dict]], +): + @evaluate_generation(metric_inputs=metric_inputs, metrics=metrics) + def evaluate_generation_module(df: pd.DataFrame): + return ( + df["generated_texts"].tolist(), + df["generated_tokens"].tolist(), + df["generated_log_probs"].tolist(), + ) + + return evaluate_generation_module(result_df) diff --git a/autorag-workspace/autorag/nodes/generator/vllm.py b/autorag-workspace/autorag/nodes/generator/vllm.py new file mode 100644 index 
0000000..c6684d8 --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/vllm.py @@ -0,0 +1,121 @@ +import gc +from copy import deepcopy +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.generator.base import BaseGenerator +from autorag.utils import result_to_dataframe +from autorag.utils.util import pop_params, to_list + + +class Vllm(BaseGenerator): + def __init__(self, project_dir: str, llm: str, **kwargs): + super().__init__(project_dir, llm, **kwargs) + try: + from vllm import SamplingParams, LLM + except ImportError: + raise ImportError( + "Please install vllm library. You can install it by running `pip install vllm`." + ) + + model_from_kwargs = kwargs.pop("model", None) + model = llm if model_from_kwargs is None else model_from_kwargs + + input_kwargs = deepcopy(kwargs) + sampling_params_init_params = pop_params( + SamplingParams.from_optional, input_kwargs + ) + self.vllm_model = LLM(model, **input_kwargs) + + # delete not sampling param keys in the kwargs + kwargs_keys = list(kwargs.keys()) + for key in kwargs_keys: + if key not in sampling_params_init_params: + kwargs.pop(key) + + def __del__(self): + try: + import torch + import contextlib + + if torch.cuda.is_available(): + from vllm.distributed.parallel_state import ( + destroy_model_parallel, + destroy_distributed_environment, + ) + + destroy_model_parallel() + destroy_distributed_environment() + del self.vllm_model.llm_engine.model_executor + del self.vllm_model + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.cuda.empty_cache() + torch.cuda.synchronize() + except ImportError: + del self.vllm_model + + super().__del__() + + @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + prompts = self.cast_to_run(previous_result) + return self._pure(prompts, **kwargs) + + def _pure( + self, prompts: List[str], **kwargs + ) -> Tuple[List[str], List[List[int]], List[List[float]]]: + """ + Vllm module. + It gets the VLLM instance and returns generated texts by the input prompt. + You can set logprobs to get the log probs of the generated text. + Default logprobs is 1. + + :param prompts: A list of prompts. + :param kwargs: The extra parameters for generating the text. + :return: A tuple of three elements. + The first element is a list of generated text. + The second element is a list of generated text's token ids. + The third element is a list of generated text's log probs. + """ + try: + from vllm.outputs import RequestOutput + from vllm.sequence import SampleLogprobs + from vllm import SamplingParams + except ImportError: + raise ImportError( + "Please install vllm library. You can install it by running `pip install vllm`." 
+ ) + + if "logprobs" not in kwargs: + kwargs["logprobs"] = 1 + + sampling_params = pop_params(SamplingParams.from_optional, kwargs) + generate_params = SamplingParams(**sampling_params) + results: List[RequestOutput] = self.vllm_model.generate( + prompts, generate_params + ) + generated_texts = list(map(lambda x: x.outputs[0].text, results)) + generated_token_ids = list(map(lambda x: x.outputs[0].token_ids, results)) + log_probs: List[SampleLogprobs] = list( + map(lambda x: x.outputs[0].logprobs, results) + ) + generated_log_probs = list( + map( + lambda x: list(map(lambda y: y[0][y[1]].logprob, zip(x[0], x[1]))), + zip(log_probs, generated_token_ids), + ) + ) + return ( + to_list(generated_texts), + to_list(generated_token_ids), + to_list(generated_log_probs), + ) + + async def astream(self, prompt: str, **kwargs): + raise NotImplementedError + + def stream(self, prompt: str, **kwargs): + raise NotImplementedError diff --git a/autorag-workspace/autorag/nodes/generator/vllm_api.py b/autorag-workspace/autorag/nodes/generator/vllm_api.py new file mode 100644 index 0000000..4d46f5d --- /dev/null +++ b/autorag-workspace/autorag/nodes/generator/vllm_api.py @@ -0,0 +1,176 @@ +import logging +from typing import List, Tuple +import time + +import pandas as pd +import requests +from asyncio import to_thread + +from autorag.nodes.generator.base import BaseGenerator +from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe + +logger = logging.getLogger("AutoRAG") + +DEFAULT_MAX_TOKENS = 4096 # Default token limit + + +class VllmAPI(BaseGenerator): + def __init__( + self, + project_dir, + llm: str, + uri: str, + max_tokens: int = None, + batch: int = 16, + *args, + **kwargs, + ): + """ + VLLM API Wrapper for OpenAI-compatible chat/completions format. + + :param project_dir: Project directory. + :param llm: Model name (e.g., LLaMA model). + :param uri: VLLM API server URI. + :param max_tokens: Maximum token limit. + Default is 4096. + :param batch: Request batch size. + Default is 16. + """ + super().__init__(project_dir, llm, *args, **kwargs) + assert batch > 0, "Batch size must be greater than 0." + self.uri = uri.rstrip("/") # Set API URI + self.batch = batch + # Use the provided max_tokens if available, otherwise use the default + self.max_token_size = max_tokens if max_tokens else DEFAULT_MAX_TOKENS + self.max_model_len = self.get_max_model_length() + logger.info(f"{llm} max model length: {self.max_model_len}") + + @result_to_dataframe(["generated_texts", "generated_tokens", "generated_log_probs"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + prompts = self.cast_to_run(previous_result) + return self._pure(prompts, **kwargs) + + def _pure( + self, prompts: List[str], truncate: bool = True, **kwargs + ) -> Tuple[List[str], List[List[int]], List[List[float]]]: + """ + Method to call the VLLM API to generate text. + + :param prompts: List of input prompts. + :param truncate: Whether to truncate input prompts to fit within the token limit. + :param kwargs: Additional options (e.g., temperature, top_p). + :return: Generated text, token lists, and log probability lists. + """ + if kwargs.get("logprobs") is not None: + kwargs.pop("logprobs") + logger.warning( + "parameter logprob does not effective. It always set to True." + ) + if kwargs.get("n") is not None: + kwargs.pop("n") + logger.warning("parameter n does not effective. 
It always set to 1.") + + if truncate: + prompts = list(map(lambda p: self.truncate_by_token(p), prompts)) + loop = get_event_loop() + tasks = [to_thread(self.get_result, prompt, **kwargs) for prompt in prompts] + results = loop.run_until_complete(process_batch(tasks, self.batch)) + + answer_result = list(map(lambda x: x[0], results)) + token_result = list(map(lambda x: x[1], results)) + logprob_result = list(map(lambda x: x[2], results)) + return answer_result, token_result, logprob_result + + def truncate_by_token(self, prompt: str) -> str: + """ + Function to truncate prompts to fit within the maximum token limit. + """ + tokens = self.encoding_for_model(prompt)["tokens"] # Simple tokenization + return self.decoding_for_model(tokens[: self.max_model_len])["prompt"] + + def call_vllm_api(self, prompt: str, **kwargs) -> dict: + """ + Calls the VLLM API to get chat/completions responses. + + :param prompt: Input prompt. + :param kwargs: Additional API options (e.g., temperature, max_tokens). + :return: API response. + """ + payload = { + "model": self.llm, + "messages": [{"role": "user", "content": prompt}], + "temperature": kwargs.get("temperature", 0.4), + "max_tokens": min( + kwargs.get("max_tokens", self.max_token_size), self.max_token_size + ), + "logprobs": True, + "n": 1, + } + start_time = time.time() # Record request start time + response = requests.post(f"{self.uri}/v1/chat/completions", json=payload) + end_time = time.time() # Record request end time + + response.raise_for_status() + elapsed_time = end_time - start_time # Calculate elapsed time + logger.info( + f"Request chat completions to vllm server completed in {elapsed_time:.2f} seconds" + ) + return response.json() + + # Additional method: abstract method implementation + async def astream(self, prompt: str, **kwargs): + """ + Asynchronous streaming method not implemented. + """ + raise NotImplementedError("astream method is not implemented for VLLM API yet.") + + def stream(self, prompt: str, **kwargs): + """ + Synchronous streaming method not implemented. 
+ """ + raise NotImplementedError("stream method is not implemented for VLLM API yet.") + + def get_result(self, prompt: str, **kwargs): + response = self.call_vllm_api(prompt, **kwargs) + choice = response["choices"][0] + answer = choice["message"]["content"] + + # Handle cases where logprobs is None + if choice.get("logprobs") and "content" in choice["logprobs"]: + logprobs = list(map(lambda x: x["logprob"], choice["logprobs"]["content"])) + tokens = list( + map( + lambda x: self.encoding_for_model(x["token"])["tokens"], + choice["logprobs"]["content"], + ) + ) + else: + logprobs = [] + tokens = [] + + return answer, tokens, logprobs + + def encoding_for_model(self, answer_piece: str): + payload = { + "model": self.llm, + "prompt": answer_piece, + "add_special_tokens": True, + } + response = requests.post(f"{self.uri}/tokenize", json=payload) + response.raise_for_status() + return response.json() + + def decoding_for_model(self, tokens: list[int]): + payload = { + "model": self.llm, + "tokens": tokens, + } + response = requests.post(f"{self.uri}/detokenize", json=payload) + response.raise_for_status() + return response.json() + + def get_max_model_length(self): + response = requests.get(f"{self.uri}/v1/models") + response.raise_for_status() + json_data = response.json() + return json_data["data"][0]["max_model_len"] diff --git a/autorag-workspace/autorag/nodes/passageaugmenter/__init__.py b/autorag-workspace/autorag/nodes/passageaugmenter/__init__.py new file mode 100644 index 0000000..18d42df --- /dev/null +++ b/autorag-workspace/autorag/nodes/passageaugmenter/__init__.py @@ -0,0 +1,2 @@ +from .pass_passage_augmenter import PassPassageAugmenter +from .prev_next_augmenter import PrevNextPassageAugmenter diff --git a/autorag-workspace/autorag/nodes/passageaugmenter/base.py b/autorag-workspace/autorag/nodes/passageaugmenter/base.py new file mode 100644 index 0000000..6b96a35 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passageaugmenter/base.py @@ -0,0 +1,80 @@ +import abc +import logging +import os + +import pandas as pd + +from autorag.schema import BaseModule +from autorag.utils import ( + validate_qa_dataset, + sort_by_scores, + validate_corpus_dataset, + cast_corpus_dataset, +) +from autorag.utils.util import select_top_k + +logger = logging.getLogger("AutoRAG") + + +class BasePassageAugmenter(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: str, *args, **kwargs): + logger.info( + f"Initialize passage augmenter node - {self.__class__.__name__} module..." + ) + data_dir = os.path.join(project_dir, "data") + corpus_df = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + validate_corpus_dataset(corpus_df) + corpus_df = cast_corpus_dataset(corpus_df) + self.corpus_df = corpus_df + + def __del__(self): + logger.info( + f"Initialize passage augmenter node - {self.__class__.__name__} module..." + ) + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info( + f"Running passage augmenter node - {self.__class__.__name__} module..." + ) + validate_qa_dataset(previous_result) + + # find ids columns + assert ( + "retrieved_ids" in previous_result.columns + ), "previous_result must have retrieved_ids column." 
+ ids = previous_result["retrieved_ids"].tolist() + + return ids + + @staticmethod + def sort_by_scores( + augmented_contents, + augmented_ids, + augmented_scores, + top_k: int, + reverse: bool = True, + ): + # sort by scores + df = pd.DataFrame( + { + "contents": augmented_contents, + "ids": augmented_ids, + "scores": augmented_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + lambda row: sort_by_scores(row, reverse=reverse), + axis=1, + result_type="expand", + ) + + # select by top_k + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) diff --git a/autorag-workspace/autorag/nodes/passageaugmenter/pass_passage_augmenter.py b/autorag-workspace/autorag/nodes/passageaugmenter/pass_passage_augmenter.py new file mode 100644 index 0000000..7978ad8 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passageaugmenter/pass_passage_augmenter.py @@ -0,0 +1,43 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.passageaugmenter.base import BasePassageAugmenter +from autorag.utils import result_to_dataframe + + +class PassPassageAugmenter(BasePassageAugmenter): + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + """ + Run the passage augmenter node - PassPassageAugmenter module. + + :param previous_result: The previous result Dataframe. + :param top_k: You must input the top_k value to get the top k results. + :param kwargs: Not affected. + :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns + """ + top_k = kwargs.pop("top_k") + + ids = self.cast_to_run(previous_result) + contents = previous_result["retrieved_contents"].tolist() + scores = previous_result["retrieve_scores"].tolist() + + augmented_ids, augmented_contents, augmented_scores = self._pure( + ids, contents, scores + ) + return self.sort_by_scores( + augmented_contents, augmented_ids, augmented_scores, top_k + ) + + def _pure( + self, + ids_list: List[List[str]], + contents_list: List[List[str]], + scores_list: List[List[float]], + ): + """ + Do not perform augmentation. + Return given passages, scores, and ids as is. + """ + return ids_list, contents_list, scores_list diff --git a/autorag-workspace/autorag/nodes/passageaugmenter/prev_next_augmenter.py b/autorag-workspace/autorag/nodes/passageaugmenter/prev_next_augmenter.py new file mode 100644 index 0000000..13e4063 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passageaugmenter/prev_next_augmenter.py @@ -0,0 +1,155 @@ +from typing import List, Union + +import numpy as np +import pandas as pd + +from autorag.embedding.base import EmbeddingModel +from autorag.evaluation.metric.util import calculate_cosine_similarity +from autorag.nodes.passageaugmenter.base import BasePassageAugmenter +from autorag.utils.util import ( + filter_dict_keys, + fetch_contents, + embedding_query_content, + result_to_dataframe, + empty_cuda_cache, +) + + +class PrevNextPassageAugmenter(BasePassageAugmenter): + def __init__( + self, + project_dir: str, + embedding_model: Union[str, dict] = "openai", + *args, + **kwargs, + ): + """ + Initialize the PrevNextPassageAugmenter module. 
+ + :param project_dir: + :param embedding_model: The embedding model name to use for calculating cosine similarity + Default is openai (text-embedding-ada-002) + :param kwargs: + """ + super().__init__(project_dir, *args, **kwargs) + slim_corpus_df = self.corpus_df[["doc_id", "metadata"]] + slim_corpus_df.loc[:, "metadata"] = slim_corpus_df["metadata"].apply( + filter_dict_keys, keys=["prev_id", "next_id"] + ) + self.slim_corpus_df = slim_corpus_df + + # init embedding model + self.embedding_model = EmbeddingModel.load(embedding_model)() + + def __del__(self): + del self.embedding_model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + """ + Run the passage augmenter node - PrevNextPassageAugmenter module. + + :param previous_result: The previous result Dataframe. + :param top_k: You must input the top_k value to get the top k results. + :param kwargs: Not affected. + :return: DataFrame with retrieved_contents, retrieved_ids, and retrieve_scores columns + """ + top_k = kwargs.pop("top_k") + + ids = self.cast_to_run(previous_result) + # find queries columns + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." + queries = previous_result["query"].tolist() + + mode = kwargs.pop("mode", "both") + num_passages = kwargs.pop("num_passages", 1) + augmented_ids = self._pure(ids, num_passages, mode) + + # fetch contents from corpus to use augmented ids + augmented_contents = fetch_contents(self.corpus_df, augmented_ids) + + query_embeddings, contents_embeddings = embedding_query_content( + queries, augmented_contents, self.embedding_model, batch=128 + ) + + # get scores from calculated cosine similarity + augmented_scores = [ + np.array( + [ + calculate_cosine_similarity(query_embedding, x) + for x in content_embeddings + ] + ).tolist() + for query_embedding, content_embeddings in zip( + query_embeddings, contents_embeddings + ) + ] + return self.sort_by_scores( + augmented_contents, augmented_ids, augmented_scores, top_k + ) + + def _pure( + self, + ids_list: List[List[str]], + num_passages: int = 1, + mode: str = "both", + ) -> List[List[str]]: + """ + Add passages before and/or after the retrieved passage. + For more information, visit https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PrevNextPostprocessorDemo/. + + :param ids_list: The list of lists of ids retrieved + :param num_passages: The number of passages to add before and after the retrieved passage + Default is 1. + :param mode: The mode of augmentation + 'prev': add passages before the retrieved passage + 'next': add passages after the retrieved passage + 'both': add passages before and after the retrieved passage + Default is 'next'. 
+ :return: The list of lists of augmented ids + """ + if mode not in ["prev", "next", "both"]: + raise ValueError(f"mode must be 'prev', 'next', or 'both', but got {mode}") + + augmented_ids = [ + ( + lambda ids: prev_next_augmenter_pure( + ids, self.slim_corpus_df, mode, num_passages + ) + )(ids) + for ids in ids_list + ] + + return augmented_ids + + +def prev_next_augmenter_pure( + ids: List[str], corpus_df: pd.DataFrame, mode: str, num_passages: int +): + def fetch_id_sequence(start_id, key): + sequence = [] + current_id = start_id + for _ in range(num_passages): + current_id = ( + corpus_df.loc[corpus_df["doc_id"] == current_id]["metadata"] + .values[0] + .get(key) + ) + if current_id is None: + break + sequence.append(current_id) + return sequence + + augmented_group = [] + for id_ in ids: + current_ids = [id_] + if mode in ["prev", "both"]: + current_ids = fetch_id_sequence(id_, "prev_id")[::-1] + current_ids + if mode in ["next", "both"]: + current_ids += fetch_id_sequence(id_, "next_id") + augmented_group.extend(current_ids) + return augmented_group diff --git a/autorag-workspace/autorag/nodes/passageaugmenter/run.py b/autorag-workspace/autorag/nodes/passageaugmenter/run.py new file mode 100644 index 0000000..30e55da --- /dev/null +++ b/autorag-workspace/autorag/nodes/passageaugmenter/run.py @@ -0,0 +1,131 @@ +import logging +import os +import pathlib +from typing import List, Dict + +import pandas as pd + +from autorag.nodes.retrieval.run import evaluate_retrieval_node +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.utils.util import apply_recursive, to_list + +logger = logging.getLogger("AutoRAG") + + +def run_passage_augmenter_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + qa_df = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + retrieval_gt = qa_df["retrieval_gt"].tolist() + retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt)) + + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + metric_inputs = [ + MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt) + for ret_gt, query, gen_gt in zip( + retrieval_gt, + previous_result["query"].tolist(), + previous_result["generation_gt"].tolist(), + ) + ] + + # run metrics before filtering + if strategies.get("metrics") is None: + raise ValueError( + "You must at least one metrics for passage_augmenter evaluation." 
+ ) + results = list( + map( + lambda x: evaluate_retrieval_node( + x, + metric_inputs, + strategies.get("metrics"), + ), + results, + ) + ) + + # save results to folder + save_dir = os.path.join(node_line_dir, "passage_augmenter") # node name + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filepaths = list( + map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + **{ + f"passage_augmenter_{metric}": list( + map(lambda result: result[metric].mean(), results) + ) + for metric in strategies.get("metrics") + }, + } + ) + + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, + strategies.get("metrics"), + filenames, + strategies.get("strategy", "mean"), + ) + # change metric name columns to passage_augmenter_metric_name + selected_result = selected_result.rename( + columns={ + metric_name: f"passage_augmenter_{metric_name}" + for metric_name in strategies["metrics"] + } + ) + # drop retrieval result columns in previous_result + previous_result = previous_result.drop( + columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add 'is_best' column to summary file + summary_df["is_best"] = summary_df["filename"] == selected_filename + + # save files + summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join( + save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + index=False, + ) + return best_result diff --git a/autorag-workspace/autorag/nodes/passagecompressor/__init__.py b/autorag-workspace/autorag/nodes/passagecompressor/__init__.py new file mode 100644 index 0000000..f57388f --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/__init__.py @@ -0,0 +1,4 @@ +from .longllmlingua import LongLLMLingua +from .pass_compressor import PassCompressor +from .refine import Refine +from .tree_summarize import TreeSummarize diff --git a/autorag-workspace/autorag/nodes/passagecompressor/base.py b/autorag-workspace/autorag/nodes/passagecompressor/base.py new file mode 100644 index 0000000..e50ffb5 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/base.py @@ -0,0 +1,83 @@ +import abc +import logging +from typing import Dict + +import pandas as pd +from llama_index.core.llms import LLM + +from autorag import generator_models +from autorag.schema import BaseModule +from autorag.utils import result_to_dataframe + +logger = logging.getLogger("AutoRAG") + + +class BasePassageCompressor(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: str, *args, **kwargs): + logger.info( + f"Initialize passage compressor node - {self.__class__.__name__} module..." + ) + + def __del__(self): + logger.info( + f"Deleting passage compressor node - {self.__class__.__name__} module..." 
+ ) + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info( + f"Running passage compressor node - {self.__class__.__name__} module..." + ) + assert all( + [ + column in previous_result.columns + for column in [ + "query", + "retrieved_contents", + ] + ] + ), "previous_result must have retrieved_contents, retrieved_ids, and retrieve_scores columns." + assert len(previous_result) > 0, "previous_result must have at least one row." + + queries = previous_result["query"].tolist() + retrieved_contents = previous_result["retrieved_contents"].tolist() + return queries, retrieved_contents + + +class LlamaIndexCompressor(BasePassageCompressor, metaclass=abc.ABCMeta): + param_list = ["prompt", "chat_prompt", "batch"] + + def __init__(self, project_dir: str, **kwargs): + """ + Initialize passage compressor module. + + :param project_dir: The project directory + :param llm: The llm name that will be used to summarize. + The LlamaIndex LLM model can be used in here. + :param kwargs: Extra parameter for init llm + """ + super().__init__(project_dir) + kwargs_dict = dict( + filter(lambda x: x[0] not in self.param_list, kwargs.items()) + ) + llm_name = kwargs_dict.pop("llm") + self.llm: LLM = make_llm(llm_name, kwargs_dict) + + def __del__(self): + del self.llm + super().__del__() + + @result_to_dataframe(["retrieved_contents"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, retrieved_contents = self.cast_to_run(previous_result) + param_dict = dict(filter(lambda x: x[0] in self.param_list, kwargs.items())) + result = self._pure(queries, retrieved_contents, **param_dict) + return list(map(lambda x: [x], result)) + + +def make_llm(llm_name: str, kwargs: Dict) -> LLM: + if llm_name not in generator_models: + raise KeyError( + f"{llm_name} is not supported. " + "You can add it manually by calling autorag.generator_models." + ) + return generator_models[llm_name](**kwargs) diff --git a/autorag-workspace/autorag/nodes/passagecompressor/longllmlingua.py b/autorag-workspace/autorag/nodes/passagecompressor/longllmlingua.py new file mode 100644 index 0000000..f3388d3 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/longllmlingua.py @@ -0,0 +1,115 @@ +from typing import List, Optional + +import pandas as pd + +from autorag.nodes.passagecompressor.base import BasePassageCompressor +from autorag.utils.util import pop_params, result_to_dataframe, empty_cuda_cache + + +# TODO: Parallel Processing Refactoring at #460 + + +class LongLLMLingua(BasePassageCompressor): + def __init__( + self, project_dir: str, model_name: str = "NousResearch/Llama-2-7b-hf", **kwargs + ): + try: + from llmlingua import PromptCompressor + except ImportError: + raise ImportError( + "LongLLMLingua is not installed. Please install it by running `pip install llmlingua`." 
+ ) + + super().__init__(project_dir) + model_init_params = pop_params(PromptCompressor.__init__, kwargs) + self.llm_lingua = PromptCompressor(model_name=model_name, **model_init_params) + + def __del__(self): + del self.llm_lingua + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, retrieved_contents = self.cast_to_run(previous_result) + results = self._pure(queries, retrieved_contents, **kwargs) + return list(map(lambda x: [x], results)) + + def _pure( + self, + queries: List[str], + contents: List[List[str]], + instructions: Optional[str] = None, + target_token: int = 300, + **kwargs, + ) -> List[str]: + """ + Compresses the retrieved texts using LongLLMLingua. + For more information, visit https://github.com/microsoft/LLMLingua. + + :param queries: The queries for retrieved passages. + :param contents: The contents of retrieved passages. + :param model_name: The model name to use for compression. + The default is "NousResearch/Llama-2-7b-hf". + :param instructions: The instructions for compression. + Default is None. When it is None, it will use default instructions. + :param target_token: The target token for compression. + Default is 300. + :param kwargs: Additional keyword arguments. + :return: The list of compressed texts. + """ + if instructions is None: + instructions = "Given the context, please answer the final question" + results = [ + llmlingua_pure( + query, contents_, self.llm_lingua, instructions, target_token, **kwargs + ) + for query, contents_ in zip(queries, contents) + ] + + return results + + +def llmlingua_pure( + query: str, + contents: List[str], + llm_lingua, + instructions: str, + target_token: int = 300, + **kwargs, +) -> str: + """ + Return the compressed text. + + :param query: The query for retrieved passages. + :param contents: The contents of retrieved passages. + :param llm_lingua: The llm instance, that will be used to compress. + :param instructions: The instructions for compression. + :param target_token: The target token for compression. + Default is 300. + :param kwargs: Additional keyword arguments. + :return: The compressed text. + """ + try: + from llmlingua import PromptCompressor + except ImportError: + raise ImportError( + "LongLLMLingua is not installed. Please install it by running `pip install llmlingua`." 
+ ) + # split by "\n\n" (recommended by LongLLMLingua authors) + new_context_texts = [c for context in contents for c in context.split("\n\n")] + compress_prompt_params = pop_params(PromptCompressor.compress_prompt, kwargs) + compressed_prompt = llm_lingua.compress_prompt( + new_context_texts, + question=query, + instruction=instructions, + rank_method="longllmlingua", + target_token=target_token, + **compress_prompt_params, + ) + compressed_prompt_txt = compressed_prompt["compressed_prompt"] + + # separate out the question and instruction + result = "\n\n".join(compressed_prompt_txt.split("\n\n")[1:-1]) + + return result diff --git a/autorag-workspace/autorag/nodes/passagecompressor/pass_compressor.py b/autorag-workspace/autorag/nodes/passagecompressor/pass_compressor.py new file mode 100644 index 0000000..f2d8920 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/pass_compressor.py @@ -0,0 +1,16 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.passagecompressor.base import BasePassageCompressor +from autorag.utils import result_to_dataframe + + +class PassCompressor(BasePassageCompressor): + @result_to_dataframe(["retrieved_contents"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + _, contents = self.cast_to_run(previous_result) + return self._pure(contents) + + def _pure(self, contents: List[List[str]]): + return contents diff --git a/autorag-workspace/autorag/nodes/passagecompressor/refine.py b/autorag-workspace/autorag/nodes/passagecompressor/refine.py new file mode 100644 index 0000000..92cd3b4 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/refine.py @@ -0,0 +1,54 @@ +from typing import List, Optional + +from llama_index.core import PromptTemplate +from llama_index.core.prompts import PromptType +from llama_index.core.prompts.utils import is_chat_model +from llama_index.core.response_synthesizers import Refine as rf + +from autorag.nodes.passagecompressor.base import LlamaIndexCompressor +from autorag.utils.util import get_event_loop, process_batch + + +class Refine(LlamaIndexCompressor): + def _pure( + self, + queries: List[str], + contents: List[List[str]], + prompt: Optional[str] = None, + chat_prompt: Optional[str] = None, + batch: int = 16, + ) -> List[str]: + """ + Refine a response to a query across text chunks. + This function is a wrapper for llama_index.response_synthesizers.Refine. + For more information, visit https://docs.llamaindex.ai/en/stable/examples/response_synthesizers/refine/. + + :param queries: The queries for retrieved passages. + :param contents: The contents of retrieved passages. + :param prompt: The prompt template for refine. + If you want to use chat prompt, you should pass chat_prompt instead. + At prompt, you must specify where to put 'context_msg' and 'query_str'. + Default is None. When it is None, it will use llama index default prompt. + :param chat_prompt: The chat prompt template for refine. + If you want to use normal prompt, you should pass prompt instead. + At prompt, you must specify where to put 'context_msg' and 'query_str'. + Default is None. When it is None, it will use llama index default chat prompt. + :param batch: The batch size for llm. + Set low if you face some errors. + Default is 16. + :return: The list of compressed texts. 
+ """ + if prompt is not None and not is_chat_model(self.llm): + refine_template = PromptTemplate(prompt, prompt_type=PromptType.REFINE) + elif chat_prompt is not None and is_chat_model(self.llm): + refine_template = PromptTemplate(chat_prompt, prompt_type=PromptType.REFINE) + else: + refine_template = None + summarizer = rf(llm=self.llm, refine_template=refine_template, verbose=True) + tasks = [ + summarizer.aget_response(query, content) + for query, content in zip(queries, contents) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch_size=batch)) + return results diff --git a/autorag-workspace/autorag/nodes/passagecompressor/run.py b/autorag-workspace/autorag/nodes/passagecompressor/run.py new file mode 100644 index 0000000..1391070 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/run.py @@ -0,0 +1,186 @@ +import os.path +import pathlib +from typing import List, Dict + +import pandas as pd + +from autorag.evaluation.metric import ( + retrieval_token_recall, + retrieval_token_precision, + retrieval_token_f1, +) +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.utils.util import fetch_contents + + +def run_passage_compressor_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among passage compressor modules. + + :param modules: Passage compressor modules to run. + :param module_params: Passage compressor module parameters. + :param previous_result: Previous result dataframe. + Could be retrieval, reranker modules result. + It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for passage compressor node. + In this node, we use + You can skip evaluation when you use only one module and a module parameter. + :return: The best result dataframe with previous result columns. + This node will replace 'retrieved_contents' to compressed passages, so its length will be one. + """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + data_dir = os.path.join(project_dir, "data") + save_dir = os.path.join(node_line_dir, "passage_compressor") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + # make retrieval contents gt + qa_data = pd.read_parquet(os.path.join(data_dir, "qa.parquet"), engine="pyarrow") + corpus_data = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + # check qa_data have retrieval_gt + assert all( + len(x[0]) > 0 for x in qa_data["retrieval_gt"].tolist() + ), "Can't use passage compressor if you don't have retrieval gt values in QA dataset." 
+ + # run modules + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + results = list(results) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + retrieval_gt_contents = list( + map(lambda x: fetch_contents(corpus_data, x), qa_data["retrieval_gt"].tolist()) + ) + + metric_inputs = [ + MetricInput(retrieval_gt_contents=ret_cont_gt) + for ret_cont_gt in retrieval_gt_contents + ] + + # run metrics before filtering + if strategies.get("metrics") is None: + raise ValueError( + "You must at least one metrics for retrieval contents evaluation." + "It can be 'retrieval_token_f1', 'retrieval_token_precision', 'retrieval_token_recall'." + ) + results = list( + map( + lambda x: evaluate_passage_compressor_node( + x, metric_inputs, strategies.get("metrics") + ), + results, + ) + ) + + # save results to folder + filepaths = list( + map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + # make summary file + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + **{ + f"passage_compressor_{metric}": list( + map(lambda result: result[metric].mean(), results) + ) + for metric in strategies.get("metrics") + }, + } + ) + + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, + strategies.get("metrics"), + filenames, + strategies.get("strategy", "mean"), + ) + new_retrieved_contents = selected_result["retrieved_contents"] + previous_result["retrieved_contents"] = new_retrieved_contents + selected_result = selected_result.drop(columns=["retrieved_contents"]) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add 'is_best' column to summary file + summary_df["is_best"] = summary_df["filename"] == selected_filename + + # add prefix 'passage_compressor' to best_result columns + best_result = best_result.rename( + columns={ + metric_name: f"passage_compressor_{metric_name}" + for metric_name in strategies.get("metrics") + } + ) + + # save the result files + best_result.to_parquet( + os.path.join( + save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + index=False, + ) + summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + return best_result + + +def evaluate_passage_compressor_node( + result_df: pd.DataFrame, metric_inputs: List[MetricInput], metrics: List[str] +): + metric_funcs = { + retrieval_token_recall.__name__: retrieval_token_recall, + retrieval_token_precision.__name__: retrieval_token_precision, + retrieval_token_f1.__name__: retrieval_token_f1, + } + for metric_input, generated_text in zip( + metric_inputs, result_df["retrieved_contents"].tolist() + ): + metric_input.retrieved_contents = generated_text + metrics = list(filter(lambda x: x in metric_funcs.keys(), metrics)) + if len(metrics) <= 0: + raise ValueError(f"metrics must be one of {metric_funcs.keys()}") + metrics_scores = dict( + map( + lambda metric: 
( + metric, + metric_funcs[metric]( + metric_inputs=metric_inputs, + ), + ), + metrics, + ) + ) + result_df = pd.concat([result_df, pd.DataFrame(metrics_scores)], axis=1) + return result_df diff --git a/autorag-workspace/autorag/nodes/passagecompressor/tree_summarize.py b/autorag-workspace/autorag/nodes/passagecompressor/tree_summarize.py new file mode 100644 index 0000000..15e4695 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagecompressor/tree_summarize.py @@ -0,0 +1,56 @@ +from typing import List, Optional + +from llama_index.core import PromptTemplate +from llama_index.core.prompts import PromptType +from llama_index.core.prompts.utils import is_chat_model +from llama_index.core.response_synthesizers import TreeSummarize as ts + +from autorag.nodes.passagecompressor.base import LlamaIndexCompressor +from autorag.utils.util import get_event_loop, process_batch + + +class TreeSummarize(LlamaIndexCompressor): + def _pure( + self, + queries: List[str], + contents: List[List[str]], + prompt: Optional[str] = None, + chat_prompt: Optional[str] = None, + batch: int = 16, + ) -> List[str]: + """ + Recursively merge retrieved texts and summarizes them in a bottom-up fashion. + This function is a wrapper for llama_index.response_synthesizers.TreeSummarize. + For more information, visit https://docs.llamaindex.ai/en/latest/examples/response_synthesizers/tree_summarize.html. + + :param queries: The queries for retrieved passages. + :param contents: The contents of retrieved passages. + :param prompt: The prompt template for summarization. + If you want to use chat prompt, you should pass chat_prompt instead. + At prompt, you must specify where to put 'context_str' and 'query_str'. + Default is None. When it is None, it will use llama index default prompt. + :param chat_prompt: The chat prompt template for summarization. + If you want to use normal prompt, you should pass prompt instead. + At prompt, you must specify where to put 'context_str' and 'query_str'. + Default is None. When it is None, it will use llama index default chat prompt. + :param batch: The batch size for llm. + Set low if you face some errors. + Default is 16. + :return: The list of compressed texts. 
+ """ + if prompt is not None and not is_chat_model(self.llm): + summary_template = PromptTemplate(prompt, prompt_type=PromptType.SUMMARY) + elif chat_prompt is not None and is_chat_model(self.llm): + summary_template = PromptTemplate( + chat_prompt, prompt_type=PromptType.SUMMARY + ) + else: + summary_template = None + summarizer = ts(llm=self.llm, summary_template=summary_template, use_async=True) + tasks = [ + summarizer.aget_response(query, content) + for query, content in zip(queries, contents) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch_size=batch)) + return results diff --git a/autorag-workspace/autorag/nodes/passagefilter/__init__.py b/autorag-workspace/autorag/nodes/passagefilter/__init__.py new file mode 100644 index 0000000..2f46dbf --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/__init__.py @@ -0,0 +1,6 @@ +from .pass_passage_filter import PassPassageFilter +from .percentile_cutoff import PercentileCutoff +from .recency import RecencyFilter +from .similarity_percentile_cutoff import SimilarityPercentileCutoff +from .similarity_threshold_cutoff import SimilarityThresholdCutoff +from .threshold_cutoff import ThresholdCutoff diff --git a/autorag-workspace/autorag/nodes/passagefilter/base.py b/autorag-workspace/autorag/nodes/passagefilter/base.py new file mode 100644 index 0000000..8efe697 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/base.py @@ -0,0 +1,50 @@ +import abc +import logging +from pathlib import Path +from typing import Union + +import pandas as pd + +from autorag.schema.base import BaseModule +from autorag.utils import validate_qa_dataset + +logger = logging.getLogger("AutoRAG") + + +class BasePassageFilter(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + logger.info(f"Initialize passage filter node - {self.__class__.__name__}") + + def __del__(self): + logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.") + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info( + f"Running passage filter node - {self.__class__.__name__} module..." + ) + validate_qa_dataset(previous_result) + + # find queries columns + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." + queries = previous_result["query"].tolist() + + # find contents_list columns + assert ( + "retrieved_contents" in previous_result.columns + ), "previous_result must have retrieved_contents column." + contents = previous_result["retrieved_contents"].tolist() + + # find scores columns + assert ( + "retrieve_scores" in previous_result.columns + ), "previous_result must have retrieve_scores column." + scores = previous_result["retrieve_scores"].tolist() + + # find ids columns + assert ( + "retrieved_ids" in previous_result.columns + ), "previous_result must have retrieved_ids column." 
+ ids = previous_result["retrieved_ids"].tolist() + return queries, contents, scores, ids diff --git a/autorag-workspace/autorag/nodes/passagefilter/pass_passage_filter.py b/autorag-workspace/autorag/nodes/passagefilter/pass_passage_filter.py new file mode 100644 index 0000000..1cb2647 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/pass_passage_filter.py @@ -0,0 +1,14 @@ +import pandas as pd + +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.utils import result_to_dataframe + + +class PassPassageFilter(BasePassageFilter): + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + _, contents, scores, ids = self.cast_to_run(previous_result) + return contents, ids, scores + + def _pure(self, *args, **kwargs): + pass diff --git a/autorag-workspace/autorag/nodes/passagefilter/percentile_cutoff.py b/autorag-workspace/autorag/nodes/passagefilter/percentile_cutoff.py new file mode 100644 index 0000000..7fb95a6 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/percentile_cutoff.py @@ -0,0 +1,58 @@ +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.utils.util import sort_by_scores, select_top_k, result_to_dataframe + + +class PercentileCutoff(BasePassageFilter): + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + return self._pure(queries, contents, scores, ids, *args, **kwargs) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + percentile: float, + reverse: bool = False, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Filter out the contents that are below the content's length times percentile. + If This is a filter and does not override scores. + If the value of content's length times percentile is less than 1, keep the only one highest similarity content. + + :param queries: The list of queries to use for filtering + :param contents_list: The list of lists of contents to filter + :param scores_list: The list of lists of scores retrieved + :param ids_list: The list of lists of ids retrieved + :param percentile: The percentile to cut off + :param reverse: If True, the lower the score, the better + Default is False. 
+ :return: Tuple of lists containing the filtered contents, ids, and scores + """ + num_top_k = max(1, int(len(scores_list[0]) * percentile)) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": scores_list, + } + ) + + reverse = not reverse + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand", reverse=reverse + ) + results = select_top_k(df, ["contents", "ids", "scores"], num_top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) diff --git a/autorag-workspace/autorag/nodes/passagefilter/recency.py b/autorag-workspace/autorag/nodes/passagefilter/recency.py new file mode 100644 index 0000000..22b1ba4 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/recency.py @@ -0,0 +1,105 @@ +import logging +import os +from datetime import datetime, date +from pathlib import Path +from typing import List, Tuple, Union + +import pandas as pd + +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.utils import fetch_contents, result_to_dataframe + +logger = logging.getLogger("AutoRAG") + + +class RecencyFilter(BasePassageFilter): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + super().__init__(project_dir, *args, **kwargs) + self.corpus_df = pd.read_parquet( + os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow" + ) + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + _, contents, scores, ids = self.cast_to_run(previous_result, *args, **kwargs) + metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata") + times = [ + [time["last_modified_datetime"] for time in time_list] + for time_list in metadatas + ] + return self._pure(contents, scores, ids, times, *args, **kwargs) + + def _pure( + self, + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + time_list: List[List[datetime]], + threshold_datetime: Union[datetime, date], + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Filter out the contents that are below the threshold datetime. + If all contents are filtered, keep the only one recency content. + If the threshold date format is incorrect, return the original contents. + + :param contents_list: The list of lists of contents to filter + :param scores_list: The list of lists of scores retrieved + :param ids_list: The list of lists of ids retrieved + :param time_list: The list of lists of datetime retrieved + :param threshold_datetime: The threshold to cut off. + In recency filter, you have to use the datetime.datetime object or datetime.date object. + All you need to do is to set the date at your YAML file. + For example, you can write "2010-09-09 3:45:06" or "2010-09-09" in the YAML file. 
+ :return: Tuple of lists containing the filtered contents, ids, and scores + """ + if not ( + isinstance(threshold_datetime, datetime) + or isinstance(threshold_datetime, date) + ): + raise ValueError( + f"Threshold should be a datetime object, but got {type(threshold_datetime)}" + ) + + if not isinstance(threshold_datetime, datetime): + threshold_datetime = datetime.combine( + threshold_datetime, datetime.min.time() + ) + + time_list = [ + list( + map( + lambda t: datetime.combine(t, datetime.min.time()) + if not isinstance(t, datetime) + else t, + time, + ) + ) + for time in time_list + ] + + def sort_row(contents, scores, ids, time, _datetime_threshold): + combined = list(zip(contents, scores, ids, time)) + combined_filtered = [ + item for item in combined if item[3] >= _datetime_threshold + ] + + if combined_filtered: + remain_contents, remain_scores, remain_ids, _ = zip(*combined_filtered) + else: + combined.sort(key=lambda x: x[3], reverse=True) + remain_contents, remain_scores, remain_ids, _ = zip(*combined[:1]) + + return list(remain_contents), list(remain_ids), list(remain_scores) + + remain_contents_list, remain_ids_list, remain_scores_list = zip( + *map( + sort_row, + contents_list, + scores_list, + ids_list, + time_list, + [threshold_datetime] * len(contents_list), + ) + ) + + return remain_contents_list, remain_ids_list, remain_scores_list diff --git a/autorag-workspace/autorag/nodes/passagefilter/run.py b/autorag-workspace/autorag/nodes/passagefilter/run.py new file mode 100644 index 0000000..b557996 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/run.py @@ -0,0 +1,138 @@ +import os +import pathlib +from typing import List, Dict + +import pandas as pd + +from autorag.nodes.retrieval.run import evaluate_retrieval_node +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.utils.util import to_list, apply_recursive + + +def run_passage_filter_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among passage filter node results. + + :param modules: Passage filter modules to run. + :param module_params: Passage filter module parameters. + :param previous_result: Previous result dataframe. + Could be retrieval, reranker, passage filter modules result. + It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for passage filter node. + In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'. + You can skip evaluation when you use only one module and a module parameter. + :return: The best result dataframe with previous result columns. 
+ """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + qa_df = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + retrieval_gt = qa_df["retrieval_gt"].tolist() + retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt)) + + # make rows to metric_inputs + metric_inputs = [ + MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt) + for ret_gt, query, gen_gt in zip( + retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist() + ) + ] + + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # run metrics before filtering + if strategies.get("metrics") is None: + raise ValueError("You must at least one metrics for passage_filter evaluation.") + results = list( + map( + lambda x: evaluate_retrieval_node( + x, + metric_inputs, + strategies.get("metrics"), + ), + results, + ) + ) + + # save results to folder + save_dir = os.path.join(node_line_dir, "passage_filter") # node name + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filepaths = list( + map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + **{ + f"passage_filter_{metric}": list( + map(lambda result: result[metric].mean(), results) + ) + for metric in strategies.get("metrics") + }, + } + ) + + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, + strategies.get("metrics"), + filenames, + strategies.get("strategy", "mean"), + ) + selected_result = selected_result.rename( + columns={ + metric_name: f"passage_filter_{metric_name}" + for metric_name in strategies["metrics"] + } + ) + previous_result = previous_result.drop( + columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add 'is_best' column to summary file + summary_df["is_best"] = summary_df["filename"] == selected_filename + + # save files + summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join( + save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + index=False, + ) + return best_result diff --git a/autorag-workspace/autorag/nodes/passagefilter/similarity_percentile_cutoff.py b/autorag-workspace/autorag/nodes/passagefilter/similarity_percentile_cutoff.py new file mode 100644 index 0000000..fdc122d --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/similarity_percentile_cutoff.py @@ -0,0 +1,134 @@ +from pathlib import Path +from typing import List, Tuple, Union + +import numpy as np +import pandas as pd + +from autorag.embedding.base import EmbeddingModel +from autorag.evaluation.metric.util import 
calculate_cosine_similarity +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.nodes.passagefilter.similarity_threshold_cutoff import ( + embedding_query_content, +) +from autorag.utils import result_to_dataframe +from autorag.utils.util import empty_cuda_cache, pop_params + + +class SimilarityPercentileCutoff(BasePassageFilter): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + """ + Initialize the SimilarityPercentileCutoff module + + :param project_dir: The project directory to use for initializing the module + :param embedding_model: The embedding model string to use for calculating similarity + Default is "openai" which is OpenAI text-embedding-ada-002 embedding model. + """ + super().__init__(project_dir, *args, **kwargs) + embedding_model = kwargs.pop("embedding_model", "openai") + self.embedding_model = EmbeddingModel.load(embedding_model)() + + def __del__(self): + super().__del__() + del self.embedding_model + + empty_cuda_cache() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + kwargs = pop_params(self._pure, kwargs) + return self._pure(queries, contents, scores, ids, **kwargs) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + percentile: float, + batch: int = 128, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Re-calculate each content's similarity with the query and filter out the contents that are below the content's + length times percentile. If This is a filter and does not override scores. The output of scores is not coming from + query-content similarity. + If the value of content's length times percentile is less than 1, keep the only one highest similarity content. + + :param queries: The list of queries to use for filtering + :param contents_list: The list of lists of contents to filter + :param scores_list: The list of lists of scores retrieved + :param ids_list: The list of lists of ids retrieved + :param percentile: The percentile to cut off + :param batch: The number of queries to be processed in a batch + Default is 128. 
+ :return: Tuple of lists containing the filtered contents, ids, and scores + """ + query_embeddings, content_embeddings = embedding_query_content( + queries, contents_list, self.embedding_model, batch + ) + + results = list( + map( + lambda x: self.__row_pure(x[0], x[1], x[2], x[3], x[4], percentile), + zip( + query_embeddings, + content_embeddings, + contents_list, + ids_list, + scores_list, + ), + ) + ) + + remain_content_list = list(map(lambda x: x[0], results)) + remain_ids_list = list(map(lambda x: x[1], results)) + remain_scores_list = list(map(lambda x: x[2], results)) + + return remain_content_list, remain_ids_list, remain_scores_list + + @staticmethod + def __row_pure( + query_embedding: str, + content_embeddings: List[List[float]], + content_list: List[str], + ids_list: List[str], + scores_list: List[float], + percentile: float, + ) -> Tuple[List[str], List[str], List[float]]: + """ + Return tuple of lists containing the filtered contents, ids, and scores + + :param query_embedding: Query embedding + :param content_embeddings: Each content embedding + :param content_list: Each content + :param ids_list: Each id + :param scores_list: Each score + :param percentile: The percentile to cut off + :return: Tuple of lists containing the filtered contents, ids, and scores + """ + num_top_k = int(len(content_embeddings) * percentile) + + if num_top_k == 0: + num_top_k = 1 + + similarities = np.array( + list( + map( + lambda x: calculate_cosine_similarity(query_embedding, x), + content_embeddings, + ) + ) + ).tolist() + + content_id_score_similarity = list( + zip(ids_list, content_list, scores_list, similarities) + ) + + sorted_content_id_score_similarity = sorted( + content_id_score_similarity, key=lambda x: x[3], reverse=True + )[:num_top_k] + + content_result, id_result, score_result, _ = zip( + *sorted_content_id_score_similarity + ) + return list(content_result), list(id_result), list(score_result) diff --git a/autorag-workspace/autorag/nodes/passagefilter/similarity_threshold_cutoff.py b/autorag-workspace/autorag/nodes/passagefilter/similarity_threshold_cutoff.py new file mode 100644 index 0000000..ff73426 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/similarity_threshold_cutoff.py @@ -0,0 +1,112 @@ +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from autorag.embedding.base import EmbeddingModel +from autorag.evaluation.metric.util import calculate_cosine_similarity +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.utils.util import ( + embedding_query_content, + empty_cuda_cache, + result_to_dataframe, + pop_params, +) + + +class SimilarityThresholdCutoff(BasePassageFilter): + def __init__(self, project_dir: str, *args, **kwargs): + """ + Initialize the SimilarityThresholdCutoff module + + :param project_dir: The project directory to use for initializing the module + :param embedding_model: The embedding model string to use for calculating similarity + Default is "openai" which is OpenAI text-embedding-ada-002 embedding model. 
+ """ + super().__init__(project_dir, *args, **kwargs) + embedding_model = kwargs.get("embedding_model", "openai") + self.embedding_model = EmbeddingModel.load(embedding_model)() + + def __del__(self): + del self.embedding_model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + kwargs = pop_params(self._pure, kwargs) + queries, contents, scores, ids = self.cast_to_run(previous_result) + return self._pure(queries, contents, scores, ids, *args, **kwargs) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + threshold: float, + batch: int = 128, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Re-calculate each content's similarity with the query and filter out the contents that are below the threshold. + If all contents are filtered, keep the only one highest similarity content. + This is a filter and does not override scores. + The output of scores is not coming from query-content similarity. + + :param queries: The list of queries to use for filtering + :param contents_list: The list of lists of contents to filter + :param scores_list: The list of lists of scores retrieved + :param ids_list: The list of lists of ids retrieved + :param threshold: The threshold to cut off + :param batch: The number of queries to be processed in a batch + Default is 128. + :return: Tuple of lists containing the filtered contents, ids, and scores + """ + query_embeddings, content_embeddings = embedding_query_content( + queries, contents_list, self.embedding_model, batch + ) + + remain_indices = list( + map( + lambda x: self.__row_pure(x[0], x[1], threshold), + zip(query_embeddings, content_embeddings), + ) + ) + + remain_content_list = list( + map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices) + ) + remain_scores_list = list( + map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices) + ) + remain_ids_list = list( + map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices) + ) + return remain_content_list, remain_ids_list, remain_scores_list + + @staticmethod + def __row_pure( + query_embedding: str, content_embeddings: List[List[float]], threshold: float + ) -> List[int]: + """ + Return indices that have to remain. + Return at least one index if there is nothing to remain. 
+ + :param query_embedding: Query embedding + :param content_embeddings: Each content embedding + :param threshold: The threshold to cut off + :return: Indices to remain at the contents + """ + + similarities = np.array( + list( + map( + lambda x: calculate_cosine_similarity(query_embedding, x), + content_embeddings, + ) + ) + ) + result = np.where(similarities >= threshold)[0].tolist() + if len(result) > 0: + return result + return [np.argmax(similarities)] diff --git a/autorag-workspace/autorag/nodes/passagefilter/threshold_cutoff.py b/autorag-workspace/autorag/nodes/passagefilter/threshold_cutoff.py new file mode 100644 index 0000000..1f62129 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagefilter/threshold_cutoff.py @@ -0,0 +1,78 @@ +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagefilter.base import BasePassageFilter +from autorag.utils.util import convert_inputs_to_list, result_to_dataframe + + +class ThresholdCutoff(BasePassageFilter): + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + _, contents, scores, ids = self.cast_to_run(previous_result) + return self._pure(contents, scores, ids, *args, **kwargs) + + def _pure( + self, + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + threshold: float, + reverse: bool = False, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Filters the contents, scores, and ids based on a previous result's score. + Keeps at least one item per query if all scores are below the threshold. + + :param contents_list: List of content strings for each query. + :param scores_list: List of scores for each content. + :param ids_list: List of ids for each content. + :param threshold: The minimum score to keep an item. + :param reverse: If True, the lower the score, the better. + Default is False. + :return: Filtered lists of contents, ids, and scores. + """ + remain_indices = list( + map(lambda x: self.__row_pure(x, threshold, reverse), scores_list) + ) + + remain_content_list = list( + map(lambda c, idx: [c[i] for i in idx], contents_list, remain_indices) + ) + remain_scores_list = list( + map(lambda s, idx: [s[i] for i in idx], scores_list, remain_indices) + ) + remain_ids_list = list( + map(lambda _id, idx: [_id[i] for i in idx], ids_list, remain_indices) + ) + + return remain_content_list, remain_ids_list, remain_scores_list + + @convert_inputs_to_list + def __row_pure( + self, scores_list: List[float], threshold: float, reverse: bool = False + ) -> List[int]: + """ + Return indices that have to remain. + Return at least one index if there is nothing to remain. + + :param scores_list: Each score + :param threshold: The threshold to cut off + :param reverse: If True, the lower the score, the better + Default is False. + :return: Indices to remain at the contents + """ + assert isinstance(scores_list, list), "scores_list must be a list." 
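Both cutoff nodes in this hunk share the same fallback rule: keep every passage that clears the threshold, and if none does, keep the single best one so a query is never left empty. A self-contained sketch of that rule, where the reverse flag treats lower scores as better, mirroring the parameter documented above:

def threshold_indices(scores, threshold, reverse=False):
    if reverse:  # lower score is better
        kept = [i for i, s in enumerate(scores) if s <= threshold]
        fallback = scores.index(min(scores))
    else:  # higher score is better
        kept = [i for i, s in enumerate(scores) if s >= threshold]
        fallback = scores.index(max(scores))
    return kept if kept else [fallback]

print(threshold_indices([0.2, 0.8, 0.5], threshold=0.9))  # -> [1], fallback to the best item
print(threshold_indices([0.2, 0.8, 0.5], threshold=0.5))  # -> [1, 2]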
+ + if reverse: + remain_indices = [ + i for i, score in enumerate(scores_list) if score <= threshold + ] + default_index = scores_list.index(min(scores_list)) + else: + remain_indices = [ + i for i, score in enumerate(scores_list) if score >= threshold + ] + default_index = scores_list.index(max(scores_list)) + + return remain_indices if remain_indices else [default_index] diff --git a/autorag-workspace/autorag/nodes/passagereranker/__init__.py b/autorag-workspace/autorag/nodes/passagereranker/__init__.py new file mode 100644 index 0000000..b6299b4 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/__init__.py @@ -0,0 +1,18 @@ +from .cohere import CohereReranker +from .colbert import ColbertReranker +from .flag_embedding import FlagEmbeddingReranker +from .flag_embedding_llm import FlagEmbeddingLLMReranker +from .jina import JinaReranker +from .koreranker import KoReranker +from .monot5 import MonoT5 +from .pass_reranker import PassReranker +from .rankgpt import RankGPT +from .sentence_transformer import SentenceTransformerReranker +from .time_reranker import TimeReranker +from .upr import Upr +from .openvino import OpenVINOReranker +from .voyageai import VoyageAIReranker +from .mixedbreadai import MixedbreadAIReranker +from .flashrank import FlashRankReranker + +from .dragonkue2 import DragonKue2 # 250313 추가 - 김용연 \ No newline at end of file diff --git a/autorag-workspace/autorag/nodes/passagereranker/base.py b/autorag-workspace/autorag/nodes/passagereranker/base.py new file mode 100644 index 0000000..17a5cd2 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/base.py @@ -0,0 +1,55 @@ +import abc +import logging +from pathlib import Path +from typing import Union + +import pandas as pd + +from autorag.schema import BaseModule +from autorag.utils import validate_qa_dataset + +logger = logging.getLogger("AutoRAG") + + +class BasePassageReranker(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + logger.info( + f"Initialize passage reranker node - {self.__class__.__name__} module..." + ) + + def __del__(self): + logger.info( + f"Deleting passage reranker node - {self.__class__.__name__} module..." + ) + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info( + f"Running passage reranker node - {self.__class__.__name__} module..." + ) + validate_qa_dataset(previous_result) + + # find queries columns + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." + queries = previous_result["query"].tolist() + + # find contents_list columns + assert ( + "retrieved_contents" in previous_result.columns + ), "previous_result must have retrieved_contents column." + contents = previous_result["retrieved_contents"].tolist() + + # find scores columns + assert ( + "retrieve_scores" in previous_result.columns + ), "previous_result must have retrieve_scores column." + scores = previous_result["retrieve_scores"].tolist() + + # find ids columns + assert ( + "retrieved_ids" in previous_result.columns + ), "previous_result must have retrieved_ids column." 
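The asserts above define the minimum input schema a reranker reads from previous_result: query, retrieved_contents, retrieved_ids, and retrieve_scores. A toy frame with those columns (hypothetical values; validate_qa_dataset may additionally require the usual QA-dataset columns such as qid) would look like:

import pandas as pd

previous_result = pd.DataFrame(
    {
        "query": ["what does a passage reranker do?"],
        "retrieved_contents": [["passage a", "passage b"]],
        "retrieved_ids": [["doc-1", "doc-2"]],
        "retrieve_scores": [[0.71, 0.42]],
    }
)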
+ ids = previous_result["retrieved_ids"].tolist() + + return queries, contents, scores, ids diff --git a/autorag-workspace/autorag/nodes/passagereranker/cohere.py b/autorag-workspace/autorag/nodes/passagereranker/cohere.py new file mode 100644 index 0000000..34ecc90 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/cohere.py @@ -0,0 +1,119 @@ +import os +from typing import List, Tuple + +import cohere +import pandas as pd +from cohere import RerankResponseResultsItem + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe + + +class CohereReranker(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + """ + Initialize Cohere rerank node. + + :param project_dir: The project directory path. + :param api_key: The API key for Cohere rerank. + You can set it in the environment variable COHERE_API_KEY. + Or, you can directly set it on the config YAML file using this parameter. + Default is env variable "COHERE_API_KEY". + :param kwargs: Extra arguments that are not affected + """ + super().__init__(project_dir) + api_key = kwargs.pop("api_key", None) + api_key = os.getenv("COHERE_API_KEY", None) if api_key is None else api_key + if api_key is None: + api_key = os.getenv("CO_API_KEY", None) + if api_key is None: + raise KeyError( + "Please set the API key for Cohere rerank in the environment variable COHERE_API_KEY " + "or directly set it on the config YAML file." + ) + + self.cohere_client = cohere.AsyncClientV2(api_key=api_key) + + def __del__(self): + del self.cohere_client + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + model = kwargs.pop("model", "rerank-v3.5") + return self._pure(queries, contents, scores, ids, top_k, batch, model) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + model: str = "rerank-v3.5", + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with Cohere rerank models. + You can get the API key from https://cohere.com/rerank and set it in the environment variable COHERE_API_KEY. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param scores_list: The list of lists of scores retrieved from the initial ranking + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + :param model: The model name for Cohere rerank. + You can choose between "rerank-v3.5", "rerank-english-v3.0", and "rerank-multilingual-v3.0". + Default is "rerank-v3.5". 
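Outside of this module, the same Cohere v2 rerank call that cohere_rerank_pure makes below can be exercised directly. A minimal sketch, assuming COHERE_API_KEY is set and a cohere SDK version that provides AsyncClientV2 is installed:

import asyncio
import os

import cohere


async def main():
    client = cohere.AsyncClientV2(api_key=os.environ["COHERE_API_KEY"])
    resp = await client.rerank(
        model="rerank-v3.5",
        query="what is a passage reranker?",
        documents=["a passage about rerankers", "a passage about retrievers"],
        top_n=1,
        return_documents=False,
    )
    for item in resp.results:
        print(item.index, item.relevance_score)


asyncio.run(main())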
+ :return: Tuple of lists containing the reranked contents, ids, and scores + """ + # Run async cohere_rerank_pure function + tasks = [ + cohere_rerank_pure(self.cohere_client, model, query, document, ids, top_k) + for query, document, ids in zip(queries, contents_list, ids_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch_size=batch)) + content_result = list(map(lambda x: x[0], results)) + id_result = list(map(lambda x: x[1], results)) + score_result = list(map(lambda x: x[2], results)) + + return content_result, id_result, score_result + + +async def cohere_rerank_pure( + cohere_client: cohere.AsyncClient, + model: str, + query: str, + documents: List[str], + ids: List[str], + top_k: int, +) -> Tuple[List[str], List[str], List[float]]: + """ + Rerank a list of contents with Cohere rerank models. + + :param cohere_client: The Cohere AsyncClient to use for reranking + :param model: The model name for Cohere rerank + :param query: The query to use for reranking + :param documents: The list of contents to rerank + :param ids: The list of ids corresponding to the documents + :param top_k: The number of passages to be retrieved + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + rerank_results = await cohere_client.rerank( + model=model, + query=query, + documents=documents, + top_n=top_k, + return_documents=False, + ) + results: List[RerankResponseResultsItem] = rerank_results.results + reranked_scores: List[float] = list(map(lambda x: x.relevance_score, results)) + indices = list(map(lambda x: x.index, results)) + reranked_contents: List[str] = list(map(lambda i: documents[i], indices)) + reranked_ids: List[str] = list(map(lambda i: ids[i], indices)) + return reranked_contents, reranked_ids, reranked_scores diff --git a/autorag-workspace/autorag/nodes/passagereranker/colbert.py b/autorag-workspace/autorag/nodes/passagereranker/colbert.py new file mode 100644 index 0000000..ecf3fc7 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/colbert.py @@ -0,0 +1,213 @@ +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + flatten_apply, + sort_by_scores, + select_top_k, + pop_params, + result_to_dataframe, + empty_cuda_cache, +) + + +class ColbertReranker(BasePassageReranker): + def __init__( + self, + project_dir: str, + model_name: str = "colbert-ir/colbertv2.0", + *args, + **kwargs, + ): + """ + Initialize a colbert rerank model for reranking. + + :param project_dir: The project directory + :param model_name: The model name for Colbert rerank. + You can choose a colbert model for reranking. + The default is "colbert-ir/colbertv2.0". + :param kwargs: Extra parameter for the model. + """ + super().__init__(project_dir) + try: + import torch + from transformers import AutoModel, AutoTokenizer + except ImportError: + raise ImportError( + "Pytorch is not installed. Please install pytorch to use Colbert reranker." 
+ ) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + model_params = pop_params(AutoModel.from_pretrained, kwargs) + self.model = AutoModel.from_pretrained(model_name, **model_params).to( + self.device + ) + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + + def __del__(self): + del self.model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with Colbert rerank models. + You can get more information about a Colbert model at https://huggingface.co/colbert-ir/colbertv2.0. + It uses BERT-based model, so recommend using CUDA gpu for faster reranking. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + Default is 64. + + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + + # get query and content embeddings + query_embedding_list = get_colbert_embedding_batch( + queries, self.model, self.tokenizer, batch + ) + content_embedding_list = flatten_apply( + get_colbert_embedding_batch, + contents_list, + model=self.model, + tokenizer=self.tokenizer, + batch_size=batch, + ) + df = pd.DataFrame( + { + "ids": ids_list, + "query_embedding": query_embedding_list, + "contents": contents_list, + "content_embedding": content_embedding_list, + } + ) + temp_df = df.explode("content_embedding") + temp_df["score"] = temp_df.apply( + lambda x: get_colbert_score(x["query_embedding"], x["content_embedding"]), + axis=1, + ) + df["scores"] = ( + temp_df.groupby(level=0, sort=False)["score"].apply(list).tolist() + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def get_colbert_embedding_batch( + input_strings: List[str], model, tokenizer, batch_size: int +) -> List[np.array]: + try: + import torch + except ImportError: + raise ImportError( + "Pytorch is not installed. Please install pytorch to use Colbert reranker." 
+ ) + encoding = tokenizer( + input_strings, + return_tensors="pt", + padding=True, + truncation=True, + max_length=model.config.max_position_embeddings, + ) + + input_batches = slice_tokenizer_result(encoding, batch_size) + result_embedding = [] + with torch.no_grad(): + for encoding_batch in input_batches: + result_embedding.append(model(**encoding_batch).last_hidden_state) + total_tensor = torch.cat( + result_embedding, dim=0 + ) # shape [batch_size, token_length, embedding_dim] + tensor_results = list(total_tensor.chunk(total_tensor.size()[0])) + + if torch.cuda.is_available(): + return list(map(lambda x: x.detach().cpu().numpy(), tensor_results)) + else: + return list(map(lambda x: x.detach().numpy(), tensor_results)) + + +def slice_tokenizer_result(tokenizer_output, batch_size): + input_ids_batches = slice_tensor(tokenizer_output["input_ids"], batch_size) + attention_mask_batches = slice_tensor( + tokenizer_output["attention_mask"], batch_size + ) + token_type_ids_batches = slice_tensor( + tokenizer_output.get("token_type_ids", None), batch_size + ) + return [ + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + for input_ids, attention_mask, token_type_ids in zip( + input_ids_batches, attention_mask_batches, token_type_ids_batches + ) + ] + + +def slice_tensor(input_tensor, batch_size): + try: + import torch + except ImportError: + raise ImportError( + "Pytorch is not installed. Please install pytorch to use Colbert reranker." + ) + # Calculate the number of full batches + num_full_batches = input_tensor.size(0) // batch_size + + # Slice the tensor into batches + tensor_list = [ + input_tensor[i * batch_size : (i + 1) * batch_size] + for i in range(num_full_batches) + ] + + # Handle the last batch if it's smaller than batch_size + remainder = input_tensor.size(0) % batch_size + if remainder: + tensor_list.append(input_tensor[-remainder:]) + + device = "cuda" if torch.cuda.is_available() else "cpu" + tensor_list = list(map(lambda x: x.to(device), tensor_list)) + + return tensor_list + + +def get_colbert_score(query_embedding: np.array, content_embedding: np.array) -> float: + if query_embedding.ndim == 3 and content_embedding.ndim == 3: + query_embedding = query_embedding.reshape(-1, query_embedding.shape[-1]) + content_embedding = content_embedding.reshape(-1, content_embedding.shape[-1]) + + sim_matrix = np.dot(query_embedding, content_embedding.T) / ( + np.linalg.norm(query_embedding, axis=1)[:, np.newaxis] + * np.linalg.norm(content_embedding, axis=1) + ) + max_sim_scores = np.max(sim_matrix, axis=1) + return float(np.mean(max_sim_scores)) diff --git a/autorag-workspace/autorag/nodes/passagereranker/dragonkue2.py b/autorag-workspace/autorag/nodes/passagereranker/dragonkue2.py new file mode 100644 index 0000000..665b406 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/dragonkue2.py @@ -0,0 +1,138 @@ +# 250313 reranker module_type 추가 - 김용연 + +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + result_to_dataframe, + empty_cuda_cache, +) + + +class DragonKue2(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir) + try: + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + except ImportError: + raise ImportError("For using 
dragonkue2, please install torch first.")
+
+        model_path = "dragonkue/bge-reranker-v2-m3-ko"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
+        self.model.eval()
+        # Determine the device to run the model on (GPU if available, otherwise CPU)
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(self.device)
+
+    def __del__(self):
+        del self.model
+        empty_cuda_cache()
+        super().__del__()
+
+    @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
+    def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
+        queries, contents, _, ids = self.cast_to_run(previous_result)
+        top_k = kwargs.pop("top_k")
+        batch = kwargs.pop("batch", 64)
+        return self._pure(queries, contents, ids, top_k, batch)
+
+    def _pure(
+        self,
+        queries: List[str],
+        contents_list: List[List[str]],
+        ids_list: List[List[str]],
+        top_k: int,
+        batch: int = 64,
+    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
+        """
+        Rerank a list of contents based on their relevance to a query using dragonkue/bge-reranker-v2-m3-ko.
+        bge-reranker-v2-m3-ko is a Korean-language reranker (https://huggingface.co/dragonkue/bge-reranker-v2-m3-ko).
+
+        :param queries: The list of queries to use for reranking
+        :param contents_list: The list of lists of contents to rerank
+        :param ids_list: The list of lists of ids retrieved from the initial ranking
+        :param top_k: The number of passages to be retrieved
+        :param batch: The number of queries to be processed in a batch
+            Default is 64.
+        :return: Tuple of lists containing the reranked contents, ids, and scores
+        """
+        nested_list = [
+            list(map(lambda x: [query, x], content_list))
+            for query, content_list in zip(queries, contents_list)
+        ]
+        scores_nps = flatten_apply(
+            dragonku2_run_model,
+            nested_list,
+            model=self.model,
+            batch_size=batch,
+            tokenizer=self.tokenizer,
+            device=self.device,
+        )
+
+        rerank_scores = list(
+            map(
+                lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps
+            )
+        )
+
+        df = pd.DataFrame(
+            {
+                "contents": contents_list,
+                "ids": ids_list,
+                "scores": rerank_scores,
+            }
+        )
+        df[["contents", "ids", "scores"]] = df.apply(
+            sort_by_scores, axis=1, result_type="expand"
+        )
+        results = select_top_k(df, ["contents", "ids", "scores"], top_k)
+
+        return (
+            results["contents"].tolist(),
+            results["ids"].tolist(),
+            results["scores"].tolist(),
+        )
+
+
+def dragonku2_run_model(input_texts, model, tokenizer, device, batch_size: int):  # Added 250313 - 김용연
+    try:
+        import torch
+    except ImportError:
+        raise ImportError("For using dragonkue2, please install torch first.")
+    batch_input_texts = make_batch(input_texts, batch_size)
+    results = []
+    for batch_texts in batch_input_texts:
+        inputs = tokenizer(
+            batch_texts,
+            padding=True,
+            truncation=True,
+            return_tensors="pt",
+            max_length=512,
+        )
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            scores = (
+                model(**inputs, return_dict=True)
+                .logits.view(
+                    -1,
+                )
+                .float()
+            )
+        scores_np = scores.cpu().numpy()
+        results.extend(scores_np)
+    return results
+
+
+def exp_normalize(x):
+    b = x.max()
+    y = np.exp(x - b)
+    return y / y.sum()
diff --git a/autorag-workspace/autorag/nodes/passagereranker/flag_embedding.py b/autorag-workspace/autorag/nodes/passagereranker/flag_embedding.py
new file mode 100644
index 0000000..09c9cc9
--- /dev/null
+++ b/autorag-workspace/autorag/nodes/passagereranker/flag_embedding.py
@@ -0,0 +1,112 @@
+from typing import List,
Tuple, Iterable + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + pop_params, + result_to_dataframe, + empty_cuda_cache, +) + + +class FlagEmbeddingReranker(BasePassageReranker): + def __init__( + self, project_dir, model_name: str = "BAAI/bge-reranker-large", *args, **kwargs + ): + """ + Initialize the FlagEmbeddingReranker module. + + :param project_dir: The project directory. + :param model_name: The name of the BAAI Reranker normal-model name. + Default is "BAAI/bge-reranker-large" + :param kwargs: Extra parameter for FlagEmbedding.FlagReranker + """ + super().__init__(project_dir) + try: + from FlagEmbedding import FlagReranker + except ImportError: + raise ImportError( + "FlagEmbeddingReranker requires the 'FlagEmbedding' package to be installed." + ) + model_params = pop_params(FlagReranker.__init__, kwargs) + model_params.pop("model_name_or_path", None) + self.model = FlagReranker(model_name_or_path=model_name, **model_params) + + def __del__(self): + del self.model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using BAAI normal-Reranker model. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + Default is 64. 
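The rerankers in this diff all follow the same shape: build (query, passage) pairs, score every pair with a cross-encoder, then sort each query's passages by score and keep top_k. A dependency-free sketch of that pattern, with a toy scoring function standing in for FlagReranker.compute_score:

def rerank(query, passages, ids, top_k, score_fn):
    # score_fn stands in for a cross-encoder such as FlagReranker.compute_score
    scored = sorted(
        zip(passages, ids, (score_fn([query, p]) for p in passages)),
        key=lambda t: t[2],
        reverse=True,
    )[:top_k]
    contents, kept_ids, scores = zip(*scored)
    return list(contents), list(kept_ids), list(scores)

# toy scorer: word overlap with the query (illustration only)
toy_score = lambda pair: len(set(pair[0].split()) & set(pair[1].split()))
print(rerank("passage filter node", ["a filter node", "unrelated text"], ["d1", "d2"], 1, toy_score))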
+ :return: Tuple of lists containing the reranked contents, ids, and scores + """ + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + rerank_scores = flatten_apply( + flag_embedding_run_model, nested_list, model=self.model, batch_size=batch + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def flag_embedding_run_model(input_texts, model, batch_size: int): + try: + import torch + except ImportError: + raise ImportError("FlagEmbeddingReranker requires PyTorch to be installed.") + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + for batch_texts in batch_input_texts: + with torch.no_grad(): + pred_scores = model.compute_score(sentence_pairs=batch_texts) + if batch_size == 1 or not isinstance(pred_scores, Iterable): + results.append(pred_scores) + else: + results.extend(pred_scores) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/flag_embedding_llm.py b/autorag-workspace/autorag/nodes/passagereranker/flag_embedding_llm.py new file mode 100644 index 0000000..5e5d2f1 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/flag_embedding_llm.py @@ -0,0 +1,101 @@ +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.nodes.passagereranker.flag_embedding import flag_embedding_run_model +from autorag.utils.util import ( + flatten_apply, + sort_by_scores, + select_top_k, + pop_params, + result_to_dataframe, + empty_cuda_cache, +) + + +class FlagEmbeddingLLMReranker(BasePassageReranker): + def __init__( + self, + project_dir, + model_name: str = "BAAI/bge-reranker-v2-gemma", + *args, + **kwargs, + ): + """ + Initialize the FlagEmbeddingReranker module. + + :param project_dir: The project directory. + :param model_name: The name of the BAAI Reranker LLM-based-model name. + Default is "BAAI/bge-reranker-v2-gemma" + :param kwargs: Extra parameter for FlagEmbedding.FlagReranker + """ + super().__init__(project_dir) + try: + from FlagEmbedding import FlagLLMReranker + except ImportError: + raise ImportError( + "FlagEmbeddingLLMReranker requires the 'FlagEmbedding' package to be installed." + ) + model_params = pop_params(FlagLLMReranker.__init__, kwargs) + model_params.pop("model_name_or_path", None) + self.model = FlagLLMReranker(model_name_or_path=model_name, **model_params) + + def __del__(self): + del self.model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using BAAI LLM-based-Reranker model. 
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + Default is 64. + + :return: tuple of lists containing the reranked contents, ids, and scores + """ + + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + rerank_scores = flatten_apply( + flag_embedding_run_model, nested_list, model=self.model, batch_size=batch + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) diff --git a/autorag-workspace/autorag/nodes/passagereranker/flashrank.py b/autorag-workspace/autorag/nodes/passagereranker/flashrank.py new file mode 100644 index 0000000..e6f7391 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/flashrank.py @@ -0,0 +1,245 @@ +import json +from pathlib import Path + +import pandas as pd +import numpy as np +import os +import zipfile +import requests +from tqdm import tqdm +import collections +from typing import List, Dict, Tuple + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils import result_to_dataframe +from autorag.utils.util import ( + flatten_apply, + sort_by_scores, + select_top_k, + make_batch, + empty_cuda_cache, +) + +model_url = "https://huggingface.co/prithivida/flashrank/resolve/main/{}.zip" + +model_file_map = { + "ms-marco-TinyBERT-L-2-v2": "flashrank-TinyBERT-L-2-v2.onnx", + "ms-marco-MiniLM-L-12-v2": "flashrank-MiniLM-L-12-v2_Q.onnx", + "ms-marco-MultiBERT-L-12": "flashrank-MultiBERT-L12_Q.onnx", + "rank-T5-flan": "flashrank-rankt5_Q.onnx", + "ce-esci-MiniLM-L12-v2": "flashrank-ce-esci-MiniLM-L12-v2_Q.onnx", + "miniReranker_arabic_v1": "miniReranker_arabic_v1.onnx", +} + + +class FlashRankReranker(BasePassageReranker): + def __init__( + self, project_dir: str, model: str = "ms-marco-TinyBERT-L-2-v2", *args, **kwargs + ): + """ + Initialize FlashRank rerank node. + + :param project_dir: The project directory path. + :param model: The model name for FlashRank rerank. + You can get the list of available models from https://github.com/PrithivirajDamodaran/FlashRank. + Default is "ms-marco-TinyBERT-L-2-v2". + Not support “rank_zephyr_7b_v1_full” due to parallel inference issue. + :param kwargs: Extra arguments that are not affected + """ + super().__init__(project_dir) + try: + from tokenizers import Tokenizer + except ImportError: + raise ImportError( + "Tokenizer is not installed. Please install tokenizers to use FlashRank reranker." + ) + + cache_dir = kwargs.pop("cache_dir", "/tmp") + max_length = kwargs.pop("max_length", 512) + + self.cache_dir: Path = Path(cache_dir) + self.model_dir: Path = self.cache_dir / model + self._prepare_model_dir(model) + model_file = model_file_map[model] + + try: + import onnxruntime as ort + except ImportError: + raise ImportError( + "onnxruntime is not installed. Please install onnxruntime to use FlashRank reranker." 
+ ) + + self.session = ort.InferenceSession(str(self.model_dir / model_file)) + self.tokenizer: Tokenizer = self._get_tokenizer(max_length) + + def __del__(self): + del self.session + del self.tokenizer + empty_cuda_cache() + super().__del__() + + def _prepare_model_dir(self, model_name: str): + if not self.cache_dir.exists(): + self.cache_dir.mkdir(parents=True, exist_ok=True) + + if not self.model_dir.exists(): + self._download_model_files(model_name) + + def _download_model_files(self, model_name: str): + local_zip_file = self.cache_dir / f"{model_name}.zip" + formatted_model_url = model_url.format(model_name) + + with requests.get(formatted_model_url, stream=True) as r: + r.raise_for_status() + total_size = int(r.headers.get("content-length", 0)) + with ( + open(local_zip_file, "wb") as f, + tqdm( + desc=local_zip_file.name, + total=total_size, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar, + ): + for chunk in r.iter_content(chunk_size=8192): + size = f.write(chunk) + bar.update(size) + + with zipfile.ZipFile(local_zip_file, "r") as zip_ref: + zip_ref.extractall(self.cache_dir) + os.remove(local_zip_file) + + def _get_tokenizer(self, max_length: int = 512): + try: + from tokenizers import AddedToken, Tokenizer + except ImportError: + raise ImportError( + "Pytorch is not installed. Please install pytorch to use FlashRank reranker." + ) + config = json.load(open(str(self.model_dir / "config.json"))) + tokenizer_config = json.load( + open(str(self.model_dir / "tokenizer_config.json")) + ) + tokens_map = json.load(open(str(self.model_dir / "special_tokens_map.json"))) + tokenizer = Tokenizer.from_file(str(self.model_dir / "tokenizer.json")) + + tokenizer.enable_truncation( + max_length=min(tokenizer_config["model_max_length"], max_length) + ) + tokenizer.enable_padding( + pad_id=config["pad_token_id"], pad_token=tokenizer_config["pad_token"] + ) + + for token in tokens_map.values(): + if isinstance(token, str): + tokenizer.add_special_tokens([token]) + elif isinstance(token, dict): + tokenizer.add_special_tokens([AddedToken(**token)]) + + vocab_file = self.model_dir / "vocab.txt" + if vocab_file.exists(): + tokenizer.vocab = self._load_vocab(vocab_file) + tokenizer.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in tokenizer.vocab.items()] + ) + return tokenizer + + def _load_vocab(self, vocab_file: Path) -> Dict[str, int]: + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with FlashRank rerank models. 
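flashrank_run_model below converts raw ONNX logits into relevance scores: a sigmoid when the model emits a single logit per pair, otherwise the softmax probability of the positive class. The same arithmetic in isolation:

import numpy as np

def logits_to_scores(logits: np.ndarray) -> np.ndarray:
    if logits.shape[1] == 1:  # single-logit cross-encoder head
        return 1.0 / (1.0 + np.exp(-logits.flatten()))
    exp_logits = np.exp(logits)  # two-class head: probability of the positive class
    return exp_logits[:, 1] / np.sum(exp_logits, axis=1)

print(logits_to_scores(np.array([[2.0], [-1.0]])))  # sigmoid path
print(logits_to_scores(np.array([[0.0, 2.0], [1.0, 0.0]])))  # softmax path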
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + + rerank_scores = flatten_apply( + flashrank_run_model, + nested_list, + session=self.session, + batch_size=batch, + tokenizer=self.tokenizer, + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def flashrank_run_model(input_texts, tokenizer, session, batch_size: int): + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + + for batch_texts in tqdm(batch_input_texts): + input_text = tokenizer.encode_batch(batch_texts) + input_ids = np.array([e.ids for e in input_text]) + token_type_ids = np.array([e.type_ids for e in input_text]) + attention_mask = np.array([e.attention_mask for e in input_text]) + + use_token_type_ids = token_type_ids is not None and not np.all( + token_type_ids == 0 + ) + + onnx_input = { + "input_ids": input_ids.astype(np.int64), + "attention_mask": attention_mask.astype(np.int64), + } + if use_token_type_ids: + onnx_input["token_type_ids"] = token_type_ids.astype(np.int64) + + outputs = session.run(None, onnx_input) + + logits = outputs[0] + + if logits.shape[1] == 1: + scores = 1 / (1 + np.exp(-logits.flatten())) + else: + exp_logits = np.exp(logits) + scores = exp_logits[:, 1] / np.sum(exp_logits, axis=1) + results.extend(scores) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/jina.py b/autorag-workspace/autorag/nodes/passagereranker/jina.py new file mode 100644 index 0000000..629ce99 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/jina.py @@ -0,0 +1,115 @@ +import os +from typing import List, Tuple + +import aiohttp +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import get_event_loop, process_batch, result_to_dataframe + +JINA_API_URL = "https://api.jina.ai/v1/rerank" + + +class JinaReranker(BasePassageReranker): + def __init__(self, project_dir: str, api_key: str = None, *args, **kwargs): + """ + Initialize Jina rerank node. + + :param project_dir: The project directory path. + :param api_key: The API key for Jina rerank. + You can set it in the environment variable JINAAI_API_KEY. + Or, you can directly set it on the config YAML file using this parameter. + Default is env variable "JINAAI_API_KEY". + :param kwargs: Extra arguments that are not affected + """ + super().__init__(project_dir) + if api_key is None: + api_key = os.getenv("JINAAI_API_KEY", None) + if api_key is None: + raise ValueError( + "API key is not provided." 
+ "You can set it as an argument or as an environment variable 'JINAAI_API_KEY'" + ) + self.session = aiohttp.ClientSession(loop=get_event_loop()) + self.session.headers.update( + {"Authorization": f"Bearer {api_key}", "Accept-Encoding": "identity"} + ) + + def __del__(self): + self.session.close() + del self.session + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 8) + model = kwargs.pop("model", "jina-reranker-v1-base-en") + return self._pure(queries, contents, ids, top_k, model, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + model: str = "jina-reranker-v1-base-en", + batch: int = 8, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with Jina rerank models. + You can get the API key from https://jina.ai/reranker and set it in the environment variable JINAAI_API_KEY. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param model: The model name for Cohere rerank. + You can choose between "jina-reranker-v1-base-en" and "jina-colbert-v1-en". + Default is "jina-reranker-v1-base-en". + :param batch: The number of queries to be processed in a batch + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + tasks = [ + jina_reranker_pure( + self.session, query, contents, ids, top_k=top_k, model=model + ) + for query, contents, ids in zip(queries, contents_list, ids_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + content_result, id_result, score_result = zip(*results) + + return list(content_result), list(id_result), list(score_result) + + +async def jina_reranker_pure( + session, + query: str, + contents: List[str], + ids: List[str], + top_k: int, + model: str = "jina-reranker-v1-base-en", +) -> Tuple[List[str], List[str], List[float]]: + async with session.post( + JINA_API_URL, + json={ + "query": query, + "documents": contents, + "model": model, + "top_n": top_k, + }, + ) as resp: + resp_json = await resp.json() + if "results" not in resp_json: + raise RuntimeError(f"Invalid response from Jina API: {resp_json['detail']}") + + results = resp_json["results"] + indices = list(map(lambda x: x["index"], results)) + score_result = list(map(lambda x: x["relevance_score"], results)) + id_result = list(map(lambda x: ids[x], indices)) + content_result = list(map(lambda x: contents[x], indices)) + + return content_result, id_result, score_result diff --git a/autorag-workspace/autorag/nodes/passagereranker/koreranker.py b/autorag-workspace/autorag/nodes/passagereranker/koreranker.py new file mode 100644 index 0000000..b7b97ca --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/koreranker.py @@ -0,0 +1,136 @@ +from typing import List, Tuple + +import numpy as np +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + result_to_dataframe, + empty_cuda_cache, +) + + +class 
KoReranker(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir) + try: + import torch + from transformers import AutoModelForSequenceClassification, AutoTokenizer + except ImportError: + raise ImportError("For using KoReranker, please install torch first.") + + model_path = "Dongjin-kr/ko-reranker" + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.model = AutoModelForSequenceClassification.from_pretrained(model_path) + self.model.eval() + # Determine the device to run the model on (GPU if available, otherwise CPU) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.model.to(self.device) + + def __del__(self): + del self.model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using ko-reranker. + ko-reranker is a reranker based on korean (https://huggingface.co/Dongjin-kr/ko-reranker). + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + Default is 64. 
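The exp_normalize helper defined at the end of this file is a numerically stable softmax over one query's raw reranker logits; KoReranker normalizes its scores with it before sorting. In isolation:

import numpy as np

def exp_normalize(x: np.ndarray) -> np.ndarray:
    # subtract the max before exponentiating to avoid overflow
    y = np.exp(x - x.max())
    return y / y.sum()

print(exp_normalize(np.array([2.0, 0.0, -1.0])))  # components sum to 1.0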
+ :return: Tuple of lists containing the reranked contents, ids, and scores + """ + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + scores_nps = flatten_apply( + koreranker_run_model, + nested_list, + model=self.model, + batch_size=batch, + tokenizer=self.tokenizer, + device=self.device, + ) + + rerank_scores = list( + map( + lambda scores: exp_normalize(np.array(scores)).astype(float), scores_nps + ) + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def koreranker_run_model(input_texts, model, tokenizer, device, batch_size: int): + try: + import torch + except ImportError: + raise ImportError("For using KoReranker, please install torch first.") + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + for batch_texts in batch_input_texts: + inputs = tokenizer( + batch_texts, + padding=True, + truncation=True, + return_tensors="pt", + max_length=512, + ) + inputs = inputs.to(device) + with torch.no_grad(): + scores = ( + model(**inputs, return_dict=True) + .logits.view( + -1, + ) + .float() + ) + scores_np = scores.cpu().numpy() + results.extend(scores_np) + return results + + +def exp_normalize(x): + b = x.max() + y = np.exp(x - b) + return y / y.sum() diff --git a/autorag-workspace/autorag/nodes/passagereranker/mixedbreadai.py b/autorag-workspace/autorag/nodes/passagereranker/mixedbreadai.py new file mode 100644 index 0000000..04b6382 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/mixedbreadai.py @@ -0,0 +1,126 @@ +import os +from typing import List, Tuple + +import pandas as pd +from mixedbread_ai.client import AsyncMixedbreadAI + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + result_to_dataframe, + get_event_loop, + process_batch, + pop_params, +) + + +class MixedbreadAIReranker(BasePassageReranker): + def __init__( + self, + project_dir: str, + *args, + **kwargs, + ): + """ + Initialize mixedbread-ai rerank node. + + :param project_dir: The project directory path. + :param api_key: The API key for MixedbreadAI rerank. + You can set it in the environment variable MXBAI_API_KEY. + Or, you can directly set it on the config YAML file using this parameter. + Default is env variable "MXBAI_API_KEY". + :param kwargs: Extra arguments that are not affected + """ + super().__init__(project_dir) + api_key = kwargs.pop("api_key", None) + api_key = os.getenv("MXBAI_API_KEY", None) if api_key is None else api_key + if api_key is None: + raise KeyError( + "Please set the API key for Mixedbread AI rerank in the environment variable MXBAI_API_KEY " + "or directly set it on the config YAML file." 
+ ) + self.client = AsyncMixedbreadAI(api_key=api_key) + + def __del__(self): + del self.client + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 8) + model = kwargs.pop("model", "mixedbread-ai/mxbai-rerank-large-v1") + rerank_params = pop_params(self.client.reranking, kwargs) + return self._pure(queries, contents, ids, top_k, model, batch, **rerank_params) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + model: str = "mixedbread-ai/mxbai-rerank-large-v1", + batch: int = 8, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with mixedbread-ai rerank models. + You can get the API key from https://www.mixedbread.ai/api-reference#quick-start-guide and set it in the environment variable MXBAI_API_KEY. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param model: The model name for mixedbread-ai rerank. + You can choose between "mixedbread-ai/mxbai-rerank-large-v1", "mixedbread-ai/mxbai-rerank-base-v1" and "mixedbread-ai/mxbai-rerank-xsmall-v1". + Default is "mixedbread-ai/mxbai-rerank-large-v1". + :param batch: The number of queries to be processed in a batch + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + tasks = [ + mixedbreadai_rerank_pure( + self.client, query, contents, ids, top_k=top_k, model=model + ) + for query, contents, ids in zip(queries, contents_list, ids_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + content_result, id_result, score_result = zip(*results) + + return list(content_result), list(id_result), list(score_result) + + +async def mixedbreadai_rerank_pure( + client: AsyncMixedbreadAI, + query: str, + documents: List[str], + ids: List[str], + top_k: int, + model: str = "mixedbread-ai/mxbai-rerank-large-v1", +) -> Tuple[List[str], List[str], List[float]]: + """ + Rerank a list of contents with mixedbread-ai rerank models. + + :param client: The mixedbread-ai client to use for reranking + :param query: The query to use for reranking + :param documents: The list of contents to rerank + :param ids: The list of ids corresponding to the documents + :param top_k: The number of passages to be retrieved + :param model: The model name for mixedbread-ai rerank. + You can choose between "mixedbread-ai/mxbai-rerank-large-v1" and "mixedbread-ai/mxbai-rerank-base-v1". + Default is "mixedbread-ai/mxbai-rerank-large-v1". 
+ :return: Tuple of lists containing the reranked contents, ids, and scores + """ + + results = await client.reranking( + query=query, + input=documents, + top_k=top_k, + model=model, + ) + reranked_scores: List[float] = list(map(lambda x: x.score, results.data)) + reranked_scores_float = list(map(float, reranked_scores)) + indices = list(map(lambda x: x.index, results.data)) + reranked_contents = list(map(lambda x: documents[x], indices)) + reranked_ids: List[str] = list(map(lambda i: ids[i], indices)) + return reranked_contents, reranked_ids, reranked_scores_float diff --git a/autorag-workspace/autorag/nodes/passagereranker/monot5.py b/autorag-workspace/autorag/nodes/passagereranker/monot5.py new file mode 100644 index 0000000..0d563ee --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/monot5.py @@ -0,0 +1,190 @@ +from itertools import chain +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + result_to_dataframe, + pop_params, + empty_cuda_cache, +) + +prediction_tokens = { + "castorini/monot5-base-msmarco": ["▁false", "▁true"], + "castorini/monot5-base-msmarco-10k": ["▁false", "▁true"], + "castorini/monot5-large-msmarco": ["▁false", "▁true"], + "castorini/monot5-large-msmarco-10k": ["▁false", "▁true"], + "castorini/monot5-base-med-msmarco": ["▁false", "▁true"], + "castorini/monot5-3b-med-msmarco": ["▁false", "▁true"], + "castorini/monot5-3b-msmarco-10k": ["▁false", "▁true"], + "unicamp-dl/mt5-base-en-msmarco": ["▁no", "▁yes"], + "unicamp-dl/ptt5-base-pt-msmarco-10k-v2": ["▁não", "▁sim"], + "unicamp-dl/ptt5-base-pt-msmarco-100k-v2": ["▁não", "▁sim"], + "unicamp-dl/ptt5-base-en-pt-msmarco-100k-v2": ["▁não", "▁sim"], + "unicamp-dl/mt5-base-en-pt-msmarco-v2": ["▁no", "▁yes"], + "unicamp-dl/mt5-base-mmarco-v2": ["▁no", "▁yes"], + "unicamp-dl/mt5-base-en-pt-msmarco-v1": ["▁no", "▁yes"], + "unicamp-dl/mt5-base-mmarco-v1": ["▁no", "▁yes"], + "unicamp-dl/ptt5-base-pt-msmarco-10k-v1": ["▁não", "▁sim"], + "unicamp-dl/ptt5-base-pt-msmarco-100k-v1": ["▁não", "▁sim"], + "unicamp-dl/ptt5-base-en-pt-msmarco-10k-v1": ["▁não", "▁sim"], + "unicamp-dl/mt5-3B-mmarco-en-pt": ["▁", "▁true"], + "unicamp-dl/mt5-13b-mmarco-100k": ["▁", "▁true"], +} + + +class MonoT5(BasePassageReranker): + def __init__( + self, + project_dir: str, + model_name: str = "castorini/monot5-3b-msmarco-10k", + *args, + **kwargs, + ): + """ + Initialize the MonoT5 reranker. + + :param project_dir: The project directory + :param model_name: The name of the MonoT5 model to use for reranking + Note: default model name is 'castorini/monot5-3b-msmarco-10k' + If there is a '/' in the model name parameter, + when we create the file to store the results, the path will be twisted because of the '/'. + Therefore, it will be received as '_' instead of '/'. 
+ :param kwargs: The extra arguments for the MonoT5 reranker + """ + super().__init__(project_dir) + try: + import torch + from transformers import T5Tokenizer, T5ForConditionalGeneration + except ImportError: + raise ImportError("For using MonoT5 Reranker, please install torch first.") + # replace '_' to '/' + if "_" in model_name: + model_name = model_name.replace("_", "/") + # Load the tokenizer and model from the pre-trained MonoT5 model + self.tokenizer = T5Tokenizer.from_pretrained(model_name) + model_params = pop_params(T5ForConditionalGeneration.from_pretrained, kwargs) + self.model = T5ForConditionalGeneration.from_pretrained( + model_name, **model_params + ).eval() + + # Determine the device to run the model on (GPU if available, otherwise CPU) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model.to(self.device) + + token_false, token_true = prediction_tokens[model_name] + self.token_false_id = self.tokenizer.convert_tokens_to_ids(token_false) + self.token_true_id = self.tokenizer.convert_tokens_to_ids(token_true) + + def __del__(self): + del self.model + del self.tokenizer + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.get("top_k", 3) + batch = kwargs.get("batch", 64) + return self._pure(queries, contents, ids, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using MonoT5. 
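MonoT5 frames reranking as generation: for each "Query: ... Document: ..." prompt, monot5_run_model below reads the logits of the model's false/true tokens and uses softmax P(true) as the relevance score. The scoring step alone:

import numpy as np

def true_token_probability(false_logit: float, true_logit: float) -> float:
    logits = np.array([false_logit, true_logit])
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    return float(probs[1])  # probability mass on the 'true' token

print(true_token_probability(-1.2, 2.3))  # ~0.97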
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + + :param batch: The number of queries to be processed in a batch + :return: tuple of lists containing the reranked contents, ids, and scores + """ + # Retrieve the tokens used by the model to represent false and true predictions + + nested_list = [ + list(map(lambda x: [f"Query: {query} Document: {x}"], content_list)) + for query, content_list in zip(queries, contents_list) + ] + + rerank_scores = flatten_apply( + monot5_run_model, + nested_list, + model=self.model, + batch_size=batch, + tokenizer=self.tokenizer, + device=self.device, + token_false_id=self.token_false_id, + token_true_id=self.token_true_id, + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def monot5_run_model( + input_texts, + model, + batch_size: int, + tokenizer, + device, + token_false_id, + token_true_id, +): + try: + import torch + except ImportError: + raise ImportError("For using MonoT5 Reranker, please install torch first.") + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + for batch_texts in batch_input_texts: + flattened_batch_texts = list(chain.from_iterable(batch_texts)) + input_encodings = tokenizer( + flattened_batch_texts, + padding=True, + truncation=True, + max_length=512, + return_tensors="pt", + ).to(device) + with torch.no_grad(): + outputs = model.generate( + input_ids=input_encodings["input_ids"], + attention_mask=input_encodings["attention_mask"], + output_scores=True, + return_dict_in_generate=True, + ) + + # Extract logits for the 'false' and 'true' tokens from the model's output + logits = outputs.scores[-1][:, [token_false_id, token_true_id]] + # Calculate the softmax probability of the 'true' token + probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1] + results.extend(probs.tolist()) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/openvino.py b/autorag-workspace/autorag/nodes/passagereranker/openvino.py new file mode 100644 index 0000000..67c8b2c --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/openvino.py @@ -0,0 +1,191 @@ +from pathlib import Path +from typing import Any, List, Tuple + +import numpy as np +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker + + +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + result_to_dataframe, + pop_params, + empty_cuda_cache, +) + + +class OpenVINOReranker(BasePassageReranker): + def __init__( + self, + project_dir: str, + model: str = "BAAI/bge-reranker-large", + *args, + **kwargs, + ): + super().__init__(project_dir) + + try: + from huggingface_hub import HfApi + from transformers import AutoTokenizer + except ImportError as e: + raise ValueError( + "Could not import huggingface_hub python package. " + "Please install it with: " + "`pip install -U huggingface_hub`." 
+ ) from e
+
+ def require_model_export(
+ model_id: str, revision: Any = None, subfolder: Any = None
+ ) -> bool:
+ model_dir = Path(model_id)
+ if subfolder is not None:
+ model_dir = model_dir / subfolder
+ if model_dir.is_dir():
+ return (
+ not (model_dir / "openvino_model.xml").exists()
+ or not (model_dir / "openvino_model.bin").exists()
+ )
+ hf_api = HfApi()
+ try:
+ model_info = hf_api.model_info(model_id, revision=revision or "main")
+ normalized_subfolder = (
+ None if subfolder is None else Path(subfolder).as_posix()
+ )
+ model_files = [
+ file.rfilename
+ for file in model_info.siblings
+ if normalized_subfolder is None
+ or file.rfilename.startswith(normalized_subfolder)
+ ]
+ ov_model_path = (
+ "openvino_model.xml"
+ if subfolder is None
+ else f"{normalized_subfolder}/openvino_model.xml"
+ )
+ return (
+ ov_model_path not in model_files
+ or ov_model_path.replace(".xml", ".bin") not in model_files
+ )
+ except Exception:
+ return True
+
+ try:
+ from optimum.intel.openvino import OVModelForSequenceClassification
+ except ImportError:
+ raise ImportError(
+ "Please install the optimum package to use OpenVINOReranker: "
+ "pip install 'optimum[openvino,nncf]'"
+ )
+
+ model_kwargs = pop_params(
+ OVModelForSequenceClassification.from_pretrained, kwargs
+ )
+
+ if require_model_export(model):
+ # no pre-converted OpenVINO model is available; export it on the fly
+ self.model = OVModelForSequenceClassification.from_pretrained(
+ model, export=True, **model_kwargs
+ )
+ else:
+ # load the already converted OpenVINO model
+ self.model = OVModelForSequenceClassification.from_pretrained(
+ model, **model_kwargs
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(model)
+
+ def __del__(self):
+ del self.model
+ del self.tokenizer
+ empty_cuda_cache()
+ super().__del__()
+
+ @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"])
+ def pure(self, previous_result: pd.DataFrame, *args, **kwargs):
+ queries, contents, _, ids = self.cast_to_run(previous_result)
+ top_k = kwargs.get("top_k", 3)
+ batch = kwargs.get("batch", 64)
+ return self._pure(queries, contents, ids, top_k, batch)
+
+ def _pure(
+ self,
+ queries: List[str],
+ contents_list: List[List[str]],
+ ids_list: List[List[str]],
+ top_k: int,
+ batch: int = 64,
+ ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
+ """
+ Rerank a list of contents based on their relevance to a query using the OpenVINO reranker.
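+ Each query-passage pair is scored with the OpenVINO-optimized cross-encoder, and the raw logits
+ are mapped to (0, 1) with a sigmoid before sorting (see ``openvino_run_model`` below).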
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + + :param batch: The number of queries to be processed in a batch + :return: tuple of lists containing the reranked contents, ids, and scores + """ + # Retrieve the tokens used by the model to represent false and true predictions + + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + + rerank_scores = flatten_apply( + openvino_run_model, + nested_list, + model=self.model, + batch_size=batch, + tokenizer=self.tokenizer, + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def openvino_run_model( + input_texts, + model, + batch_size: int, + tokenizer, +): + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + for batch_texts in batch_input_texts: + input_tensors = tokenizer( + batch_texts, + padding=True, + truncation=True, + return_tensors="pt", + ) + + outputs = model(**input_tensors, return_dict=True) + if outputs[0].shape[1] > 1: + scores = outputs[0][:, 1] + else: + scores = outputs[0].flatten() + + scores = list(map(float, (1 / (1 + np.exp(-np.array(scores)))))) + results.extend(scores) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/pass_reranker.py b/autorag-workspace/autorag/nodes/passagereranker/pass_reranker.py new file mode 100644 index 0000000..724e3d5 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/pass_reranker.py @@ -0,0 +1,31 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils import result_to_dataframe + + +class PassReranker(BasePassageReranker): + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + top_k = kwargs.pop("top_k") + + _, contents_list, scores_list, ids_list = self.cast_to_run(previous_result) + return self._pure(contents_list, scores_list, ids_list, top_k) + + def _pure( + self, + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + top_k: int, + ): + """ + Do not perform reranking. + Return the given top-k passages as is. 
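+ Only the first top_k passages, scores, and ids from the previous ranking are kept.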
+ """ + contents_list = list(map(lambda x: x[:top_k], contents_list)) + scores_list = list(map(lambda x: x[:top_k], scores_list)) + ids_list = list(map(lambda x: x[:top_k], ids_list)) + return contents_list, ids_list, scores_list diff --git a/autorag-workspace/autorag/nodes/passagereranker/rankgpt.py b/autorag-workspace/autorag/nodes/passagereranker/rankgpt.py new file mode 100644 index 0000000..83d87b1 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/rankgpt.py @@ -0,0 +1,170 @@ +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import pandas as pd +from llama_index.core.base.llms.types import ChatMessage, ChatResponse +from llama_index.core.llms import LLM +from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank +from llama_index.core.schema import NodeWithScore, QueryBundle, TextNode +from llama_index.core.utils import print_text +from llama_index.llms.openai import OpenAI + +from autorag import generator_models +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + get_event_loop, + process_batch, + pop_params, + result_to_dataframe, + empty_cuda_cache, +) + + +class RankGPT(BasePassageReranker): + def __init__( + self, project_dir: str, llm: Optional[Union[str, LLM]] = None, **kwargs + ): + """ + Initialize the RankGPT reranker. + + :param project_dir: The project directory + :param llm: The LLM model to use for RankGPT rerank. + It is a llama index model. + Default is the OpenAI model with gpt-4o-mini. + :param kwargs: The keyword arguments for the LLM model. + """ + super().__init__(project_dir) + if llm is None: + self.llm = OpenAI(model="gpt-4o-mini") + else: + if not isinstance(llm, LLM): + llm_class = generator_models[llm] + llm_param = pop_params(llm_class.__init__, kwargs) + self.llm = llm_class(**llm_param) + else: + self.llm = llm + + def __del__(self): + del self.llm + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + top_k = kwargs.get("top_k", 1) + verbose = kwargs.get("verbose", False) + rankgpt_rerank_prompt = kwargs.get("rankgpt_rerank_prompt", None) + batch = kwargs.get("batch", 16) + return self._pure( + queries=queries, + contents_list=contents, + scores_list=scores, + ids_list=ids, + top_k=top_k, + verbose=verbose, + rankgpt_rerank_prompt=rankgpt_rerank_prompt, + batch=batch, + ) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + top_k: int, + verbose: bool = False, + rankgpt_rerank_prompt: Optional[str] = None, + batch: int = 16, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank given context paragraphs using RankGPT. + Return pseudo scores, since the actual scores are not available on RankGPT. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param scores_list: The list of lists of scores retrieved from the initial ranking + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param verbose: Whether to print intermediate steps. + :param rankgpt_rerank_prompt: The prompt template for RankGPT rerank. + Default is RankGPT's default prompt. 
+ :param batch: The number of queries to be processed in a batch. + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + query_bundles = list(map(lambda query: QueryBundle(query_str=query), queries)) + nodes_list = [ + list( + map( + lambda x: NodeWithScore(node=TextNode(text=x[0]), score=x[1]), + zip(content_list, score_list), + ) + ) + for content_list, score_list in zip(contents_list, scores_list) + ] + + reranker = AsyncRankGPTRerank( + top_n=top_k, + llm=self.llm, + verbose=verbose, + rankgpt_rerank_prompt=rankgpt_rerank_prompt, + ) + + tasks = [ + reranker.async_postprocess_nodes(nodes, query, ids) + for nodes, query, ids in zip(nodes_list, query_bundles, ids_list) + ] + loop = get_event_loop() + rerank_result = loop.run_until_complete(process_batch(tasks, batch_size=batch)) + content_result = [ + list(map(lambda x: x.node.text, res[0])) for res in rerank_result + ] + score_result = [ + np.linspace(1.0, 0.0, len(res[0])).tolist() for res in rerank_result + ] + id_result = [res[1] for res in rerank_result] + + del reranker + + return content_result, id_result, score_result + + +class AsyncRankGPTRerank(RankGPTRerank): + async def async_run_llm(self, messages: Sequence[ChatMessage]) -> ChatResponse: + return await self.llm.achat(messages) + + async def async_postprocess_nodes( + self, + nodes: List[NodeWithScore], + query_bundle: QueryBundle, + ids: Optional[List[str]] = None, + ) -> Tuple[List[NodeWithScore], List[str]]: + if ids is None: + ids = [str(i) for i in range(len(nodes))] + + items = { + "query": query_bundle.query_str, + "hits": [{"content": node.get_content()} for node in nodes], + } + + messages = self.create_permutation_instruction(item=items) + permutation = await self.async_run_llm(messages=messages) + if permutation.message is not None and permutation.message.content is not None: + rerank_ranks = self._receive_permutation( + items, str(permutation.message.content) + ) + if self.verbose: + print_text(f"After Reranking, new rank list for nodes: {rerank_ranks}") + + initial_results: List[NodeWithScore] = [] + id_results = [] + + for idx in rerank_ranks: + initial_results.append( + NodeWithScore(node=nodes[idx].node, score=nodes[idx].score) + ) + id_results.append(ids[idx]) + return initial_results[: self.top_n], id_results[: self.top_n] + else: + return nodes[: self.top_n], ids[: self.top_n] diff --git a/autorag-workspace/autorag/nodes/passagereranker/run.py b/autorag-workspace/autorag/nodes/passagereranker/run.py new file mode 100644 index 0000000..385776a --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/run.py @@ -0,0 +1,145 @@ +import logging +import os +import pathlib +from typing import List, Dict + +import pandas as pd + +from autorag.nodes.retrieval.run import evaluate_retrieval_node +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.utils.util import apply_recursive, to_list + +logger = logging.getLogger("AutoRAG") + + +def run_passage_reranker_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among passage reranker node results. + + :param modules: Passage reranker modules to run. + :param module_params: Passage reranker module parameters. + :param previous_result: Previous result dataframe. + Could be retrieval, reranker modules result. 
+ It means it must contain 'query', 'retrieved_contents', 'retrieved_ids', 'retrieve_scores' columns. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for passage reranker node. + In this node, we use 'retrieval_f1', 'retrieval_recall' and 'retrieval_precision'. + You can skip evaluation when you use only one module and a module parameter. + :return: The best result dataframe with previous result columns. + """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + qa_df = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + retrieval_gt = qa_df["retrieval_gt"].tolist() + retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt)) + + # make rows to metric_inputs + metric_inputs = [ + MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt) + for ret_gt, query, gen_gt in zip( + retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist() + ) + ] + + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # run metrics before filtering + if strategies.get("metrics") is None: + raise ValueError( + "You must at least one metrics for passage_reranker evaluation." + ) + results = list( + map( + lambda x: evaluate_retrieval_node( + x, + metric_inputs, + strategies.get("metrics"), + ), + results, + ) + ) + + # save results to folder + save_dir = os.path.join(node_line_dir, "passage_reranker") # node name + if not os.path.exists(save_dir): + os.makedirs(save_dir) + filepaths = list( + map(lambda x: os.path.join(save_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + **{ + f"passage_reranker_{metric}": list( + map(lambda result: result[metric].mean(), results) + ) + for metric in strategies.get("metrics") + }, + } + ) + + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, + strategies.get("metrics"), + filenames, + strategies.get("strategy", "mean"), + ) + # change metric name columns to passage_reranker_metric_name + selected_result = selected_result.rename( + columns={ + metric_name: f"passage_reranker_{metric_name}" + for metric_name in strategies["metrics"] + } + ) + # drop retrieval result columns in previous_result + previous_result = previous_result.drop( + columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add 'is_best' column to summary file + summary_df["is_best"] = summary_df["filename"] == selected_filename + + # save files + summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join( + save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + 
index=False, + ) + return best_result diff --git a/autorag-workspace/autorag/nodes/passagereranker/sentence_transformer.py b/autorag-workspace/autorag/nodes/passagereranker/sentence_transformer.py new file mode 100644 index 0000000..dd67d5d --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/sentence_transformer.py @@ -0,0 +1,129 @@ +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import ( + flatten_apply, + make_batch, + select_top_k, + sort_by_scores, + pop_params, + result_to_dataframe, + empty_cuda_cache, +) + + +class SentenceTransformerReranker(BasePassageReranker): + def __init__( + self, + project_dir: str, + model_name: str = "cross-encoder/ms-marco-MiniLM-L-2-v2", + *args, + **kwargs, + ): + """ + Initialize the Sentence Transformer reranker node. + + :param project_dir: The project directory + :param model_name: The name of the Sentence Transformer model to use for reranking + Default is "cross-encoder/ms-marco-MiniLM-L-2-v2" + :param kwargs: The CrossEncoder parameters + """ + super().__init__(project_dir, *args, **kwargs) + try: + import torch + from sentence_transformers import CrossEncoder + except ImportError: + raise ImportError( + "You have to install AutoRAG[gpu] to use SentenceTransformerReranker" + ) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + model_params = pop_params(CrossEncoder.__init__, kwargs) + self.model = CrossEncoder(model_name, device=self.device, **model_params) + + def __del__(self): + del self.model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + """ + Rerank a list of contents based on their relevance to a query using a Sentence Transformer model. + + :param previous_result: The previous result + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + :return: pd DataFrame containing the reranked contents, ids, and scores + """ + queries, contents_list, scores_list, ids_list = self.cast_to_run( + previous_result + ) + top_k = kwargs.get("top_k", 1) + batch = kwargs.get("batch", 64) + return self._pure(queries, contents_list, ids_list, top_k, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using a Sentence Transformer model. 
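+ Each (query, passage) pair is scored with the loaded CrossEncoder model
+ (``model.predict`` with ``apply_softmax=True``; see ``sentence_transformer_run_model`` below),
+ so higher scores mean higher relevance.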
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param batch: The number of queries to be processed in a batch + + :return: tuple of lists containing the reranked contents, ids, and scores + """ + nested_list = [ + list(map(lambda x: [query, x], content_list)) + for query, content_list in zip(queries, contents_list) + ] + rerank_scores = flatten_apply( + sentence_transformer_run_model, + nested_list, + model=self.model, + batch_size=batch, + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def sentence_transformer_run_model(input_texts, model, batch_size: int): + try: + import torch + except ImportError: + raise ImportError( + "You have to install AutoRAG[gpu] to use SentenceTransformerReranker" + ) + batch_input_texts = make_batch(input_texts, batch_size) + results = [] + for batch_texts in batch_input_texts: + with torch.no_grad(): + pred_scores = model.predict(sentences=batch_texts, apply_softmax=True) + results.extend(pred_scores.tolist()) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/tart/__init__.py b/autorag-workspace/autorag/nodes/passagereranker/tart/__init__.py new file mode 100644 index 0000000..d204438 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/tart/__init__.py @@ -0,0 +1 @@ +from .tart import Tart diff --git a/autorag-workspace/autorag/nodes/passagereranker/tart/modeling_enc_t5.py b/autorag-workspace/autorag/nodes/passagereranker/tart/modeling_enc_t5.py new file mode 100644 index 0000000..8cf7d68 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/tart/modeling_enc_t5.py @@ -0,0 +1,152 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import copy + +from transformers.modeling_outputs import SequenceClassifierOutput +from transformers.models.t5.modeling_t5 import T5Config, T5PreTrainedModel, T5Stack +from transformers.utils.model_parallel_utils import assert_device_map, get_device_map + +from autorag.utils.util import empty_cuda_cache + + +class EncT5ForSequenceClassification(T5PreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"encoder\.embed_tokens\.weight", + ] + + def __init__(self, config: T5Config, dropout=0.1): + super().__init__(config) + try: + from torch import nn + except ImportError: + raise ImportError("Please install PyTorch to use TART reranker.") + self.num_labels = config.num_labels + self.config = config + + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = T5Stack(encoder_config, self.shared) + + self.dropout = nn.Dropout(dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + # Model parallel + self.model_parallel = False + self.device_map = None + + def parallelize(self, device_map=None): + try: + import torch + except ImportError: + raise ImportError("Please install PyTorch to use TART reranker.") + self.device_map = ( + get_device_map(len(self.encoder.block), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.encoder.block)) + self.encoder.parallelize(self.device_map) + self.classifier = self.classifier.to(self.encoder.first_device) + self.model_parallel = True + + def deparallelize(self): + self.encoder.deparallelize() + self.encoder = self.encoder.to("cpu") + self.model_parallel = False + self.device_map = None + empty_cuda_cache() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + try: + import torch + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + except ImportError: + raise ImportError("Please install PyTorch to use TART reranker.") + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + pooled_output = hidden_states[:, 0, :] # Take bos token (equiv. 
to ) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/autorag-workspace/autorag/nodes/passagereranker/tart/tart.py b/autorag-workspace/autorag/nodes/passagereranker/tart/tart.py new file mode 100644 index 0000000..56d3b72 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/tart/tart.py @@ -0,0 +1,139 @@ +from itertools import chain +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.nodes.passagereranker.tart.modeling_enc_t5 import ( + EncT5ForSequenceClassification, +) +from autorag.nodes.passagereranker.tart.tokenization_enc_t5 import EncT5Tokenizer +from autorag.utils.util import ( + make_batch, + sort_by_scores, + flatten_apply, + select_top_k, + result_to_dataframe, + empty_cuda_cache, +) + + +class Tart(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir) + try: + import torch + except ImportError: + raise ImportError( + "torch is not installed. Please install torch first to use TART reranker." + ) + model_name = "facebook/tart-full-flan-t5-xl" + self.model = EncT5ForSequenceClassification.from_pretrained(model_name) + self.tokenizer = EncT5Tokenizer.from_pretrained(model_name) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model = self.model.to(self.device) + + def __del__(self): + del self.model + del self.tokenizer + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + instruction = kwargs.pop("instruction", "Find passage to answer given question") + batch = kwargs.pop("batch", 64) + return self._pure(queries, contents, ids, top_k, instruction, batch) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + instruction: str = "Find passage to answer given question", + batch: int = 64, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using Tart. + TART is a reranker based on TART (https://github.com/facebookresearch/tart). 
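+ Each passage is paired with "{instruction} [SEP] {query}" and scored by a sequence-classification
+ head; the softmax probability of the positive class is used as the relevance score
+ (see ``tart_run_model`` below).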
+ You can rerank the passages with the instruction using TARTReranker. + The default model is facebook/tart-full-flan-t5-xl. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param instruction: The instruction for reranking. + Note: default instruction is "Find passage to answer given question" + The default instruction from the TART paper is being used. + If you want to use a different instruction, you can change the instruction through this parameter + :param batch: The number of queries to be processed in a batch + :return: tuple of lists containing the reranked contents, ids, and scores + """ + nested_list = [ + [["{} [SEP] {}".format(instruction, query)] for _ in contents] + for query, contents in zip(queries, contents_list) + ] + + rerank_scores = flatten_apply( + tart_run_model, + nested_list, + model=self.model, + batch_size=batch, + tokenizer=self.tokenizer, + device=self.device, + contents_list=contents_list, + ) + + df = pd.DataFrame( + { + "contents": contents_list, + "ids": ids_list, + "scores": rerank_scores, + } + ) + df[["contents", "ids", "scores"]] = df.apply( + sort_by_scores, axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +def tart_run_model( + input_texts, contents_list, model, batch_size: int, tokenizer, device +): + try: + import torch + import torch.nn.functional as F + except ImportError: + raise ImportError( + "torch is not installed. Please install torch first to use TART reranker." + ) + flattened_texts = list(chain.from_iterable(input_texts)) + flattened_contents = list(chain.from_iterable(contents_list)) + batch_input_texts = make_batch(flattened_texts, batch_size) + batch_contents_list = make_batch(flattened_contents, batch_size) + results = [] + for batch_texts, batch_contents in zip(batch_input_texts, batch_contents_list): + feature = tokenizer( + batch_texts, + batch_contents, + padding=True, + truncation=True, + return_tensors="pt", + ).to(device) + with torch.no_grad(): + pred_scores = model(**feature).logits + normalized_scores = [ + float(score[1]) for score in F.softmax(pred_scores, dim=1) + ] + results.extend(normalized_scores) + return results diff --git a/autorag-workspace/autorag/nodes/passagereranker/tart/tokenization_enc_t5.py b/autorag-workspace/autorag/nodes/passagereranker/tart/tokenization_enc_t5.py new file mode 100644 index 0000000..77de374 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/tart/tokenization_enc_t5.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from typing import Any, Dict, List, Optional + +from transformers import T5Tokenizer + + +class EncT5Tokenizer(T5Tokenizer): + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + unk_token="", + pad_token="", + extra_ids=100, + additional_special_tokens=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + vocab_file=vocab_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + extra_ids=extra_ids, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=sp_model_kwargs, + **kwargs, + ) + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + # normal case: some special tokens + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of zeros. + """ + bos = [self.bos_token_id] + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(bos + token_ids_0 + eos) * [0] + return len(bos + token_ids_0 + eos + token_ids_1 + eos) * [0] + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A sequence has the following format: + - single sequence: ` X ` + - pair of sequences: ` A B ` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. 
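+ Concretely, a single sequence becomes ``[bos_token_id] + token_ids_0 + [eos_token_id]``, and a
+ pair becomes ``[bos_token_id] + token_ids_0 + [eos_token_id] + token_ids_1 + [eos_token_id]``.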
+ """ + if token_ids_1 is None: + return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + else: + return ( + [self.bos_token_id] + + token_ids_0 + + [self.eos_token_id] + + token_ids_1 + + [self.eos_token_id] + ) diff --git a/autorag-workspace/autorag/nodes/passagereranker/time_reranker.py b/autorag-workspace/autorag/nodes/passagereranker/time_reranker.py new file mode 100644 index 0000000..41a7c59 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/time_reranker.py @@ -0,0 +1,72 @@ +import os +from datetime import datetime +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils import result_to_dataframe, fetch_contents + + +class TimeReranker(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir, *args, **kwargs) + self.corpus_df = pd.read_parquet( + os.path.join(project_dir, "data", "corpus.parquet"), engine="pyarrow" + ) + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + _, contents, scores, ids = self.cast_to_run(previous_result) + metadatas = fetch_contents(self.corpus_df, ids, column_name="metadata") + times = [ + [time["last_modified_datetime"] for time in time_list] + for time_list in metadatas + ] + top_k = kwargs.pop("top_k") + return self._pure(contents, scores, ids, top_k, times) + + def _pure( + self, + contents_list: List[List[str]], + scores_list: List[List[float]], + ids_list: List[List[str]], + top_k: int, + time_list: List[List[datetime]], + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank the passages based on merely the datetime of the passage. + It uses 'last_modified_datetime' key in the corpus metadata, + so the metadata should be in the format of {'last_modified_datetime': datetime.datetime} at the corpus data file. + + :param contents_list: The list of lists of contents + :param scores_list: The list of lists of scores from the initial ranking + :param ids_list: The list of lists of ids + :param top_k: The number of passages to be retrieved after reranking + :param time_list: The metadata list of lists of datetime.datetime + It automatically extracts the 'last_modified_datetime' key from the metadata in the corpus data. 
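+ Passages are sorted by this datetime in descending order, so the most recently modified
+ passages are ranked first.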
+ :return: The reranked contents, ids, and scores + """ + + def sort_row(contents, scores, ids, time, top_k): + combined = list(zip(contents, scores, ids, time)) + combined.sort(key=lambda x: x[3], reverse=True) + sorted_contents, sorted_scores, sorted_ids, _ = zip(*combined) + return ( + list(sorted_contents)[:top_k], + list(sorted_scores)[:top_k], + list(sorted_ids)[:top_k], + ) + + reranked_contents, reranked_scores, reranked_ids = zip( + *map( + sort_row, + contents_list, + scores_list, + ids_list, + time_list, + [top_k] * len(contents_list), + ) + ) + + return list(reranked_contents), list(reranked_ids), list(reranked_scores) diff --git a/autorag-workspace/autorag/nodes/passagereranker/upr.py b/autorag-workspace/autorag/nodes/passagereranker/upr.py new file mode 100644 index 0000000..d06c0cd --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/upr.py @@ -0,0 +1,160 @@ +import logging +from typing import List, Tuple + +import pandas as pd + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils import result_to_dataframe +from autorag.utils.util import select_top_k, sort_by_scores, empty_cuda_cache + +logger = logging.getLogger("AutoRAG") + + +class Upr(BasePassageReranker): + def __init__( + self, + project_dir: str, + use_bf16: bool = False, + prefix_prompt: str = "Passage: ", + suffix_prompt: str = "Please write a question based on this passage.", + *args, + **kwargs, + ): + """ + Initialize the UPR reranker node. + + :param project_dir: The project directory + :param use_bf16: Whether to use bfloat16 for the model. Default is False. + :param prefix_prompt: The prefix prompt for the language model that generates question for reranking. + Default is "Passage: ". + The prefix prompt serves as the initial context or instruction for the language model. + It sets the stage for what is expected in the output + :param suffix_prompt: The suffix prompt for the language model that generates question for reranking. + Default is "Please write a question based on this passage.". + The suffix prompt provides a cue or a closing instruction to the language model, + signaling how to conclude the generated text or what format to follow at the end. + :param kwargs: Extra arguments + """ + super().__init__(project_dir, *args, **kwargs) + + self.scorer = UPRScorer( + suffix_prompt=suffix_prompt, prefix_prompt=prefix_prompt, use_bf16=use_bf16 + ) + + def __del__(self): + del self.scorer + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, _, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + return self._pure(queries, contents, ids, top_k) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents based on their relevance to a query using UPR. + UPR is a reranker based on UPR (https://github.com/DevSinghSachan/unsupervised-passage-reranking). + The language model will make a question based on the passage and rerank the passages by the likelihood of the question. + The default model is t5-large. 
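+ Each passage is scored by the summed negative log-likelihood of the query tokens given the
+ passage prompt, so lower scores mean higher relevance and results are sorted in ascending order.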
+ + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + + :return: tuple of lists containing the reranked contents, ids, and scores + """ + df = pd.DataFrame( + { + "query": queries, + "contents": contents_list, + "ids": ids_list, + } + ) + + df["scores"] = df.apply( + lambda row: self.scorer.compute( + query=row["query"], contents=row["contents"] + ), + axis=1, + ) + df[["contents", "ids", "scores"]] = df.apply( + lambda x: sort_by_scores(x, reverse=False), axis=1, result_type="expand" + ) + results = select_top_k(df, ["contents", "ids", "scores"], top_k) + return ( + results["contents"].tolist(), + results["ids"].tolist(), + results["scores"].tolist(), + ) + + +class UPRScorer: + def __init__(self, suffix_prompt: str, prefix_prompt: str, use_bf16: bool = False): + try: + import torch + from transformers import T5Tokenizer, T5ForConditionalGeneration + except ImportError: + raise ImportError( + "torch is not installed. Please install torch to use UPRReranker." + ) + model_name = "t5-large" + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.tokenizer = T5Tokenizer.from_pretrained(model_name) + self.model = T5ForConditionalGeneration.from_pretrained( + model_name, torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ).to(self.device) + self.suffix_prompt = suffix_prompt + self.prefix_prompt = prefix_prompt + + def compute(self, query: str, contents: List[str]) -> List[float]: + try: + import torch + except ImportError: + raise ImportError( + "torch is not installed. Please install torch to use UPRReranker." + ) + query_token = self.tokenizer( + query, max_length=128, truncation=True, return_tensors="pt" + ) + prompts = list( + map( + lambda content: f"{self.prefix_prompt} {content} {self.suffix_prompt}", + contents, + ) + ) + prompt_token_outputs = self.tokenizer( + prompts, + padding="longest", + max_length=512, + pad_to_multiple_of=8, + truncation=True, + return_tensors="pt", + ) + + query_input_ids = torch.repeat_interleave( + query_token["input_ids"], len(contents), dim=0 + ).to(self.device) + + with torch.no_grad(): + logits = self.model( + input_ids=prompt_token_outputs["input_ids"].to(self.device), + attention_mask=prompt_token_outputs["attention_mask"].to(self.device), + labels=query_input_ids, + ).logits + log_softmax = torch.nn.functional.log_softmax(logits, dim=-1) + nll = -log_softmax.gather(2, query_input_ids.unsqueeze(2)).squeeze(2) + avg_nll = torch.sum(nll, dim=1) + return avg_nll.tolist() + + def __del__(self): + del self.model + del self.tokenizer + empty_cuda_cache() diff --git a/autorag-workspace/autorag/nodes/passagereranker/voyageai.py b/autorag-workspace/autorag/nodes/passagereranker/voyageai.py new file mode 100644 index 0000000..2868189 --- /dev/null +++ b/autorag-workspace/autorag/nodes/passagereranker/voyageai.py @@ -0,0 +1,109 @@ +import os +from typing import List, Tuple +import pandas as pd +import voyageai + +from autorag.nodes.passagereranker.base import BasePassageReranker +from autorag.utils.util import result_to_dataframe, get_event_loop, process_batch + + +class VoyageAIReranker(BasePassageReranker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir) + api_key = kwargs.pop("api_key", None) + api_key = os.getenv("VOYAGE_API_KEY", None) if api_key is None else api_key + if api_key is None: + 
raise KeyError( + "Please set the API key for VoyageAI rerank in the environment variable VOYAGE_API_KEY " + "or directly set it on the config YAML file." + ) + + self.voyage_client = voyageai.AsyncClient(api_key=api_key) + + def __del__(self): + del self.voyage_client + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries, contents, scores, ids = self.cast_to_run(previous_result) + top_k = kwargs.pop("top_k") + batch = kwargs.pop("batch", 8) + model = kwargs.pop("model", "rerank-2") + truncation = kwargs.pop("truncation", True) + return self._pure(queries, contents, ids, top_k, model, batch, truncation) + + def _pure( + self, + queries: List[str], + contents_list: List[List[str]], + ids_list: List[List[str]], + top_k: int, + model: str = "rerank-2", + batch: int = 8, + truncation: bool = True, + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Rerank a list of contents with VoyageAI rerank models. + You can get the API key from https://docs.voyageai.com/docs/api-key-and-installation and set it in the environment variable VOYAGE_API_KEY. + + :param queries: The list of queries to use for reranking + :param contents_list: The list of lists of contents to rerank + :param ids_list: The list of lists of ids retrieved from the initial ranking + :param top_k: The number of passages to be retrieved + :param model: The model name for VoyageAI rerank. + You can choose between "rerank-2" and "rerank-2-lite". + Default is "rerank-2". + :param batch: The number of queries to be processed in a batch + :param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents. + :return: Tuple of lists containing the reranked contents, ids, and scores + """ + tasks = [ + voyageai_rerank_pure( + self.voyage_client, model, query, contents, ids, top_k, truncation + ) + for query, contents, ids in zip(queries, contents_list, ids_list) + ] + loop = get_event_loop() + results = loop.run_until_complete(process_batch(tasks, batch)) + + content_result, id_result, score_result = zip(*results) + + return list(content_result), list(id_result), list(score_result) + + +async def voyageai_rerank_pure( + voyage_client: voyageai.AsyncClient, + model: str, + query: str, + documents: List[str], + ids: List[str], + top_k: int, + truncation: bool = True, +) -> Tuple[List[str], List[str], List[float]]: + """ + Rerank a list of contents with VoyageAI rerank models. + + :param voyage_client: The Voyage Client to use for reranking + :param model: The model name for VoyageAI rerank + :param query: The query to use for reranking + :param documents: The list of contents to rerank + :param ids: The list of ids corresponding to the documents + :param top_k: The number of passages to be retrieved + :param truncation: Whether to truncate the input to satisfy the 'context length limit' on the query and the documents. 
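+ Only the top_k results returned by the API are used; contents and ids are reordered to follow
+ the relevance ranking in the API response.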
+ :return: Tuple of lists containing the reranked contents, ids, and scores + """ + rerank_results = await voyage_client.rerank( + model=model, + query=query, + documents=documents, + top_k=top_k, + truncation=truncation, + ) + reranked_scores: List[float] = list( + map(lambda x: x.relevance_score, rerank_results.results) + ) + indices = list(map(lambda x: x.index, rerank_results.results)) + reranked_contents: List[str] = list(map(lambda i: documents[i], indices)) + reranked_ids: List[str] = list(map(lambda i: ids[i], indices)) + return reranked_contents, reranked_ids, reranked_scores diff --git a/autorag-workspace/autorag/nodes/promptmaker/__init__.py b/autorag-workspace/autorag/nodes/promptmaker/__init__.py new file mode 100644 index 0000000..65d53e4 --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/__init__.py @@ -0,0 +1,3 @@ +from .long_context_reorder import LongContextReorder +from .window_replacement import WindowReplacement +from .fstring import Fstring diff --git a/autorag-workspace/autorag/nodes/promptmaker/base.py b/autorag-workspace/autorag/nodes/promptmaker/base.py new file mode 100644 index 0000000..be69e4f --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/base.py @@ -0,0 +1,34 @@ +import logging +from abc import ABCMeta +from pathlib import Path +from typing import Union + +import pandas as pd + +from autorag.schema.base import BaseModule + +logger = logging.getLogger("AutoRAG") + + +class BasePromptMaker(BaseModule, metaclass=ABCMeta): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + logger.info( + f"Initialize prompt maker node - {self.__class__.__name__} module..." + ) + + def __del__(self): + logger.info(f"Prompt maker node - {self.__class__.__name__} module is deleted.") + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info(f"Running prompt maker node - {self.__class__.__name__} module...") + # get query and retrieved contents from previous_result + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." + assert ( + "retrieved_contents" in previous_result.columns + ), "previous_result must have retrieved_contents column." + query = previous_result["query"].tolist() + retrieved_contents = previous_result["retrieved_contents"].tolist() + prompt = kwargs.pop("prompt") + return query, retrieved_contents, prompt diff --git a/autorag-workspace/autorag/nodes/promptmaker/fstring.py b/autorag-workspace/autorag/nodes/promptmaker/fstring.py new file mode 100644 index 0000000..e1b0a69 --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/fstring.py @@ -0,0 +1,49 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.promptmaker.base import BasePromptMaker +from autorag.utils import result_to_dataframe + + +class Fstring(BasePromptMaker): + @result_to_dataframe(["prompts"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + query, retrieved_contents, prompt = self.cast_to_run( + previous_result, *args, **kwargs + ) + return self._pure(prompt, query, retrieved_contents) + + def _pure( + self, prompt: str, queries: List[str], retrieved_contents: List[List[str]] + ) -> List[str]: + """ + Make a prompt using f-string from a query and retrieved_contents. + You must type a prompt or prompt list at a config YAML file like this: + + .. 
Code:: yaml + nodes: + - node_type: prompt_maker + modules: + - module_type: fstring + prompt: [Answer this question: {query} \n\n {retrieved_contents}, + Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}] + + :param prompt: A prompt string. + :param queries: List of query strings. + :param retrieved_contents: List of retrieved contents. + :return: Prompts that are made by f-string. + """ + + def fstring_row( + _prompt: str, _query: str, _retrieved_contents: List[str] + ) -> str: + contents_str = "\n\n".join(_retrieved_contents) + return _prompt.format(query=_query, retrieved_contents=contents_str) + + return list( + map( + lambda x: fstring_row(prompt, x[0], x[1]), + zip(queries, retrieved_contents), + ) + ) diff --git a/autorag-workspace/autorag/nodes/promptmaker/long_context_reorder.py b/autorag-workspace/autorag/nodes/promptmaker/long_context_reorder.py new file mode 100644 index 0000000..d910673 --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/long_context_reorder.py @@ -0,0 +1,83 @@ +import logging +from typing import List + +import numpy as np +import pandas as pd + +from autorag.nodes.promptmaker.base import BasePromptMaker +from autorag.utils import result_to_dataframe + +logger = logging.getLogger("AutoRAG") + + +class LongContextReorder(BasePromptMaker): + @result_to_dataframe(["prompts"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + query, retrieved_contents, prompt = self.cast_to_run( + previous_result, *args, **kwargs + ) + assert ( + "retrieve_scores" in previous_result.columns + ), "previous_result must have retrieve_scores column." + retrieve_scores = previous_result["retrieve_scores"].tolist() + return self._pure(prompt, query, retrieved_contents, retrieve_scores) + + def _pure( + self, + prompt: str, + queries: List[str], + retrieved_contents: List[List[str]], + retrieve_scores: List[List[float]], + ) -> List[str]: + """ + Models struggle to access significant details found + in the center of extended contexts. A study + (https://arxiv.org/abs/2307.03172) observed that the best + performance typically arises when crucial data is positioned + at the start or conclusion of the input context. Additionally, + as the input context lengthens, performance drops notably, even + in models designed for long contexts." + + .. Code:: yaml + nodes: + - node_type: prompt_maker + modules: + - module_type: long_context_reorder + prompt: [Answer this question: {query} \n\n {retrieved_contents}, + Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}] + + :param prompt: A prompt string. + :param queries: List of query strings. + :param retrieved_contents: List of retrieved contents. + :param retrieve_scores: List of `retrieve scores`. + :return: Prompts that are made by long context reorder. 
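+ Note: this implementation keeps the original retrieved order and appends the highest-scoring
+ passage once more at the end of the context, so the most relevant passage also sits at the
+ edge of the prompt.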
+ """ + + def long_context_reorder_row( + _prompt: str, + _query: str, + _retrieved_contents: List[str], + _retrieve_scores: List[float], + ) -> str: + if isinstance(_retrieved_contents, np.ndarray): + _retrieved_contents = _retrieved_contents.tolist() + if not len(_retrieved_contents) == len(_retrieve_scores): + logger.info("If you use a summarizer, the reorder will not proceed.") + return _prompt.format( + query=_query, retrieved_contents="\n\n".join(_retrieved_contents) + ) + content_scores = list(zip(_retrieved_contents, _retrieve_scores)) + sorted_content_scores = sorted( + content_scores, key=lambda x: x[1], reverse=True + ) + content_result, score_result = zip(*sorted_content_scores) + _retrieved_contents.append(content_result[0]) + contents_str = "\n\n".join(_retrieved_contents) + return _prompt.format(query=_query, retrieved_contents=contents_str) + + return list( + map( + lambda x: long_context_reorder_row(prompt, x[0], x[1], x[2]), + zip(queries, retrieved_contents, retrieve_scores), + ) + ) diff --git a/autorag-workspace/autorag/nodes/promptmaker/run.py b/autorag-workspace/autorag/nodes/promptmaker/run.py new file mode 100644 index 0000000..6331321 --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/run.py @@ -0,0 +1,280 @@ +import os +import pathlib +from copy import deepcopy +from typing import List, Dict, Optional, Union + +import pandas as pd +import tokenlog + +from autorag.evaluation import evaluate_generation +from autorag.evaluation.util import cast_metrics +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.support import get_support_modules +from autorag.utils import validate_qa_dataset +from autorag.utils.util import make_combinations, explode, split_dataframe + + +def run_prompt_maker_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run prompt maker node. + With this function, you can select the best prompt maker module. + As default, when you can use only one module, the evaluation will be skipped. + If you want to select the best prompt among modules, you can use strategies. + When you use them, you must pass 'generator_modules' and its parameters at strategies. + Because it uses generator modules and generator metrics for evaluation this module. + It is recommended to use one params and modules for evaluation, + but you can use multiple params and modules for evaluation. + When you don't set generator module at strategies, it will use the default generator module. + The default generator module is llama_index_llm with openai gpt-3.5-turbo model. + + :param modules: Prompt maker module classes to run. + :param module_params: Prompt maker module parameters. + :param previous_result: Previous result dataframe. + Could be query expansion's best result or qa data. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for prompt maker node. + :return: The best result dataframe. + It contains previous result columns and prompt maker's result columns which is 'prompts'. 
+ """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + node_dir = os.path.join(node_line_dir, "prompt_maker") + if not os.path.exists(node_dir): + os.makedirs(node_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + + # run modules + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # get average token usage + token_usages = [] + for i, result in enumerate(results): + token_logger = tokenlog.getLogger( + f"prompt_maker_{i}", strategies.get("tokenizer", "gpt2") + ) + token_logger.query_batch(result["prompts"].tolist()) + token_usages.append(token_logger.get_token_usage() / len(result)) + + # save results to folder + filepaths = list( + map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + # make summary file + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + "average_prompt_token": token_usages, + } + ) + + metric_names, metric_params = cast_metrics(strategies.get("metrics")) + + # Run evaluation when there are more than one module. + if len(modules) > 1: + # pop general keys from strategies (e.g. metrics, speed_threshold) + general_key = ["metrics", "speed_threshold", "token_threshold", "tokenizer"] + general_strategy = dict( + filter(lambda x: x[0] in general_key, strategies.items()) + ) + extra_strategy = dict( + filter(lambda x: x[0] not in general_key, strategies.items()) + ) + + # first, filter by threshold if it is enabled. + if general_strategy.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, general_strategy["speed_threshold"], filenames + ) + + # Calculate tokens and save to summary + if general_strategy.get("token_threshold") is not None: + results, filenames = filter_by_threshold( + results, token_usages, general_strategy["token_threshold"], filenames + ) + + # run metrics before filtering + if metric_names is None or len(metric_names) <= 0: + raise ValueError( + "You must at least one metrics for prompt maker evaluation." 
+ ) + + # get generator modules from strategy + generator_callables, generator_params = make_generator_callable_params( + extra_strategy + ) + + # get generation_gt + qa_data = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + validate_qa_dataset(qa_data) + generation_gt = qa_data["generation_gt"].tolist() + generation_gt = list(map(lambda x: x.tolist(), generation_gt)) + + metric_inputs = [MetricInput(generation_gt=gen_gt) for gen_gt in generation_gt] + + all_prompts = [] + for result in results: + all_prompts.extend(result["prompts"].tolist()) + + evaluation_result_all = evaluate_one_prompt_maker_node( + all_prompts, + generator_callables, + generator_params, + metric_inputs * len(results), + general_strategy["metrics"], + project_dir, + strategy_name=strategies.get("strategy", "mean"), + ) + evaluation_results = split_dataframe( + evaluation_result_all, chunk_size=len(results[0]) + ) + + evaluation_df = pd.DataFrame( + { + "filename": filenames, + **{ + f"prompt_maker_{metric_name}": list( + map(lambda x: x[metric_name].mean(), evaluation_results) + ) + for metric_name in metric_names + }, + } + ) + summary_df = pd.merge( + on="filename", left=summary_df, right=evaluation_df, how="left" + ) + + best_result, best_filename = select_best( + evaluation_results, + metric_names, + filenames, + strategies.get("strategy", "mean"), + ) + # change metric name columns to prompt_maker_metric_name + best_result = best_result.rename( + columns={ + metric_name: f"prompt_maker_{metric_name}" + for metric_name in metric_names + } + ) + best_result = best_result.drop(columns=["generated_texts"]) + else: + best_result, best_filename = results[0], filenames[0] + + # add 'is_best' column at summary file + summary_df["is_best"] = summary_df["filename"] == best_filename + + best_result = pd.concat([previous_result, best_result], axis=1) + + # save files + summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"), + index=False, + ) + + return best_result + + +def make_generator_callable_params(strategy_dict: Dict): + node_dict = deepcopy(strategy_dict) + generator_module_list: Optional[List[Dict]] = node_dict.pop( + "generator_modules", None + ) + if generator_module_list is None: + generator_module_list = [ + { + "module_type": "llama_index_llm", + "llm": "openai", + "model": "gpt-3.5-turbo", + } + ] + node_params = node_dict + modules = list( + map( + lambda module_dict: get_support_modules(module_dict.pop("module_type")), + generator_module_list, + ) + ) + param_combinations = list( + map( + lambda module_dict: make_combinations({**module_dict, **node_params}), + generator_module_list, + ) + ) + return explode(modules, param_combinations) + + +def evaluate_one_prompt_maker_node( + prompts: List[str], + generator_classes: List, + generator_params: List[Dict], + metric_inputs: List[MetricInput], + metrics: Union[List[str], List[Dict]], + project_dir, + strategy_name: str, +) -> pd.DataFrame: + input_df = pd.DataFrame({"prompts": prompts}) + generator_results = list( + map( + lambda x: x[0].run_evaluator( + project_dir=project_dir, previous_result=input_df, **x[1] + ), + zip(generator_classes, generator_params), + ) + ) + evaluation_results = list( + map( + lambda x: evaluate_generator_result(x[0], metric_inputs, metrics), + zip(generator_results, generator_classes), + ) + ) + metric_names = ( + list(map(lambda x: x["metric_name"], metrics)) + if 
isinstance(metrics[0], dict) + else metrics + ) + best_result, _ = select_best( + evaluation_results, metric_names, strategy_name=strategy_name + ) + best_result = pd.concat([input_df, best_result], axis=1) + return best_result # it has 'generated_texts' column + + +def evaluate_generator_result( + result_df: pd.DataFrame, + metric_inputs: List[MetricInput], + metrics: Union[List[str], List[Dict]], +) -> pd.DataFrame: + @evaluate_generation(metric_inputs=metric_inputs, metrics=metrics) + def evaluate(df): + return df["generated_texts"].tolist() + + return evaluate(result_df) diff --git a/autorag-workspace/autorag/nodes/promptmaker/window_replacement.py b/autorag-workspace/autorag/nodes/promptmaker/window_replacement.py new file mode 100644 index 0000000..1a80f60 --- /dev/null +++ b/autorag-workspace/autorag/nodes/promptmaker/window_replacement.py @@ -0,0 +1,85 @@ +import logging +import os +from typing import List, Dict + +import pandas as pd + +from autorag.nodes.promptmaker.base import BasePromptMaker +from autorag.utils import result_to_dataframe, fetch_contents + +logger = logging.getLogger("AutoRAG") + + +class WindowReplacement(BasePromptMaker): + def __init__(self, project_dir: str, *args, **kwargs): + super().__init__(project_dir, *args, **kwargs) + # load corpus + data_dir = os.path.join(project_dir, "data") + self.corpus_data = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + + @result_to_dataframe(["prompts"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + query, retrieved_contents, prompt = self.cast_to_run( + previous_result, *args, **kwargs + ) + retrieved_ids = previous_result["retrieved_ids"].tolist() + # get metadata from corpus + retrieved_metadata = fetch_contents( + self.corpus_data, retrieved_ids, column_name="metadata" + ) + return self._pure(prompt, query, retrieved_contents, retrieved_metadata) + + def _pure( + self, + prompt: str, + queries: List[str], + retrieved_contents: List[List[str]], + retrieved_metadata: List[List[Dict]], + ) -> List[str]: + """ + Replace retrieved_contents with a window to create a Prompt + (only available for corpus chunked with Sentence window method) + You must type a prompt or prompt list at a config YAML file like this: + + .. Code:: yaml + nodes: + - node_type: prompt_maker + modules: + - module_type: window_replacement + prompt: [Answer this question: {query} \n\n {retrieved_contents}, + Read the passages carefully and answer this question: {query} \n\n Passages: {retrieved_contents}] + + :param prompt: A prompt string. + :param queries: List of query strings. + :param retrieved_contents: List of retrieved contents. + :param retrieved_metadata: List of retrieved metadata. + :return: Prompts that are made by window_replacement. + """ + + def window_replacement_row( + _prompt: str, + _query: str, + _retrieved_contents, + _retrieved_metadata: List[Dict], + ) -> str: + window_list = [] + for content, metadata in zip(_retrieved_contents, _retrieved_metadata): + if "window" in metadata: + window_list.append(metadata["window"]) + else: + window_list.append(content) + logger.info( + "Only available for corpus chunked with Sentence window method." + "window_replacement will not proceed." 
+ ) + contents_str = "\n\n".join(window_list) + return _prompt.format(query=_query, retrieved_contents=contents_str) + + return list( + map( + lambda x: window_replacement_row(prompt, x[0], x[1], x[2]), + zip(queries, retrieved_contents, retrieved_metadata), + ) + ) diff --git a/autorag-workspace/autorag/nodes/queryexpansion/__init__.py b/autorag-workspace/autorag/nodes/queryexpansion/__init__.py new file mode 100644 index 0000000..1a56f3d --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/__init__.py @@ -0,0 +1,4 @@ +from .hyde import HyDE +from .multi_query_expansion import MultiQueryExpansion +from .pass_query_expansion import PassQueryExpansion +from .query_decompose import QueryDecompose diff --git a/autorag-workspace/autorag/nodes/queryexpansion/base.py b/autorag-workspace/autorag/nodes/queryexpansion/base.py new file mode 100644 index 0000000..d0da7be --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/base.py @@ -0,0 +1,62 @@ +import abc +import logging +from pathlib import Path +from typing import List, Union + +import pandas as pd + +from autorag.nodes.util import make_generator_callable_param +from autorag.schema import BaseModule +from autorag.utils import validate_qa_dataset + +logger = logging.getLogger("AutoRAG") + + +class BaseQueryExpansion(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: Union[str, Path], *args, **kwargs): + logger.info( + f"Initialize query expansion node - {self.__class__.__name__} module..." + ) + # set generator module for query expansion + generator_class, generator_param = make_generator_callable_param(kwargs) + self.generator = generator_class(project_dir, **generator_param) + + def __del__(self): + del self.generator + logger.info( + f"Delete query expansion node - {self.__class__.__name__} module..." + ) + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info( + f"Running query expansion node - {self.__class__.__name__} module..." + ) + validate_qa_dataset(previous_result) + + # find queries columns + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." 
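A minimal, self-contained sketch of the window replacement behaviour implemented in `WindowReplacement._pure` above, using hypothetical contents and metadata (in practice the metadata comes from a corpus chunked with the sentence-window method):

.. Code:: python

    # Hypothetical retrieved contents and corpus metadata for one query.
    retrieved_contents = ["short sentence one.", "short sentence two."]
    retrieved_metadata = [
        {"window": "context before. short sentence one. context after."},
        {},  # no "window" key -> fall back to the original content (and log an info message)
    ]

    window_list = [
        metadata.get("window", content)
        for content, metadata in zip(retrieved_contents, retrieved_metadata)
    ]
    prompt = "Answer this question: {query}\n\n{retrieved_contents}".format(
        query="What happened?", retrieved_contents="\n\n".join(window_list)
    )
    print(prompt)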
+ queries = previous_result["query"].tolist() + return queries + + @staticmethod + def _check_expanded_query(queries: List[str], expanded_queries: List[List[str]]): + return list( + map( + lambda query, expanded_query_list: check_expanded_query( + query, expanded_query_list + ), + queries, + expanded_queries, + ) + ) + + +def check_expanded_query(query: str, expanded_query_list: List[str]): + # check if the expanded query is the same as the original query + expanded_query_list = list(map(lambda x: x.strip(), expanded_query_list)) + return [ + expanded_query if expanded_query else query + for expanded_query in expanded_query_list + ] diff --git a/autorag-workspace/autorag/nodes/queryexpansion/hyde.py b/autorag-workspace/autorag/nodes/queryexpansion/hyde.py new file mode 100644 index 0000000..86bfa4b --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/hyde.py @@ -0,0 +1,43 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.queryexpansion.base import BaseQueryExpansion +from autorag.utils import result_to_dataframe + +hyde_prompt = "Please write a passage to answer the question" + + +class HyDE(BaseQueryExpansion): + @result_to_dataframe(["queries"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries = self.cast_to_run(previous_result, *args, **kwargs) + + # pop prompt from kwargs + prompt = kwargs.pop("prompt", hyde_prompt) + kwargs.pop("generator_module_type", None) + + expanded_queries = self._pure(queries, prompt, **kwargs) + return self._check_expanded_query(queries, expanded_queries) + + def _pure(self, queries: List[str], prompt: str = hyde_prompt, **generator_params): + """ + HyDE, which inspired by "Precise Zero-shot Dense Retrieval without Relevance Labels" (https://arxiv.org/pdf/2212.10496.pdf) + LLM model creates a hypothetical passage. + And then, retrieve passages using hypothetical passage as a query. + :param queries: List[str], queries to retrieve. + :param prompt: Prompt to use when generating hypothetical passage + :return: List[List[str]], List of hyde results. + """ + full_prompts = list( + map( + lambda x: (prompt if not bool(prompt) else hyde_prompt) + + f"\nQuestion: {x}\nPassage:", + queries, + ) + ) + input_df = pd.DataFrame({"prompts": full_prompts}) + result_df = self.generator.pure(previous_result=input_df, **generator_params) + answers = result_df["generated_texts"].tolist() + results = list(map(lambda x: [x], answers)) + return results diff --git a/autorag-workspace/autorag/nodes/queryexpansion/multi_query_expansion.py b/autorag-workspace/autorag/nodes/queryexpansion/multi_query_expansion.py new file mode 100644 index 0000000..c76d6fc --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/multi_query_expansion.py @@ -0,0 +1,57 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.queryexpansion.base import BaseQueryExpansion +from autorag.utils import result_to_dataframe + +multi_query_expansion_prompt = """You are an AI language model assistant. + Your task is to generate 3 different versions of the given user + question to retrieve relevant documents from a vector database. + By generating multiple perspectives on the user question, + your goal is to help the user overcome some of the limitations + of distance-based similarity search. Provide these alternative + questions separated by newlines. 
Original question: {query}""" + + +class MultiQueryExpansion(BaseQueryExpansion): + @result_to_dataframe(["queries"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries = self.cast_to_run(previous_result, *args, **kwargs) + + # pop prompt from kwargs + prompt = kwargs.pop("prompt", multi_query_expansion_prompt) + kwargs.pop("generator_module_type", None) + + expanded_queries = self._pure(queries, prompt, **kwargs) + return self._check_expanded_query(queries, expanded_queries) + + def _pure( + self, queries, prompt: str = multi_query_expansion_prompt, **kwargs + ) -> List[List[str]]: + """ + Expand a list of queries using a multi-query expansion approach. + LLM model generate 3 different versions queries for each input query. + + :param queries: List[str], queries to decompose. + :param prompt: str, prompt to use for multi-query expansion. + default prompt comes from langchain MultiQueryRetriever default query prompt. + :return: List[List[str]], list of expansion query. + """ + full_prompts = list(map(lambda x: prompt.format(query=x), queries)) + input_df = pd.DataFrame({"prompts": full_prompts}) + result_df = self.generator.pure(previous_result=input_df, **kwargs) + answers = result_df["generated_texts"].tolist() + results = list( + map(lambda x: get_multi_query_expansion(x[0], x[1]), zip(queries, answers)) + ) + return results + + +def get_multi_query_expansion(query: str, answer: str) -> List[str]: + try: + queries = answer.split("\n") + queries.insert(0, query) + return queries + except: + return [query] diff --git a/autorag-workspace/autorag/nodes/queryexpansion/pass_query_expansion.py b/autorag-workspace/autorag/nodes/queryexpansion/pass_query_expansion.py new file mode 100644 index 0000000..1b7f6b6 --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/pass_query_expansion.py @@ -0,0 +1,22 @@ +import pandas as pd + +from autorag.nodes.queryexpansion.base import BaseQueryExpansion +from autorag.utils import result_to_dataframe + + +class PassQueryExpansion(BaseQueryExpansion): + @result_to_dataframe(["queries"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + """ + Do not perform query expansion. + Return with the same queries. + The dimension will be 2-d list, and the column name will be 'queries'. + """ + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." + queries = previous_result["query"].tolist() + return list(map(lambda x: [x], queries)) + + def _pure(self, *args, **kwargs): + pass diff --git a/autorag-workspace/autorag/nodes/queryexpansion/query_decompose.py b/autorag-workspace/autorag/nodes/queryexpansion/query_decompose.py new file mode 100644 index 0000000..95e8d51 --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/query_decompose.py @@ -0,0 +1,111 @@ +from typing import List + +import pandas as pd + +from autorag.nodes.queryexpansion.base import BaseQueryExpansion +from autorag.utils import result_to_dataframe + +decompose_prompt = """Decompose a question in self-contained sub-questions. Use \"The question needs no decomposition\" when no decomposition is needed. + + Example 1: + + Question: Is Hamlet more common on IMDB than Comedy of Errors? + Decompositions: + 1: How many listings of Hamlet are there on IMDB? + 2: How many listing of Comedy of Errors is there on IMDB? + + Example 2: + + Question: Are birds important to badminton? 
+ + Decompositions: + The question needs no decomposition + + Example 3: + + Question: Is it legal for a licensed child driving Mercedes-Benz to be employed in US? + + Decompositions: + 1: What is the minimum driving age in the US? + 2: What is the minimum age for someone to be employed in the US? + + Example 4: + + Question: Are all cucumbers the same texture? + + Decompositions: + The question needs no decomposition + + Example 5: + + Question: Hydrogen's atomic number squared exceeds number of Spice Girls? + + Decompositions: + 1: What is the atomic number of hydrogen? + 2: How many Spice Girls are there? + + Example 6: + + Question: {question} + + Decompositions: + """ + + +class QueryDecompose(BaseQueryExpansion): + @result_to_dataframe(["queries"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries = self.cast_to_run(previous_result, *args, **kwargs) + + # pop prompt from kwargs + prompt = kwargs.pop("prompt", decompose_prompt) + kwargs.pop("generator_module_type", None) + + expanded_queries = self._pure(queries, prompt, **kwargs) + return self._check_expanded_query(queries, expanded_queries) + + def _pure( + self, queries: List[str], prompt: str = decompose_prompt, *args, **kwargs + ) -> List[List[str]]: + """ + decompose query to little piece of questions. + :param queries: List[str], queries to decompose. + :param prompt: str, prompt to use for query decomposition. + default prompt comes from Visconde's StrategyQA few-shot prompt. + :return: List[List[str]], list of decomposed query. Return input query if query is not decomposable. + """ + full_prompts = [] + for query in queries: + if bool(prompt): + full_prompt = f"prompt: {prompt}\n\n question: {query}" + else: + full_prompt = decompose_prompt.format(question=query) + full_prompts.append(full_prompt) + input_df = pd.DataFrame({"prompts": full_prompts}) + result_df = self.generator.pure(previous_result=input_df, *args, **kwargs) + answers = result_df["generated_texts"].tolist() + results = list( + map(lambda x: get_query_decompose(x[0], x[1]), zip(queries, answers)) + ) + return results + + +def get_query_decompose(query: str, answer: str) -> List[str]: + """ + decompose query to little piece of questions. + :param query: str, query to decompose. + :param answer: str, answer from query_decompose function. + :return: List[str], list of a decomposed query. Return input query if query is not decomposable. 
+ """ + if answer.lower() == "the question needs no decomposition": + return [query] + try: + lines = [line.strip() for line in answer.splitlines() if line.strip()] + if lines[0].startswith("Decompositions:"): + lines.pop(0) + questions = [line.split(":", 1)[1].strip() for line in lines if ":" in line] + if not questions: + return [query] + return questions + except: + return [query] diff --git a/autorag-workspace/autorag/nodes/queryexpansion/run.py b/autorag-workspace/autorag/nodes/queryexpansion/run.py new file mode 100644 index 0000000..c2cd0c5 --- /dev/null +++ b/autorag-workspace/autorag/nodes/queryexpansion/run.py @@ -0,0 +1,276 @@ +import logging +import os +import pathlib +from copy import deepcopy +from typing import List, Dict, Optional + +import pandas as pd + +from autorag.nodes.retrieval.run import evaluate_retrieval_node +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.support import get_support_modules +from autorag.utils.util import make_combinations, explode + +logger = logging.getLogger("AutoRAG") + + +def run_query_expansion_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among query expansion node results. + Initially, retrieval is run using expanded_queries, the result of the query_expansion module. + The retrieval module is run as a combination of the retrieval_modules in strategies. + If there are multiple retrieval_modules, run them all and choose the best result. + If there are no retrieval_modules, run them with the default of bm25. + In this way, the best result is selected for each module, and then the best result is selected. + + :param modules: Query expansion modules to run. + :param module_params: Query expansion module parameters. + :param previous_result: Previous result dataframe. + In this case, it would be qa data. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for query expansion node. + :return: The best result dataframe. + """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + node_dir = os.path.join(node_line_dir, "query_expansion") + if not os.path.exists(node_dir): + os.makedirs(node_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + + # run query expansion + results, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(modules, module_params), + ) + ) + average_times = list(map(lambda x: x / len(results[0]), execution_times)) + + # save results to folder + pseudo_module_params = deepcopy(module_params) + for i, module_param in enumerate(pseudo_module_params): + if "prompt" in module_params: + module_param["prompt"] = str(i) + filepaths = list( + map(lambda x: os.path.join(node_dir, f"{x}.parquet"), range(len(modules))) + ) + list( + map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)) + ) # execute save to parquet + filenames = list(map(lambda x: os.path.basename(x), filepaths)) + + # make summary file + summary_df = pd.DataFrame( + { + "filename": filenames, + "module_name": list(map(lambda module: module.__name__, modules)), + "module_params": module_params, + "execution_time": average_times, + } + ) + + # Run evaluation when there are more than one module. 
+ if len(modules) > 1: + # pop general keys from strategies (e.g. metrics, speed_threshold) + general_key = ["metrics", "speed_threshold", "strategy"] + general_strategy = dict( + filter(lambda x: x[0] in general_key, strategies.items()) + ) + extra_strategy = dict( + filter(lambda x: x[0] not in general_key, strategies.items()) + ) + + # first, filter by threshold if it is enabled. + if general_strategy.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, general_strategy["speed_threshold"], filenames + ) + + # check metrics in strategy + if general_strategy.get("metrics") is None: + raise ValueError( + "You must at least one metrics for query expansion evaluation." + ) + + if extra_strategy.get("top_k") is None: + extra_strategy["top_k"] = 10 # default value + + # get retrieval modules from strategy + retrieval_callables, retrieval_params = make_retrieval_callable_params( + extra_strategy + ) + + # get retrieval_gt + retrieval_gt = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + )["retrieval_gt"].tolist() + + # make rows to metric_inputs + metric_inputs = [ + MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt) + for ret_gt, query, gen_gt in zip( + retrieval_gt, + previous_result["query"].tolist(), + previous_result["generation_gt"].tolist(), + ) + ] + + # run evaluation + evaluation_results = list( + map( + lambda result: evaluate_one_query_expansion_node( + retrieval_callables, + retrieval_params, + [ + setattr(metric_input, "queries", queries) or metric_input + for metric_input, queries in zip( + metric_inputs, result["queries"].to_list() + ) + ], + general_strategy["metrics"], + project_dir, + previous_result, + general_strategy.get("strategy", "mean"), + ), + results, + ) + ) + + evaluation_df = pd.DataFrame( + { + "filename": filenames, + **{ + f"query_expansion_{metric_name}": list( + map(lambda x: x[metric_name].mean(), evaluation_results) + ) + for metric_name in general_strategy["metrics"] + }, + } + ) + summary_df = pd.merge( + on="filename", left=summary_df, right=evaluation_df, how="left" + ) + + best_result, best_filename = select_best( + evaluation_results, + general_strategy["metrics"], + filenames, + strategies.get("strategy", "mean"), + ) + # change metric name columns to query_expansion_metric_name + best_result = best_result.rename( + columns={ + metric_name: f"query_expansion_{metric_name}" + for metric_name in strategies["metrics"] + } + ) + best_result = best_result.drop( + columns=["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + else: + best_result, best_filename = results[0], filenames[0] + best_result = pd.concat([previous_result, best_result], axis=1) + + # add 'is_best' column at summary file + summary_df["is_best"] = summary_df["filename"] == best_filename + + # save files + summary_df.to_csv(os.path.join(node_dir, "summary.csv"), index=False) + best_result.to_parquet( + os.path.join(node_dir, f"best_{os.path.splitext(best_filename)[0]}.parquet"), + index=False, + ) + + return best_result + + +def evaluate_one_query_expansion_node( + retrieval_funcs: List, + retrieval_params: List[Dict], + metric_inputs: List[MetricInput], + metrics: List[str], + project_dir, + previous_result: pd.DataFrame, + strategy_name: str, +) -> pd.DataFrame: + previous_result["queries"] = [ + metric_input.queries for metric_input in metric_inputs + ] + retrieval_results = list( + map( + lambda x: x[0].run_evaluator( + project_dir=project_dir, 
previous_result=previous_result, **x[1] + ), + zip(retrieval_funcs, retrieval_params), + ) + ) + evaluation_results = list( + map( + lambda x: evaluate_retrieval_node( + x, + metric_inputs, + metrics, + ), + retrieval_results, + ) + ) + best_result, _ = select_best( + evaluation_results, metrics, strategy_name=strategy_name + ) + best_result = pd.concat([previous_result, best_result], axis=1) + return best_result + + +def make_retrieval_callable_params(strategy_dict: Dict): + """ + strategy_dict looks like this: + + .. Code:: json + + { + "metrics": ["retrieval_f1", "retrieval_recall"], + "top_k": 50, + "retrieval_modules": [ + {"module_type": "bm25"}, + {"module_type": "vectordb", "embedding_model": ["openai", "huggingface"]} + ] + } + + """ + node_dict = deepcopy(strategy_dict) + retrieval_module_list: Optional[List[Dict]] = node_dict.pop( + "retrieval_modules", None + ) + if retrieval_module_list is None: + retrieval_module_list = [ + { + "module_type": "bm25", + } + ] + node_params = node_dict + modules = list( + map( + lambda module_dict: get_support_modules(module_dict.pop("module_type")), + retrieval_module_list, + ) + ) + param_combinations = list( + map( + lambda module_dict: make_combinations({**module_dict, **node_params}), + retrieval_module_list, + ) + ) + return explode(modules, param_combinations) diff --git a/autorag-workspace/autorag/nodes/retrieval/__init__.py b/autorag-workspace/autorag/nodes/retrieval/__init__.py new file mode 100644 index 0000000..cba3b51 --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/__init__.py @@ -0,0 +1,4 @@ +from .bm25 import BM25 +from .hybrid_cc import HybridCC +from .hybrid_rrf import HybridRRF +from .vectordb import VectorDB diff --git a/autorag-workspace/autorag/nodes/retrieval/base.py b/autorag-workspace/autorag/nodes/retrieval/base.py new file mode 100644 index 0000000..7556c9a --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/base.py @@ -0,0 +1,127 @@ +import abc +import logging +import os +from typing import List, Union, Tuple + +import pandas as pd + +from autorag.schema import BaseModule +from autorag.support import get_support_modules +from autorag.utils import fetch_contents, result_to_dataframe, validate_qa_dataset +from autorag.utils.util import pop_params + +logger = logging.getLogger("AutoRAG") + + +class BaseRetrieval(BaseModule, metaclass=abc.ABCMeta): + def __init__(self, project_dir: str, *args, **kwargs): + logger.info(f"Initialize retrieval node - {self.__class__.__name__}") + + self.resources_dir = os.path.join(project_dir, "resources") + data_dir = os.path.join(project_dir, "data") + # fetch data from corpus_data + self.corpus_df = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + + def __del__(self): + logger.info(f"Deleting retrieval node - {self.__class__.__name__} module...") + + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + logger.info(f"Running retrieval node - {self.__class__.__name__} module...") + validate_qa_dataset(previous_result) + # find queries columns & type cast queries + assert ( + "query" in previous_result.columns + ), "previous_result must have query column." 
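To make the expansion in `make_retrieval_callable_params` above concrete, here is a rough, self-contained sketch. It assumes `make_combinations` turns every list-valued parameter into a cartesian product and `explode` pairs each module with its own combinations; the exact behaviour of those helpers is not shown in this diff.

.. Code:: python

    from itertools import product

    # Hypothetical strategy (metrics already stripped out, as in extra_strategy above).
    strategy = {
        "top_k": 50,
        "retrieval_modules": [
            {"module_type": "bm25"},
            {"module_type": "vectordb", "embedding_model": ["openai", "huggingface"]},
        ],
    }

    def expand(module_conf, shared_params):
        # assumption: make_combinations expands every list-valued value into a cartesian product
        conf = {**module_conf, **shared_params}
        keys = list(conf.keys())
        values = [v if isinstance(v, list) else [v] for v in conf.values()]
        return [dict(zip(keys, combo)) for combo in product(*values)]

    shared = {k: v for k, v in strategy.items() if k != "retrieval_modules"}
    for module_dict in strategy["retrieval_modules"]:
        module_dict = dict(module_dict)
        module_type = module_dict.pop("module_type")
        print(module_type, expand(module_dict, shared))
    # bm25 [{'top_k': 50}]
    # vectordb [{'embedding_model': 'openai', 'top_k': 50}, {'embedding_model': 'huggingface', 'top_k': 50}]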
+ if "queries" not in previous_result.columns: + previous_result["queries"] = previous_result["query"] + previous_result.loc[:, "queries"] = previous_result["queries"].apply( + cast_queries + ) + queries = previous_result["queries"].tolist() + return queries + + +class HybridRetrieval(BaseRetrieval, metaclass=abc.ABCMeta): + def __init__( + self, project_dir: str, target_modules, target_module_params, *args, **kwargs + ): + super().__init__(project_dir) + self.target_modules = list( + map( + lambda x, y: get_support_modules(x)( + **y, + project_dir=project_dir, + ), + target_modules, + target_module_params, + ) + ) + self.target_module_params = target_module_params + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + result_dfs: List[pd.DataFrame] = list( + map( + lambda x, y: x.pure( + **y, + previous_result=previous_result, + ), + self.target_modules, + self.target_module_params, + ) + ) + ids = tuple( + map(lambda df: df["retrieved_ids"].apply(list).tolist(), result_dfs) + ) + scores = tuple( + map( + lambda df: df["retrieve_scores"].apply(list).tolist(), + result_dfs, + ) + ) + + _pure_params = pop_params(self._pure, kwargs) + if "ids" in _pure_params or "scores" in _pure_params: + raise ValueError( + "With specifying ids or scores, you must use HybridRRF.run_evaluator instead." + ) + ids, scores = self._pure(ids=ids, scores=scores, **_pure_params) + contents = fetch_contents(self.corpus_df, ids) + return contents, ids, scores + + +def cast_queries(queries: Union[str, List[str]]) -> List[str]: + if isinstance(queries, str): + return [queries] + elif isinstance(queries, List): + return queries + else: + raise ValueError(f"queries must be str or list, but got {type(queries)}") + + +def evenly_distribute_passages( + ids: List[List[str]], scores: List[List[float]], top_k: int +) -> Tuple[List[str], List[float]]: + assert len(ids) == len(scores), "ids and scores must have same length." 
+ query_cnt = len(ids) + avg_len = top_k // query_cnt + remainder = top_k % query_cnt + + new_ids = [] + new_scores = [] + for i in range(query_cnt): + if i < remainder: + new_ids.extend(ids[i][: avg_len + 1]) + new_scores.extend(scores[i][: avg_len + 1]) + else: + new_ids.extend(ids[i][:avg_len]) + new_scores.extend(scores[i][:avg_len]) + + return new_ids, new_scores + + +def get_bm25_pkl_name(bm25_tokenizer: str): + bm25_tokenizer = bm25_tokenizer.replace("/", "") + return f"bm25_{bm25_tokenizer}.pkl" diff --git a/autorag-workspace/autorag/nodes/retrieval/bm25.py b/autorag-workspace/autorag/nodes/retrieval/bm25.py new file mode 100644 index 0000000..80ac44d --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/bm25.py @@ -0,0 +1,365 @@ +import asyncio +import os +import pickle +import re +from typing import List, Dict, Tuple, Callable, Union, Iterable, Optional + +import numpy as np +import pandas as pd +from llama_index.core.indices.keyword_table.utils import simple_extract_keywords +from nltk import PorterStemmer +from rank_bm25 import BM25Okapi +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from autorag.nodes.retrieval.base import ( + evenly_distribute_passages, + BaseRetrieval, + get_bm25_pkl_name, +) +from autorag.utils import validate_corpus_dataset, fetch_contents +from autorag.utils.util import ( + get_event_loop, + normalize_string, + result_to_dataframe, + pop_params, +) + + +def tokenize_ko_kiwi(texts: List[str]) -> List[List[str]]: + try: + from kiwipiepy import Kiwi, Token + except ImportError: + raise ImportError( + "You need to install kiwipiepy to use 'ko_kiwi' tokenizer. " + "Please install kiwipiepy by running 'pip install kiwipiepy'. " + "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'." + ) + texts = list(map(lambda x: x.strip().lower(), texts)) + kiwi = Kiwi() + tokenized_list: Iterable[List[Token]] = kiwi.tokenize(texts) + return [list(map(lambda x: x.form, token_list)) for token_list in tokenized_list] + + +def tokenize_ko_kkma(texts: List[str]) -> List[List[str]]: + try: + from konlpy.tag import Kkma + except ImportError: + raise ImportError( + "You need to install konlpy to use 'ko_kkma' tokenizer. " + "Please install konlpy by running 'pip install konlpy'. " + "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'." + ) + tokenizer = Kkma() + tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts)) + return tokenized_list + + +def tokenize_ko_okt(texts: List[str]) -> List[List[str]]: + try: + from konlpy.tag import Okt + except ImportError: + raise ImportError( + "You need to install konlpy to use 'ko_kkma' tokenizer. " + "Please install konlpy by running 'pip install konlpy'. " + "Or install Korean version of AutoRAG by running 'pip install AutoRAG[ko]'." 
+ ) + tokenizer = Okt() + tokenized_list: List[List[str]] = list(map(lambda x: tokenizer.morphs(x), texts)) + return tokenized_list + + +def tokenize_porter_stemmer(texts: List[str]) -> List[List[str]]: + def tokenize_remove_stopword(text: str, stemmer) -> List[str]: + text = text.lower() + words = list(simple_extract_keywords(text)) + return [stemmer.stem(word) for word in words] + + stemmer = PorterStemmer() + tokenized_list: List[List[str]] = list( + map(lambda x: tokenize_remove_stopword(x, stemmer), texts) + ) + return tokenized_list + + +def tokenize_space(texts: List[str]) -> List[List[str]]: + def tokenize_space_text(text: str) -> List[str]: + text = normalize_string(text) + return re.split(r"\s+", text.strip()) + + return list(map(tokenize_space_text, texts)) + + +def load_bm25_corpus(bm25_path: str) -> Dict: + if bm25_path is None: + return {} + with open(bm25_path, "rb") as f: + bm25_corpus = pickle.load(f) + return bm25_corpus + + +def tokenize_ja_sudachipy(texts: List[str]) -> List[List[str]]: + try: + from sudachipy import dictionary, tokenizer + except ImportError: + raise ImportError( + "You need to install SudachiPy to use 'sudachipy' tokenizer. " + "Please install SudachiPy by running 'pip install sudachipy'." + ) + + # Initialize SudachiPy with the default tokenizer + tokenizer_obj = dictionary.Dictionary(dict="core").create() + + # Choose the tokenizer mode: NORMAL, SEARCH, A + mode = tokenizer.Tokenizer.SplitMode.A + + # Tokenize the input texts + tokenized_list = [] + for text in texts: + tokens = tokenizer_obj.tokenize(text, mode) + tokenized_list.append([token.surface() for token in tokens]) + + return tokenized_list + + +BM25_TOKENIZER = { + "porter_stemmer": tokenize_porter_stemmer, + "ko_kiwi": tokenize_ko_kiwi, + "space": tokenize_space, + "ko_kkma": tokenize_ko_kkma, + "ko_okt": tokenize_ko_okt, + "sudachipy": tokenize_ja_sudachipy, +} + + +class BM25(BaseRetrieval): + def __init__(self, project_dir: str, *args, **kwargs): + """ + Initialize BM25 module. + (Retrieval) + + :param project_dir: The project directory path. + :param bm25_tokenizer: The tokenizer name that is used to the BM25. + It supports 'porter_stemmer', 'ko_kiwi', and huggingface `AutoTokenizer`. + You can pass huggingface tokenizer name. + Default is porter_stemmer. + :param kwargs: The optional arguments. + """ + + super().__init__(project_dir) + # check if bm25_path and file exist + bm25_tokenizer = kwargs.get("bm25_tokenizer", None) + if bm25_tokenizer is None: + bm25_tokenizer = "porter_stemmer" + bm25_path = os.path.join(self.resources_dir, get_bm25_pkl_name(bm25_tokenizer)) + + assert ( + bm25_path is not None + ), "bm25_path must be specified for using bm25 retrieval." + assert os.path.exists( + bm25_path + ), f"bm25_path {bm25_path} does not exist. Please ingest first." + + self.bm25_corpus = load_bm25_corpus(bm25_path) + assert ( + "tokens" and "passage_id" in list(self.bm25_corpus.keys()) + ), "bm25_corpus must contain tokens and passage_id. Please check you ingested bm25 corpus correctly." + self.tokenizer = select_bm25_tokenizer(bm25_tokenizer) + assert self.bm25_corpus["tokenizer_name"] == bm25_tokenizer, ( + f"The bm25 corpus tokenizer is {self.bm25_corpus['tokenizer_name']}, but your input is {bm25_tokenizer}. " + f"You need to ingest again. Delete bm25 pkl file and re-ingest it." 
+ ) + self.bm25_instance = BM25Okapi(self.bm25_corpus["tokens"]) + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries = self.cast_to_run(previous_result) + pure_params = pop_params(self._pure, kwargs) + ids, scores = self._pure(queries, *args, **pure_params) + contents = fetch_contents(self.corpus_df, ids) + return contents, ids, scores + + def _pure( + self, + queries: List[List[str]], + top_k: int, + ids: Optional[List[List[str]]] = None, + ) -> Tuple[List[List[str]], List[List[float]]]: + """ + BM25 retrieval function. + You have to load a pickle file that is already ingested. + + :param queries: 2-d list of query strings. + Each element of the list is a query strings of each row. + :param top_k: The number of passages to be retrieved. + :param ids: The optional list of ids that you want to retrieve. + You don't need to specify this in the general use cases. + Default is None. + :return: The 2-d list contains a list of passage ids that retrieved from bm25 and 2-d list of its scores. + It will be a length of queries. And each element has a length of top_k. + """ + if ids is not None: + score_result = list( + map( + lambda query_list, id_list: get_bm25_scores( + query_list, + id_list, + self.tokenizer, + self.bm25_instance, + self.bm25_corpus, + ), + queries, + ids, + ) + ) + return ids, score_result + + # run async bm25_pure function + tasks = [ + bm25_pure( + input_queries, + top_k, + self.tokenizer, + self.bm25_instance, + self.bm25_corpus, + ) + for input_queries in queries + ] + loop = get_event_loop() + results = loop.run_until_complete(asyncio.gather(*tasks)) + id_result = list(map(lambda x: x[0], results)) + score_result = list(map(lambda x: x[1], results)) + return id_result, score_result + + +async def bm25_pure( + queries: List[str], top_k: int, tokenizer, bm25_api: BM25Okapi, bm25_corpus: Dict +) -> Tuple[List[str], List[float]]: + """ + Async BM25 retrieval function. + Its usage is for async retrieval of bm25 row by row. + + :param queries: A list of query strings. + :param top_k: The number of passages to be retrieved. + :param tokenizer: A tokenizer that will be used to tokenize queries. + :param bm25_api: A bm25 api instance that will be used to retrieve passages. + :param bm25_corpus: A dictionary containing the bm25 corpus, which is doc_id from corpus and tokenized corpus. + Its data structure looks like this: + + .. Code:: python + + { + "tokens": [], # 2d list of tokens + "passage_id": [], # 2d list of passage_id. Type must be str. + } + :return: The tuple contains a list of passage ids that retrieved from bm25 and its scores. + """ + # I don't make queries operation to async, because queries length might be small, so it will occur overhead. 
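The scoring inside `bm25_pure` boils down to `BM25Okapi.get_scores` plus a top-k argsort; a minimal sketch with a hypothetical pre-tokenized corpus:

.. Code:: python

    import numpy as np
    from rank_bm25 import BM25Okapi

    # Hypothetical pre-tokenized corpus, as stored in the bm25 pickle ("tokens"/"passage_id").
    tokens = [["hello", "world"], ["machine", "learning", "world"], ["deep", "learning"]]
    passage_id = ["doc-0", "doc-1", "doc-2"]

    bm25 = BM25Okapi(tokens)
    scores = bm25.get_scores(["learning", "world"])  # one score per passage
    top_k = 2
    top_idx = np.argsort(scores)[::-1][:top_k]       # highest scores first
    print([passage_id[i] for i in top_idx], [scores[i] for i in top_idx])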
+ tokenized_queries = tokenize(queries, tokenizer) + id_result = [] + score_result = [] + for query in tokenized_queries: + scores = bm25_api.get_scores(query) + sorted_scores = sorted(scores, reverse=True) + top_n_index = np.argsort(scores)[::-1][:top_k] + ids = [bm25_corpus["passage_id"][i] for i in top_n_index] + id_result.append(ids) + score_result.append(sorted_scores[:top_k]) + + # make a total result to top_k + id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k) + # sort id_result and score_result by score + result = [ + (_id, score) + for score, _id in sorted( + zip(score_result, id_result), key=lambda pair: pair[0], reverse=True + ) + ] + id_result, score_result = zip(*result) + return list(id_result), list(score_result) + + +def get_bm25_scores( + queries: List[str], + ids: List[str], + tokenizer, + bm25_api: BM25Okapi, + bm25_corpus: Dict, +) -> List[float]: + if len(ids) == 0 or not bool(ids): + return [] + tokenized_queries = tokenize(queries, tokenizer) + result_dict = {id_: [] for id_ in ids} + for query in tokenized_queries: + scores = bm25_api.get_scores(query) + for i, id_ in enumerate(ids): + result_dict[id_].append(scores[bm25_corpus["passage_id"].index(id_)]) + result_df = pd.DataFrame(result_dict) + return result_df.max(axis=0).tolist() + + +def tokenize(queries: List[str], tokenizer) -> List[List[int]]: + if isinstance(tokenizer, PreTrainedTokenizerBase): + tokenized_queries = tokenizer(queries).input_ids + else: + tokenized_queries = tokenizer(queries) + return tokenized_queries + + +def bm25_ingest( + corpus_path: str, corpus_data: pd.DataFrame, bm25_tokenizer: str = "porter_stemmer" +): + if not corpus_path.endswith(".pkl"): + raise ValueError(f"Corpus path {corpus_path} is not a pickle file.") + validate_corpus_dataset(corpus_data) + ids = corpus_data["doc_id"].tolist() + + # Initialize bm25_corpus + bm25_corpus = pd.DataFrame() + + # Load the BM25 corpus if it exists and get the passage ids + if os.path.exists(corpus_path) and os.path.getsize(corpus_path) > 0: + with open(corpus_path, "rb") as r: + corpus = pickle.load(r) + bm25_corpus = pd.DataFrame.from_dict(corpus) + duplicated_passage_rows = bm25_corpus[bm25_corpus["passage_id"].isin(ids)] + new_passage = corpus_data[ + ~corpus_data["doc_id"].isin(duplicated_passage_rows["passage_id"]) + ] + else: + new_passage = corpus_data + + if not new_passage.empty: + tokenizer = select_bm25_tokenizer(bm25_tokenizer) + if isinstance(tokenizer, PreTrainedTokenizerBase): + tokenized_corpus = tokenizer(new_passage["contents"].tolist()).input_ids + else: + tokenized_corpus = tokenizer(new_passage["contents"].tolist()) + new_bm25_corpus = pd.DataFrame( + { + "tokens": tokenized_corpus, + "passage_id": new_passage["doc_id"].tolist(), + } + ) + + if not bm25_corpus.empty: + bm25_corpus_updated = pd.concat( + [bm25_corpus, new_bm25_corpus], ignore_index=True + ) + bm25_dict = bm25_corpus_updated.to_dict("list") + else: + bm25_dict = new_bm25_corpus.to_dict("list") + + # add tokenizer name to bm25_dict + bm25_dict["tokenizer_name"] = bm25_tokenizer + + with open(corpus_path, "wb") as w: + pickle.dump(bm25_dict, w) + + +def select_bm25_tokenizer( + bm25_tokenizer: str, +) -> Callable[[str], List[Union[int, str]]]: + if bm25_tokenizer in list(BM25_TOKENIZER.keys()): + return BM25_TOKENIZER[bm25_tokenizer] + + return AutoTokenizer.from_pretrained(bm25_tokenizer, use_fast=False) diff --git a/autorag-workspace/autorag/nodes/retrieval/hybrid_cc.py 
b/autorag-workspace/autorag/nodes/retrieval/hybrid_cc.py new file mode 100644 index 0000000..53d6bfa --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/hybrid_cc.py @@ -0,0 +1,214 @@ +import os +from pathlib import Path +from typing import Tuple, List, Union + +import numpy as np +import pandas as pd + +from autorag.nodes.retrieval.base import HybridRetrieval +from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe + + +def normalize_mm(scores: List[str], fixed_min_value: float = 0): + arr = np.array(scores) + max_value = np.max(arr) + min_value = np.min(arr) + norm_score = (arr - min_value) / (max_value - min_value) + return norm_score + + +def normalize_tmm(scores: List[str], fixed_min_value: float): + arr = np.array(scores) + max_value = np.max(arr) + norm_score = (arr - fixed_min_value) / (max_value - fixed_min_value) + return norm_score + + +def normalize_z(scores: List[str], fixed_min_value: float = 0): + arr = np.array(scores) + mean_value = np.mean(arr) + std_value = np.std(arr) + norm_score = (arr - mean_value) / std_value + return norm_score + + +def normalize_dbsf(scores: List[str], fixed_min_value: float = 0): + arr = np.array(scores) + mean_value = np.mean(arr) + std_value = np.std(arr) + min_value = mean_value - 3 * std_value + max_value = mean_value + 3 * std_value + norm_score = (arr - min_value) / (max_value - min_value) + return norm_score + + +normalize_method_dict = { + "mm": normalize_mm, + "tmm": normalize_tmm, + "z": normalize_z, + "dbsf": normalize_dbsf, +} + + +class HybridCC(HybridRetrieval): + def _pure( + self, + ids: Tuple, + scores: Tuple, + top_k: int, + weight: float, + normalize_method: str = "mm", + semantic_theoretical_min_value: float = -1.0, + lexical_theoretical_min_value: float = 0.0, + ): + return hybrid_cc( + ids, + scores, + top_k, + weight, + normalize_method, + semantic_theoretical_min_value, + lexical_theoretical_min_value, + ) + + @classmethod + def run_evaluator( + cls, + project_dir: Union[str, Path], + previous_result: pd.DataFrame, + *args, + **kwargs, + ): + if "ids" in kwargs and "scores" in kwargs: + data_dir = os.path.join(project_dir, "data") + corpus_df = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + + params = pop_params(hybrid_cc, kwargs) + assert ( + "ids" in params and "scores" in params and "top_k" in params + ), "ids, scores, and top_k must be specified." + + @result_to_dataframe( + ["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + def __cc(**cc_params): + ids, scores = hybrid_cc(**cc_params) + contents = fetch_contents(corpus_df, ids) + return contents, ids, scores + + return __cc(**params) + else: + assert ( + "target_modules" in kwargs and "target_module_params" in kwargs + ), "target_modules and target_module_params must be specified if there is not ids and scores." + instance = cls(project_dir, *args, **kwargs) + result = instance.pure(previous_result, *args, **kwargs) + del instance + return result + + +def hybrid_cc( + ids: Tuple, + scores: Tuple, + top_k: int, + weight: float, + normalize_method: str = "mm", + semantic_theoretical_min_value: float = -1.0, + lexical_theoretical_min_value: float = 0.0, +) -> Tuple[List[List[str]], List[List[float]]]: + """ + Hybrid CC function. + CC (convex combination) is a method to fuse lexical and semantic retrieval results. + It is a method that first normalizes the scores of each retrieval result, + and then combines them with the given weights. 
+ It is more unique than other retrieval modules because it does not actually execute retrieval,
+ but just fuses the results of other retrieval functions.
+ So you have to run more than two retrieval modules before running this function.
+ Then collect the ids and scores from each retrieval module.
+ Make them a tuple and input it to this function.
+
+ :param ids: The tuple of ids that you want to fuse.
+ The length of this must be the same as the length of scores.
+ The semantic retrieval ids must be the first index.
+ :param scores: The retrieve scores that you want to fuse.
+ The length of this must be the same as the length of ids.
+ The semantic retrieval scores must be the first index.
+ :param top_k: The number of passages to be retrieved.
+ :param normalize_method: The normalization method to use.
+ There are several normalization methods that you can use with the hybrid cc method.
+ AutoRAG supports the following.
+ - `mm`: Min-max scaling
+ - `tmm`: Theoretical min-max scaling
+ - `z`: z-score normalization
+ - `dbsf`: 3-sigma normalization
+ :param weight: The weight value. If the weight is 1.0, the weight for the
+ semantic module will be 1.0 and the weight for the lexical module will be 0.0.
+ :param semantic_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
+ theoretical minimum value by yourself. Default is -1.
+ :param lexical_theoretical_min_value: This value is used by the `tmm` normalization method. You can set the
+ theoretical minimum value by yourself. Default is 0.
+ :return: The tuple of ids and fused scores that are fused by CC.
+ """
+ assert len(ids) == len(scores), "The length of ids and scores must be the same."
+ assert len(ids) > 1, "You must input more than one retrieval result."
+ assert top_k > 0, "top_k must be greater than 0."
+ assert weight >= 0, "The weight must be at least 0."
+ assert weight <= 1, "The weight must be at most 1."
+ + df = pd.DataFrame( + { + "semantic_ids": ids[0], + "lexical_ids": ids[1], + "semantic_score": scores[0], + "lexical_score": scores[1], + } + ) + + def cc_pure_apply(row): + return fuse_per_query( + row["semantic_ids"], + row["lexical_ids"], + row["semantic_score"], + row["lexical_score"], + normalize_method=normalize_method, + weight=weight, + top_k=top_k, + semantic_theoretical_min_value=semantic_theoretical_min_value, + lexical_theoretical_min_value=lexical_theoretical_min_value, + ) + + # fixed weight + df[["cc_id", "cc_score"]] = df.apply( + lambda row: cc_pure_apply(row), axis=1, result_type="expand" + ) + return df["cc_id"].tolist(), df["cc_score"].tolist() + + +def fuse_per_query( + semantic_ids: List[str], + lexical_ids: List[str], + semantic_scores: List[float], + lexical_scores: List[float], + normalize_method: str, + weight: float, + top_k: int, + semantic_theoretical_min_value: float, + lexical_theoretical_min_value: float, +): + normalize_func = normalize_method_dict[normalize_method] + norm_semantic_scores = normalize_func( + semantic_scores, semantic_theoretical_min_value + ) + norm_lexical_scores = normalize_func(lexical_scores, lexical_theoretical_min_value) + ids = [semantic_ids, lexical_ids] + scores = [norm_semantic_scores, norm_lexical_scores] + df = pd.concat( + [pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1 + ) + df.columns = ["semantic", "lexical"] + df = df.fillna(0) + df["weighted_sum"] = df.mul((weight, 1.0 - weight)).sum(axis=1) + df = df.sort_values(by="weighted_sum", ascending=False) + return df.index.tolist()[:top_k], df["weighted_sum"][:top_k].tolist() diff --git a/autorag-workspace/autorag/nodes/retrieval/hybrid_rrf.py b/autorag-workspace/autorag/nodes/retrieval/hybrid_rrf.py new file mode 100644 index 0000000..d5aae10 --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/hybrid_rrf.py @@ -0,0 +1,128 @@ +import os +from pathlib import Path +from typing import List, Tuple, Union + +import pandas as pd + +from autorag.nodes.retrieval.base import HybridRetrieval +from autorag.utils.util import pop_params, fetch_contents, result_to_dataframe + + +class HybridRRF(HybridRetrieval): + def _pure(self, ids, scores, top_k: int, weight: int = 60, rrf_k: int = -1): + return hybrid_rrf(ids, scores, top_k, weight, rrf_k) + + @classmethod + def run_evaluator( + cls, + project_dir: Union[str, Path], + previous_result: pd.DataFrame, + *args, + **kwargs, + ): + if "ids" in kwargs and "scores" in kwargs: + data_dir = os.path.join(project_dir, "data") + corpus_df = pd.read_parquet( + os.path.join(data_dir, "corpus.parquet"), engine="pyarrow" + ) + + params = pop_params(hybrid_rrf, kwargs) + assert ( + "ids" in params and "scores" in params and "top_k" in params + ), "ids, scores, and top_k must be specified." + + @result_to_dataframe( + ["retrieved_contents", "retrieved_ids", "retrieve_scores"] + ) + def __rrf(**rrf_params): + ids, scores = hybrid_rrf(**rrf_params) + contents = fetch_contents(corpus_df, ids) + return contents, ids, scores + + return __rrf(**params) + else: + assert ( + "target_modules" in kwargs and "target_module_params" in kwargs + ), "target_modules and target_module_params must be specified if there is not ids and scores." 
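A per-query sketch of the convex combination performed by `fuse_per_query` above, with hypothetical ids and scores, min-max (`mm`) normalization, and `weight=0.7`:

.. Code:: python

    import numpy as np
    import pandas as pd

    semantic_ids, semantic_scores = ["a", "b", "c"], [0.9, 0.5, 0.1]
    lexical_ids, lexical_scores = ["b", "d", "a"], [12.0, 7.0, 3.0]
    weight = 0.7  # weight toward the semantic scores

    def mm(scores):
        arr = np.array(scores)
        return (arr - arr.min()) / (arr.max() - arr.min())

    df = pd.concat(
        [
            pd.Series(dict(zip(semantic_ids, mm(semantic_scores)))),
            pd.Series(dict(zip(lexical_ids, mm(lexical_scores)))),
        ],
        axis=1,
    ).fillna(0)
    df.columns = ["semantic", "lexical"]
    df["weighted_sum"] = df["semantic"] * weight + df["lexical"] * (1 - weight)
    print(df.sort_values("weighted_sum", ascending=False))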
+ instance = cls(project_dir, *args, **kwargs)
+ result = instance.pure(previous_result, *args, **kwargs)
+ del instance
+ return result
+
+
+ def hybrid_rrf(
+ ids: Tuple,
+ scores: Tuple,
+ top_k: int,
+ weight: int = 60,
+ rrf_k: int = -1,
+ ) -> Tuple[List[List[str]], List[List[float]]]:
+ """
+ Hybrid RRF function.
+ RRF (Reciprocal Rank Fusion) is a method to fuse multiple retrieval results.
+ It is common to fuse dense retrieval and sparse retrieval results using RRF.
+ To use this function, you must input ids and scores as tuples.
+ It is more unique than other retrieval modules because it does not actually execute retrieval but just fuses
+ the results of other retrieval functions.
+ So you have to run more than two retrieval modules before running this function.
+ Then collect the ids and scores from each retrieval module.
+ Make them a tuple and input it to this function.
+
+ :param ids: The tuple of ids that you want to fuse.
+ The length of this must be the same as the length of scores.
+ :param scores: The retrieve scores that you want to fuse.
+ The length of this must be the same as the length of ids.
+ :param top_k: The number of passages to be retrieved.
+ :param weight: Hyperparameter for RRF.
+ It was originally the rrf_k value.
+ Default is 60.
+ For more information, please visit our documentation.
+ :param rrf_k: (Deprecated) Hyperparameter for RRF.
+ It was originally the rrf_k value. It will be removed in a future version.
+ :return: The tuple of ids and fused scores that are fused by RRF.
+ """
+ assert len(ids) == len(scores), "The length of ids and scores must be the same."
+ assert len(ids) > 1, "You must input more than one retrieval result."
+ assert top_k > 0, "top_k must be greater than 0."
+ assert weight > 0, "weight (rrf_k) must be greater than 0."
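A conceptual sketch of the reciprocal rank fusion described in the docstring above, using hypothetical ranked lists and the default `weight` (rrf_k) of 60. The real implementation below derives ranks from scores via `df.rank`, but it applies the same `1 / (rank + k)` formula:

.. Code:: python

    rrf_k = 60  # the `weight` parameter above

    semantic_ranking = ["a", "b", "c"]  # best first
    lexical_ranking = ["b", "d", "a"]

    def rrf_scores(*rankings, k=rrf_k):
        scores = {}
        for ranking in rankings:
            for rank, doc_id in enumerate(ranking, start=1):
                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (rank + k)
        return sorted(scores.items(), key=lambda item: item[1], reverse=True)

    print(rrf_scores(semantic_ranking, lexical_ranking))
    # "b" wins: it sits near the top of both lists.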
+ + if rrf_k != -1: + weight = int(rrf_k) + else: + weight = int(weight) + + id_df = pd.DataFrame({f"id_{i}": id_list for i, id_list in enumerate(ids)}) + score_df = pd.DataFrame( + {f"score_{i}": score_list for i, score_list in enumerate(scores)} + ) + df = pd.concat([id_df, score_df], axis=1) + + def rrf_pure_apply(row): + ids_tuple = tuple(row[[f"id_{i}" for i in range(len(ids))]].values) + scores_tuple = tuple(row[[f"score_{i}" for i in range(len(scores))]].values) + return pd.Series(rrf_pure(ids_tuple, scores_tuple, weight, top_k)) + + df[["rrf_id", "rrf_score"]] = df.apply(rrf_pure_apply, axis=1) + return df["rrf_id"].tolist(), df["rrf_score"].tolist() + + +def rrf_pure( + ids: Tuple, scores: Tuple, rrf_k: int, top_k: int +) -> Tuple[List[str], List[float]]: + df = pd.concat( + [pd.Series(dict(zip(_id, score))) for _id, score in zip(ids, scores)], axis=1 + ) + rank_df = df.rank(ascending=False, method="min") + rank_df = rank_df.fillna(0) + rank_df["rrf"] = rank_df.apply(lambda row: rrf_calculate(row, rrf_k), axis=1) + rank_df = rank_df.sort_values(by="rrf", ascending=False) + return rank_df.index.tolist()[:top_k], rank_df["rrf"].tolist()[:top_k] + + +def rrf_calculate(row, rrf_k): + result = 0 + for r in row: + if r == 0: + continue + result += 1 / (r + rrf_k) + return result diff --git a/autorag-workspace/autorag/nodes/retrieval/run.py b/autorag-workspace/autorag/nodes/retrieval/run.py new file mode 100644 index 0000000..218471f --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/run.py @@ -0,0 +1,544 @@ +import logging +import os +import pathlib +from copy import deepcopy +from typing import List, Callable, Dict, Tuple, Union + +import numpy as np +import pandas as pd + +from autorag.evaluation import evaluate_retrieval +from autorag.schema.metricinput import MetricInput +from autorag.strategy import measure_speed, filter_by_threshold, select_best +from autorag.support import get_support_modules +from autorag.utils.util import get_best_row, to_list, apply_recursive + +logger = logging.getLogger("AutoRAG") + +semantic_module_names = ["vectordb", "VectorDB"] +lexical_module_names = ["bm25", "BM25"] +hybrid_module_names = ["hybrid_rrf", "hybrid_cc", "HybridCC", "HybridRRF"] + + +def run_retrieval_node( + modules: List, + module_params: List[Dict], + previous_result: pd.DataFrame, + node_line_dir: str, + strategies: Dict, +) -> pd.DataFrame: + """ + Run evaluation and select the best module among retrieval node results. + + :param modules: Retrieval modules to run. + :param module_params: Retrieval module parameters. + :param previous_result: Previous result dataframe. + Could be query expansion's best result or qa data. + :param node_line_dir: This node line's directory. + :param strategies: Strategies for retrieval node. + :return: The best result dataframe. + It contains previous result columns and retrieval node's result columns. 
+ """ + if not os.path.exists(node_line_dir): + os.makedirs(node_line_dir) + project_dir = pathlib.PurePath(node_line_dir).parent.parent + qa_df = pd.read_parquet( + os.path.join(project_dir, "data", "qa.parquet"), engine="pyarrow" + ) + retrieval_gt = qa_df["retrieval_gt"].tolist() + retrieval_gt = apply_recursive(lambda x: str(x), to_list(retrieval_gt)) + # make rows to metric_inputs + metric_inputs = [ + MetricInput(retrieval_gt=ret_gt, query=query, generation_gt=gen_gt) + for ret_gt, query, gen_gt in zip( + retrieval_gt, qa_df["query"].tolist(), qa_df["generation_gt"].tolist() + ) + ] + + save_dir = os.path.join(node_line_dir, "retrieval") # node name + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + def run(input_modules, input_module_params) -> Tuple[List[pd.DataFrame], List]: + """ + Run input modules and parameters. + + :param input_modules: Input modules + :param input_module_params: Input module parameters + :return: First, it returns list of result dataframe. + Second, it returns list of execution times. + """ + result, execution_times = zip( + *map( + lambda task: measure_speed( + task[0].run_evaluator, + project_dir=project_dir, + previous_result=previous_result, + **task[1], + ), + zip(input_modules, input_module_params), + ) + ) + average_times = list(map(lambda x: x / len(result[0]), execution_times)) + + # run metrics before filtering + if strategies.get("metrics") is None: + raise ValueError("You must at least one metrics for retrieval evaluation.") + result = list( + map( + lambda x: evaluate_retrieval_node( + x, + metric_inputs, + strategies.get("metrics"), + ), + result, + ) + ) + + return result, average_times + + def save_and_summary( + input_modules, + input_module_params, + result_list, + execution_time_list, + filename_start: int, + ): + """ + Save the result and make summary file + + :param input_modules: Input modules + :param input_module_params: Input module parameters + :param result_list: Result list + :param execution_time_list: Execution times + :param filename_start: The first filename to use + :return: First, it returns list of result dataframe. + Second, it returns list of execution times. 
+ """ + + # save results to folder + filepaths = list( + map( + lambda x: os.path.join(save_dir, f"{x}.parquet"), + range(filename_start, filename_start + len(input_modules)), + ) + ) + list( + map( + lambda x: x[0].to_parquet(x[1], index=False), + zip(result_list, filepaths), + ) + ) # execute save to parquet + filename_list = list(map(lambda x: os.path.basename(x), filepaths)) + + summary_df = pd.DataFrame( + { + "filename": filename_list, + "module_name": list(map(lambda module: module.__name__, input_modules)), + "module_params": input_module_params, + "execution_time": execution_time_list, + **{ + metric: list(map(lambda result: result[metric].mean(), result_list)) + for metric in strategies.get("metrics") + }, + } + ) + summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + return summary_df + + def find_best(results, average_times, filenames): + # filter by strategies + if strategies.get("speed_threshold") is not None: + results, filenames = filter_by_threshold( + results, average_times, strategies["speed_threshold"], filenames + ) + selected_result, selected_filename = select_best( + results, + strategies.get("metrics"), + filenames, + strategies.get("strategy", "mean"), + ) + return selected_result, selected_filename + + filename_first = 0 + # run semantic modules + logger.info("Running retrieval node - semantic retrieval module...") + if any([module.__name__ in semantic_module_names for module in modules]): + semantic_modules, semantic_module_params = zip( + *filter( + lambda x: x[0].__name__ in semantic_module_names, + zip(modules, module_params), + ) + ) + semantic_results, semantic_times = run(semantic_modules, semantic_module_params) + semantic_summary_df = save_and_summary( + semantic_modules, + semantic_module_params, + semantic_results, + semantic_times, + filename_first, + ) + semantic_selected_result, semantic_selected_filename = find_best( + semantic_results, semantic_times, semantic_summary_df["filename"].tolist() + ) + semantic_summary_df["is_best"] = ( + semantic_summary_df["filename"] == semantic_selected_filename + ) + filename_first += len(semantic_modules) + else: + ( + semantic_selected_filename, + semantic_summary_df, + semantic_results, + semantic_times, + ) = None, pd.DataFrame(), [], [] + # run lexical modules + logger.info("Running retrieval node - lexical retrieval module...") + if any([module.__name__ in lexical_module_names for module in modules]): + lexical_modules, lexical_module_params = zip( + *filter( + lambda x: x[0].__name__ in lexical_module_names, + zip(modules, module_params), + ) + ) + lexical_results, lexical_times = run(lexical_modules, lexical_module_params) + lexical_summary_df = save_and_summary( + lexical_modules, + lexical_module_params, + lexical_results, + lexical_times, + filename_first, + ) + lexical_selected_result, lexical_selected_filename = find_best( + lexical_results, lexical_times, lexical_summary_df["filename"].tolist() + ) + lexical_summary_df["is_best"] = ( + lexical_summary_df["filename"] == lexical_selected_filename + ) + filename_first += len(lexical_modules) + else: + ( + lexical_selected_filename, + lexical_summary_df, + lexical_results, + lexical_times, + ) = None, pd.DataFrame(), [], [] + + logger.info("Running retrieval node - hybrid retrieval module...") + # Next, run hybrid retrieval + if any([module.__name__ in hybrid_module_names for module in modules]): + hybrid_modules, hybrid_module_params = zip( + *filter( + lambda x: x[0].__name__ in hybrid_module_names, + zip(modules, module_params), + ) 
+ ) + if all( + ["target_module_params" in x for x in hybrid_module_params] + ): # for Runner.run + # If target_module_params are already given, run hybrid retrieval directly + hybrid_results, hybrid_times = run(hybrid_modules, hybrid_module_params) + hybrid_summary_df = save_and_summary( + hybrid_modules, + hybrid_module_params, + hybrid_results, + hybrid_times, + filename_first, + ) + filename_first += len(hybrid_modules) + else: # for Evaluator + # get id and score + ids_scores = get_ids_and_scores( + save_dir, + [semantic_selected_filename, lexical_selected_filename], + semantic_summary_df, + lexical_summary_df, + previous_result, + ) + hybrid_module_params = list( + map(lambda x: {**x, **ids_scores}, hybrid_module_params) + ) + + # optimize each modules + real_hybrid_times = [ + get_hybrid_execution_times(semantic_summary_df, lexical_summary_df) + ] * len(hybrid_module_params) + hybrid_times = real_hybrid_times.copy() + hybrid_results = [] + for module, module_param in zip(hybrid_modules, hybrid_module_params): + module_result_df, module_best_weight = optimize_hybrid( + module, + module_param, + strategies, + metric_inputs, + project_dir, + previous_result, + ) + module_param["weight"] = module_best_weight + hybrid_results.append(module_result_df) + + hybrid_summary_df = save_and_summary( + hybrid_modules, + hybrid_module_params, + hybrid_results, + hybrid_times, + filename_first, + ) + filename_first += len(hybrid_modules) + hybrid_summary_df["execution_time"] = hybrid_times + best_semantic_summary_row = semantic_summary_df.loc[ + semantic_summary_df["is_best"] + ].iloc[0] + best_lexical_summary_row = lexical_summary_df.loc[ + lexical_summary_df["is_best"] + ].iloc[0] + target_modules = ( + best_semantic_summary_row["module_name"], + best_lexical_summary_row["module_name"], + ) + target_module_params = ( + best_semantic_summary_row["module_params"], + best_lexical_summary_row["module_params"], + ) + hybrid_summary_df = edit_summary_df_params( + hybrid_summary_df, target_modules, target_module_params + ) + else: + if any([module.__name__ in hybrid_module_names for module in modules]): + logger.warning( + "You must at least one semantic module and lexical module for hybrid evaluation." + "Passing hybrid module." + ) + _, hybrid_summary_df, hybrid_results, hybrid_times = ( + None, + pd.DataFrame(), + [], + [], + ) + + summary = pd.concat( + [semantic_summary_df, lexical_summary_df, hybrid_summary_df], ignore_index=True + ) + results = semantic_results + lexical_results + hybrid_results + average_times = semantic_times + lexical_times + hybrid_times + filenames = summary["filename"].tolist() + + # filter by strategies + selected_result, selected_filename = find_best(results, average_times, filenames) + best_result = pd.concat([previous_result, selected_result], axis=1) + + # add summary.csv 'is_best' column + summary["is_best"] = summary["filename"] == selected_filename + + # save the result files + best_result.to_parquet( + os.path.join( + save_dir, f"best_{os.path.splitext(selected_filename)[0]}.parquet" + ), + index=False, + ) + summary.to_csv(os.path.join(save_dir, "summary.csv"), index=False) + return best_result + + +def evaluate_retrieval_node( + result_df: pd.DataFrame, + metric_inputs: List[MetricInput], + metrics: Union[List[str], List[Dict]], +) -> pd.DataFrame: + """ + Evaluate retrieval node from retrieval node result dataframe. + + :param result_df: The result dataframe from a retrieval node. + :param metric_inputs: List of metric input schema for AutoRAG. 
+ :param metrics: Metric list from input strategies.
+ :return: The result_df with metric columns added.
+ The columns will be 'retrieved_contents', 'retrieved_ids', 'retrieve_scores', and the metric names.
+ """
+
+ @evaluate_retrieval(
+ metric_inputs=metric_inputs,
+ metrics=metrics,
+ )
+ def evaluate_this_module(df: pd.DataFrame):
+ return (
+ df["retrieved_contents"].tolist(),
+ df["retrieved_ids"].tolist(),
+ df["retrieve_scores"].tolist(),
+ )
+
+ return evaluate_this_module(result_df)
+
+
+def edit_summary_df_params(
+ summary_df: pd.DataFrame, target_modules, target_module_params
+) -> pd.DataFrame:
+ def delete_ids_scores(x):
+ del x["ids"]
+ del x["scores"]
+ return x
+
+ summary_df["module_params"] = summary_df["module_params"].apply(delete_ids_scores)
+ summary_df["new_params"] = [
+ {"target_modules": target_modules, "target_module_params": target_module_params}
+ ] * len(summary_df)
+ summary_df["module_params"] = summary_df.apply(
+ lambda row: {**row["module_params"], **row["new_params"]}, axis=1
+ )
+ summary_df = summary_df.drop(columns=["new_params"])
+ return summary_df
+
+
+def get_ids_and_scores(
+ node_dir: str,
+ filenames: List[str],
+ semantic_summary_df: pd.DataFrame,
+ lexical_summary_df: pd.DataFrame,
+ previous_result,
+) -> Dict[str, Tuple[List[List[str]], List[List[float]]]]:
+ project_dir = pathlib.PurePath(node_dir).parent.parent.parent
+ best_results_df = list(
+ map(
+ lambda filename: pd.read_parquet(
+ os.path.join(node_dir, filename), engine="pyarrow"
+ ),
+ filenames,
+ )
+ )
+ ids = tuple(
+ map(lambda df: df["retrieved_ids"].apply(list).tolist(), best_results_df)
+ )
+ scores = tuple(
+ map(lambda df: df["retrieve_scores"].apply(list).tolist(), best_results_df)
+ )
+ # find the ids that only one of the two retrievers returned
+ semantic_ids = deepcopy(ids[0])
+ lexical_ids = deepcopy(ids[1])
+
+ def get_non_duplicate_ids(target_ids, compare_ids) -> List[List[str]]:
+ """
+ For each row, return the ids that appear in compare_ids but not in target_ids.
+ For example, pass the semantic ids as target_ids to get the ids that the lexical retriever found but the semantic retriever missed.
+ """ + result_ids = [] + assert len(target_ids) == len(compare_ids) + for target_id_list, compare_id_list in zip(target_ids, compare_ids): + query_duplicated = list(set(compare_id_list) - set(target_id_list)) + duplicate_list = query_duplicated if len(query_duplicated) != 0 else [] + result_ids.append(duplicate_list) + return result_ids + + lexical_target_ids = get_non_duplicate_ids(lexical_ids, semantic_ids) + semantic_target_ids = get_non_duplicate_ids(semantic_ids, lexical_ids) + + new_id_tuple = ( + [a + b for a, b in zip(semantic_ids, semantic_target_ids)], + [a + b for a, b in zip(lexical_ids, lexical_target_ids)], + ) + + # search non-duplicate ids' scores + new_semantic_scores = get_scores_by_ids( + semantic_target_ids, semantic_summary_df, project_dir, previous_result + ) + new_lexical_scores = get_scores_by_ids( + lexical_target_ids, lexical_summary_df, project_dir, previous_result + ) + + new_score_tuple = ( + [a + b for a, b in zip(scores[0], new_semantic_scores)], + [a + b for a, b in zip(scores[1], new_lexical_scores)], + ) + return { + "ids": new_id_tuple, + "scores": new_score_tuple, + } + + +def get_scores_by_ids( + ids: List[List[str]], module_summary_df: pd.DataFrame, project_dir, previous_result +) -> List[List[float]]: + module_name = get_best_row(module_summary_df)["module_name"] + module_params = get_best_row(module_summary_df)["module_params"] + module = get_support_modules(module_name) + result_df = module.run_evaluator( + project_dir=project_dir, + previous_result=previous_result, + ids=ids, + **module_params, + ) + return to_list(result_df["retrieve_scores"].tolist()) + + +def find_unique_elems(list1: List[str], list2: List[str]) -> List[str]: + return list(set(list1).symmetric_difference(set(list2))) + + +def get_hybrid_execution_times(lexical_summary, semantic_summary) -> float: + lexical_execution_time = lexical_summary.loc[lexical_summary["is_best"]].iloc[0][ + "execution_time" + ] + semantic_execution_time = semantic_summary.loc[semantic_summary["is_best"]].iloc[0][ + "execution_time" + ] + return lexical_execution_time + semantic_execution_time + + +def optimize_hybrid( + hybrid_module_func: Callable, + hybrid_module_param: Dict, + strategy: Dict, + input_metrics: List[MetricInput], + project_dir, + previous_result, +): + if ( + hybrid_module_func.__name__ == "HybridRRF" + or hybrid_module_func.__name__ == "hybrid_rrf" + ): + weight_range = hybrid_module_param.pop("weight_range", (4, 80)) + test_weight_size = weight_range[1] - weight_range[0] + 1 + elif ( + hybrid_module_func.__name__ == "HybridCC" + or hybrid_module_func.__name__ == "hybrid_cc" + ): + weight_range = hybrid_module_param.pop("weight_range", (0.0, 1.0)) + test_weight_size = hybrid_module_param.pop("test_weight_size", 101) + else: + raise ValueError("You must input hybrid module function at hybrid_module_func.") + + weight_candidates = np.linspace( + weight_range[0], weight_range[1], test_weight_size + ).tolist() + + result_list = [] + for weight_value in weight_candidates: + result_df = hybrid_module_func.run_evaluator( + project_dir=project_dir, + previous_result=previous_result, + weight=weight_value, + **hybrid_module_param, + ) + result_list.append(result_df) + + # evaluate here + if strategy.get("metrics") is None: + raise ValueError("You must at least one metrics for retrieval evaluation.") + result_list = list( + map( + lambda x: evaluate_retrieval_node( + x, + input_metrics, + strategy.get("metrics"), + ), + result_list, + ) + ) + + # select best result + best_result_df, best_weight 
= select_best( + result_list, + strategy.get("metrics"), + metadatas=weight_candidates, + strategy_name=strategy.get("strategy", "normalize_mean"), + ) + return best_result_df, best_weight diff --git a/autorag-workspace/autorag/nodes/retrieval/vectordb.py b/autorag-workspace/autorag/nodes/retrieval/vectordb.py new file mode 100644 index 0000000..ba0cc2c --- /dev/null +++ b/autorag-workspace/autorag/nodes/retrieval/vectordb.py @@ -0,0 +1,303 @@ +import itertools +import logging +import os +from typing import List, Tuple, Optional + +import numpy as np +import pandas as pd +from llama_index.core.embeddings import BaseEmbedding +from llama_index.embeddings.openai import OpenAIEmbedding + +from autorag.evaluation.metric.util import ( + calculate_l2_distance, + calculate_inner_product, + calculate_cosine_similarity, +) +from autorag.nodes.retrieval.base import evenly_distribute_passages, BaseRetrieval +from autorag.utils import ( + validate_corpus_dataset, + cast_corpus_dataset, + cast_qa_dataset, + validate_qa_dataset, +) +from autorag.utils.util import ( + get_event_loop, + process_batch, + openai_truncate_by_token, + flatten_apply, + result_to_dataframe, + pop_params, + fetch_contents, + empty_cuda_cache, + convert_inputs_to_list, + make_batch, +) +from autorag.vectordb import load_vectordb_from_yaml +from autorag.vectordb.base import BaseVectorStore + +logger = logging.getLogger("AutoRAG") + + +class VectorDB(BaseRetrieval): + def __init__(self, project_dir: str, vectordb: str = "default", **kwargs): + """ + Initialize VectorDB retrieval node. + + :param project_dir: The project directory path. + :param vectordb: The vectordb name. + You must configure the vectordb name in the config.yaml file. + If you don't configure, it uses the default vectordb. + :param kwargs: The optional arguments. + Not affected in the init method. + """ + super().__init__(project_dir) + + vectordb_config_path = os.path.join(self.resources_dir, "vectordb.yaml") + self.vector_store = load_vectordb_from_yaml( + vectordb_config_path, vectordb, project_dir + ) + + self.embedding_model = self.vector_store.embedding + + def __del__(self): + del self.vector_store + del self.embedding_model + empty_cuda_cache() + super().__del__() + + @result_to_dataframe(["retrieved_contents", "retrieved_ids", "retrieve_scores"]) + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + queries = self.cast_to_run(previous_result) + pure_params = pop_params(self._pure, kwargs) + ids, scores = self._pure(queries, **pure_params) + contents = fetch_contents(self.corpus_df, ids) + return contents, ids, scores + + def _pure( + self, + queries: List[List[str]], + top_k: int, + embedding_batch: int = 128, + ids: Optional[List[List[str]]] = None, + ) -> Tuple[List[List[str]], List[List[float]]]: + """ + VectorDB retrieval function. + You have to get a chroma collection that is already ingested. + You have to get an embedding model that is already used in ingesting. + + :param queries: 2-d list of query strings. + Each element of the list is a query strings of each row. + :param top_k: The number of passages to be retrieved. + :param embedding_batch: The number of queries to be processed in parallel. + This is used to prevent API error at the query embedding. + Default is 128. + :param ids: The optional list of ids that you want to retrieve. + You don't need to specify this in the general use cases. + Default is None. + + :return: The 2-d list contains a list of passage ids that retrieved from vectordb and 2-d list of its scores. 
+ Both lists have the same length as queries, and each element has a length of top_k.
+ """
+ # if ids are specified, compute scores for the given ids directly from the vector store
+ if ids is not None:
+ return self.__get_ids_scores(queries, ids, embedding_batch)
+
+ # run async vector_db_pure function
+ tasks = [
+ vectordb_pure(query_list, top_k, self.vector_store)
+ for query_list in queries
+ ]
+ loop = get_event_loop()
+ results = loop.run_until_complete(
+ process_batch(tasks, batch_size=embedding_batch)
+ )
+ id_result = list(map(lambda x: x[0], results))
+ score_result = list(map(lambda x: x[1], results))
+ return id_result, score_result
+
+ def __get_ids_scores(self, queries, ids, embedding_batch: int):
+ # truncate the queries if needed, then embed them in batches
+ openai_embedding_limit = 8000
+ if isinstance(self.embedding_model, OpenAIEmbedding):
+ queries = list(
+ map(
+ lambda query_list: openai_truncate_by_token(
+ query_list,
+ openai_embedding_limit,
+ self.embedding_model.model_name,
+ ),
+ queries,
+ )
+ )
+
+ query_embeddings = flatten_apply(
+ run_query_embedding_batch,
+ queries,
+ embedding_model=self.embedding_model,
+ batch_size=embedding_batch,
+ )
+
+ loop = get_event_loop()
+
+ async def run_fetch(ids):
+ final_result = []
+ for id_list in ids:
+ if len(id_list) == 0:
+ final_result.append([])
+ else:
+ result = await self.vector_store.fetch(id_list)
+ final_result.append(result)
+ return final_result
+
+ content_embeddings = loop.run_until_complete(run_fetch(ids))
+
+ score_result = list(
+ map(
+ lambda query_embedding_list, content_embedding_list: get_id_scores(
+ query_embedding_list,
+ content_embedding_list,
+ similarity_metric=self.vector_store.similarity_metric,
+ ),
+ query_embeddings,
+ content_embeddings,
+ )
+ )
+ return ids, score_result
+
+
+async def vectordb_pure(
+ queries: List[str], top_k: int, vectordb: BaseVectorStore
+) -> Tuple[List[str], List[float]]:
+ """
+ Async VectorDB retrieval function.
+ It retrieves from the vector DB asynchronously, one row of queries at a time.
+
+ :param queries: A list of query strings for a single row.
+ :param top_k: The number of passages to be retrieved.
+ :param vectordb: The vector store instance.
+ :return: A tuple containing the list of retrieved passage ids and the list of their scores.
+ """ + id_result, score_result = await vectordb.query(queries=queries, top_k=top_k) + + # Distribute passages evenly + id_result, score_result = evenly_distribute_passages(id_result, score_result, top_k) + # sort id_result and score_result by score + result = [ + (_id, score) + for score, _id in sorted( + zip(score_result, id_result), key=lambda pair: pair[0], reverse=True + ) + ] + id_result, score_result = zip(*result) + return list(id_result), list(score_result) + + +async def filter_exist_ids( + vectordb: BaseVectorStore, + corpus_data: pd.DataFrame, +) -> pd.DataFrame: + corpus_data = cast_corpus_dataset(corpus_data) + validate_corpus_dataset(corpus_data) + ids = corpus_data["doc_id"].tolist() + + # Query the collection to check if IDs already exist + existed_bool_list = await vectordb.is_exist(ids=ids) + # Assuming 'ids' is the key in the response + new_passage = corpus_data[~pd.Series(existed_bool_list)] + return new_passage + + +async def filter_exist_ids_from_retrieval_gt( + vectordb: BaseVectorStore, + qa_data: pd.DataFrame, + corpus_data: pd.DataFrame, +) -> pd.DataFrame: + qa_data = cast_qa_dataset(qa_data) + validate_qa_dataset(qa_data) + corpus_data = cast_corpus_dataset(corpus_data) + validate_corpus_dataset(corpus_data) + retrieval_gt = ( + qa_data["retrieval_gt"] + .apply(lambda x: list(itertools.chain.from_iterable(x))) + .tolist() + ) + retrieval_gt = list(itertools.chain.from_iterable(retrieval_gt)) + retrieval_gt = list(set(retrieval_gt)) + + existed_bool_list = await vectordb.is_exist(ids=retrieval_gt) + add_ids = [] + for ret_gt, is_exist in zip(retrieval_gt, existed_bool_list): + if not is_exist: + add_ids.append(ret_gt) + new_passage = corpus_data[corpus_data["doc_id"].isin(add_ids)] + return new_passage + + +async def vectordb_ingest( + vectordb: BaseVectorStore, + corpus_data: pd.DataFrame, +): + """ + Ingest given corpus data to the vectordb. + It truncates corpus content when the embedding model is OpenAIEmbedding to the 8000 tokens. + Plus, when the corpus content is empty (whitespace), it will be ignored. + And if there is a document id that already exists in the collection, it will be ignored. + + :param vectordb: A vector stores instance that you want to ingest. + :param corpus_data: The corpus data that contains doc_id and contents columns. + """ + embedding_batch = vectordb.embedding_batch + if not corpus_data.empty: + new_contents = corpus_data["contents"].tolist() + new_ids = corpus_data["doc_id"].tolist() + content_batches = make_batch(new_contents, embedding_batch) + id_batches = make_batch(new_ids, embedding_batch) + for content_batch, id_batch in zip(content_batches, id_batches): + await vectordb.add(ids=id_batch, texts=content_batch) + + +def run_query_embedding_batch( + queries: List[str], embedding_model: BaseEmbedding, batch_size: int +) -> List[List[float]]: + result = [] + for i in range(0, len(queries), batch_size): + batch = queries[i : i + batch_size] + embeddings = embedding_model.get_text_embedding_batch(batch) + result.extend(embeddings) + return result + + +@convert_inputs_to_list +def get_id_scores( # To find the uncalculated score when fuse the scores for the hybrid retrieval + query_embeddings: List[ + List[float] + ], # `queries` is input. This is one user input query. + content_embeddings: List[List[float]], + similarity_metric: str, +) -> List[ + float +]: # The most high scores among each query. The length of a result is the same as the contents length. 
+ """ + Calculate the highest similarity scores between query embeddings and content embeddings. + + :param query_embeddings: A list of lists containing query embeddings. + :param content_embeddings: A list of lists containing content embeddings. + :param similarity_metric: The similarity metric to use ('l2', 'ip', or 'cosine'). + :return: A list of the highest similarity scores for each content embedding. + """ + metric_func_dict = { + "l2": lambda x, y: 1 - calculate_l2_distance(x, y), + "ip": calculate_inner_product, + "cosine": calculate_cosine_similarity, + } + metric_func = metric_func_dict[similarity_metric] + + result = [] + for content_embedding in content_embeddings: + scores = [] + for query_embedding in query_embeddings: + scores.append( + metric_func(np.array(query_embedding), np.array(content_embedding)) + ) + result.append(max(scores)) + return result diff --git a/autorag-workspace/autorag/nodes/util.py b/autorag-workspace/autorag/nodes/util.py new file mode 100644 index 0000000..015d351 --- /dev/null +++ b/autorag-workspace/autorag/nodes/util.py @@ -0,0 +1,16 @@ +from typing import Optional, Dict + +from autorag.support import get_support_modules + + +def make_generator_callable_param(generator_dict: Optional[Dict]): + if "generator_module_type" not in generator_dict.keys(): + generator_dict = { + "generator_module_type": "llama_index_llm", + "llm": "openai", + "model": "gpt-4o-mini", + } + module_str = generator_dict.pop("generator_module_type") + module_class = get_support_modules(module_str) + module_param = generator_dict + return module_class, module_param diff --git a/autorag-workspace/autorag/parser.py b/autorag-workspace/autorag/parser.py new file mode 100644 index 0000000..b67e5c5 --- /dev/null +++ b/autorag-workspace/autorag/parser.py @@ -0,0 +1,37 @@ +import logging +import os +import shutil +from typing import Optional + +from autorag.data.parse.run import run_parser +from autorag.data.utils.util import load_yaml, get_param_combinations + +logger = logging.getLogger("AutoRAG") + + +class Parser: + def __init__(self, data_path_glob: str, project_dir: Optional[str] = None): + self.data_path_glob = data_path_glob + self.project_dir = project_dir if project_dir is not None else os.getcwd() + + def start_parsing(self, yaml_path: str, all_files: bool = False): + if not os.path.exists(self.project_dir): + os.makedirs(self.project_dir) + + # copy yaml file to project directory + shutil.copy(yaml_path, os.path.join(self.project_dir, "parse_config.yaml")) + + # load yaml file + modules = load_yaml(yaml_path) + + input_modules, input_params = get_param_combinations(modules) + + logger.info("Parsing Start...") + run_parser( + modules=input_modules, + module_params=input_params, + data_path_glob=self.data_path_glob, + project_dir=self.project_dir, + all_files=all_files, + ) + logger.info("Parsing Done!") diff --git a/autorag-workspace/autorag/schema/__init__.py b/autorag-workspace/autorag/schema/__init__.py new file mode 100644 index 0000000..dd346d0 --- /dev/null +++ b/autorag-workspace/autorag/schema/__init__.py @@ -0,0 +1,3 @@ +from .module import Module +from .node import Node +from .base import BaseModule diff --git a/autorag-workspace/autorag/schema/base.py b/autorag-workspace/autorag/schema/base.py new file mode 100644 index 0000000..9c6864e --- /dev/null +++ b/autorag-workspace/autorag/schema/base.py @@ -0,0 +1,35 @@ +from abc import ABCMeta, abstractmethod +from pathlib import Path +from typing import Union + +import pandas as pd + + +class 
BaseModule(metaclass=ABCMeta): + @abstractmethod + def pure(self, previous_result: pd.DataFrame, *args, **kwargs): + pass + + @abstractmethod + def _pure(self, *args, **kwargs): + pass + + @classmethod + def run_evaluator( + cls, + project_dir: Union[str, Path], + previous_result: pd.DataFrame, + *args, + **kwargs, + ): + instance = cls(project_dir, *args, **kwargs) + result = instance.pure(previous_result, *args, **kwargs) + del instance + return result + + @abstractmethod + def cast_to_run(self, previous_result: pd.DataFrame, *args, **kwargs): + """ + This function is for cast function (a.k.a decorator) only for pure function in the whole node. + """ + pass diff --git a/autorag-workspace/autorag/schema/metricinput.py b/autorag-workspace/autorag/schema/metricinput.py new file mode 100644 index 0000000..e20c8d9 --- /dev/null +++ b/autorag-workspace/autorag/schema/metricinput.py @@ -0,0 +1,99 @@ +from dataclasses import dataclass +from typing import Optional, List, Dict, Callable, Any, Union + +import numpy as np +import pandas as pd + + +@dataclass +class MetricInput: + query: Optional[str] = None + queries: Optional[List[str]] = None + retrieval_gt_contents: Optional[List[List[str]]] = None + retrieved_contents: Optional[List[str]] = None + retrieval_gt: Optional[List[List[str]]] = None + retrieved_ids: Optional[List[str]] = None + prompt: Optional[str] = None + generated_texts: Optional[str] = None + generation_gt: Optional[List[str]] = None + generated_log_probs: Optional[List[float]] = None + + def is_fields_notnone(self, fields_to_check: List[str]) -> bool: + for field in fields_to_check: + actual_value = getattr(self, field) + + if actual_value is None: + return False + + try: + if not type_checks.get(type(actual_value), lambda _: False)( + actual_value + ): + return False + except Exception: + return False + + return True + + @classmethod + def from_dataframe(cls, qa_data: pd.DataFrame) -> List["MetricInput"]: + """ + Convert a pandas DataFrame into a list of MetricInput instances. + qa_data: pd.DataFrame: qa_data DataFrame containing metric data. + + :returns: List[MetricInput]: List of MetricInput objects created from DataFrame rows. 
+ """ + instances = [] + + for _, row in qa_data.iterrows(): + instance = cls() + + for attr_name in cls.__annotations__: + if attr_name in row: + value = row[attr_name] + + if isinstance(value, str): + setattr( + instance, + attr_name, + value.strip() if value.strip() != "" else None, + ) + elif isinstance(value, list): + setattr(instance, attr_name, value if len(value) > 0 else None) + else: + setattr(instance, attr_name, value) + + instances.append(instance) + + return instances + + @staticmethod + def _check_list(lst_or_arr: Union[List[Any], np.ndarray]) -> bool: + if isinstance(lst_or_arr, np.ndarray): + lst_or_arr = lst_or_arr.flatten().tolist() + + if len(lst_or_arr) == 0: + return False + + for item in lst_or_arr: + if item is None: + return False + + item_type = type(item) + + if item_type in type_checks: + if not type_checks[item_type](item): + return False + else: + return False + + return True + + +type_checks: Dict[type, Callable[[Any], bool]] = { + str: lambda x: len(x.strip()) > 0, + list: MetricInput._check_list, + np.ndarray: MetricInput._check_list, + int: lambda _: True, + float: lambda _: True, +} diff --git a/autorag-workspace/autorag/schema/module.py b/autorag-workspace/autorag/schema/module.py new file mode 100644 index 0000000..d21439d --- /dev/null +++ b/autorag-workspace/autorag/schema/module.py @@ -0,0 +1,24 @@ +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Callable, Dict + +from autorag.support import get_support_modules + + +@dataclass +class Module: + module_type: str + module_param: Dict + module: Callable = field(init=False) + + def __post_init__(self): + self.module = get_support_modules(self.module_type) + if self.module is None: + raise ValueError(f"Module type {self.module_type} is not supported.") + + @classmethod + def from_dict(cls, module_dict: Dict) -> "Module": + _module_dict = deepcopy(module_dict) + module_type = _module_dict.pop("module_type") + module_params = _module_dict + return cls(module_type, module_params) diff --git a/autorag-workspace/autorag/schema/node.py b/autorag-workspace/autorag/schema/node.py new file mode 100644 index 0000000..eab7ab6 --- /dev/null +++ b/autorag-workspace/autorag/schema/node.py @@ -0,0 +1,143 @@ +import itertools +import logging +from copy import deepcopy +from dataclasses import dataclass, field +from typing import Dict, List, Callable, Tuple, Any + +import pandas as pd + +from autorag.schema.module import Module +from autorag.support import get_support_nodes +from autorag.utils.util import make_combinations, explode, find_key_values + +logger = logging.getLogger("AutoRAG") + + +@dataclass +class Node: + node_type: str + strategy: Dict + node_params: Dict + modules: List[Module] + run_node: Callable = field(init=False) + + def __post_init__(self): + self.run_node = get_support_nodes(self.node_type) + if self.run_node is None: + raise ValueError(f"Node type {self.node_type} is not supported.") + + def get_param_combinations(self) -> Tuple[List[Callable], List[Dict]]: + """ + This method returns a combination of module and node parameters, also corresponding modules. + + :return: Each module and its module parameters. 
+ :rtype: Tuple[List[Callable], List[Dict]] + """ + + def make_single_combination(module: Module) -> List[Dict]: + input_dict = {**self.node_params, **module.module_param} + return make_combinations(input_dict) + + combinations = list(map(make_single_combination, self.modules)) + module_list, combination_list = explode(self.modules, combinations) + return list(map(lambda x: x.module, module_list)), combination_list + + @classmethod + def from_dict(cls, node_dict: Dict) -> "Node": + _node_dict = deepcopy(node_dict) + node_type = _node_dict.pop("node_type") + strategy = _node_dict.pop("strategy") + modules = list(map(lambda x: Module.from_dict(x), _node_dict.pop("modules"))) + node_params = _node_dict + return cls(node_type, strategy, node_params, modules) + + def run(self, previous_result: pd.DataFrame, node_line_dir: str) -> pd.DataFrame: + logger.info(f"Running node {self.node_type}...") + input_modules, input_params = self.get_param_combinations() + return self.run_node( + modules=input_modules, + module_params=input_params, + previous_result=previous_result, + node_line_dir=node_line_dir, + strategies=self.strategy, + ) + + +def extract_values(node: Node, key: str) -> List[str]: + """ + This function extract values from node's modules' module_param. + + :param node: The node you want to extract values from. + :param key: The key of module_param that you want to extract. + :return: The list of extracted values. + It removes duplicated elements automatically. + """ + + def extract_module_values(module: Module): + if key not in module.module_param: + return [] + value = module.module_param[key] + if isinstance(value, str) or isinstance(value, int): + return [value] + elif isinstance(value, list): + return value + else: + raise ValueError(f"{key} must be str,list or int, but got {type(value)}") + + values = list(map(extract_module_values, node.modules)) + return list(set(list(itertools.chain.from_iterable(values)))) + + +def extract_values_from_nodes(nodes: List[Node], key: str) -> List[str]: + """ + This function extract values from nodes' modules' module_param. + + :param nodes: The nodes you want to extract values from. + :param key: The key of module_param that you want to extract. + :return: The list of extracted values. + It removes duplicated elements automatically. + """ + values = list(map(lambda node: extract_values(node, key), nodes)) + return list(set(list(itertools.chain.from_iterable(values)))) + + +def extract_values_from_nodes_strategy(nodes: List[Node], key: str) -> List[Any]: + """ + This function extract values from nodes' strategy. + + :param nodes: The nodes you want to extract values from. + :param key: The key string that you want to extract. + :return: The list of extracted values. + It removes duplicated elements automatically. + """ + values = [] + for node in nodes: + value_list = find_key_values(node.strategy, key) + if value_list: + values.extend(value_list) + return values + + +def module_type_exists(nodes: List[Node], module_type: str) -> bool: + """ + This function check if the module type exists in the nodes. + + :param nodes: The nodes you want to check. + :param module_type: The module type you want to check. + :return: True if the module type exists in the nodes. 
+ """ + return any( + list( + map( + lambda node: any( + list( + map( + lambda module: module.module_type == module_type, + node.modules, + ) + ) + ), + nodes, + ) + ) + ) diff --git a/autorag-workspace/autorag/strategy.py b/autorag-workspace/autorag/strategy.py new file mode 100644 index 0000000..9afbad0 --- /dev/null +++ b/autorag-workspace/autorag/strategy.py @@ -0,0 +1,165 @@ +import functools +import time +from typing import List, Iterable, Tuple, Any, Optional, Callable + +import numpy as np +import pandas as pd + + +def measure_speed(func, *args, **kwargs): + """ + Method for measuring execution speed of the function. + """ + start_time = time.time() + result = func(*args, **kwargs) + end_time = time.time() + return result, end_time - start_time + + +def avoid_empty_result(return_index: List[int]): + """ + Decorator for avoiding empty results from the function. + When the func returns an empty result, it will return the origin results. + When the func returns a None, it will return the origin results. + When the return value is a tuple, it will check all the value or list is empty. + If so, it will return the origin results. + It keeps parameters at return_index of the function as the origin results. + + :param return_index: The index of the result to be returned when there is no result. + :return: The origin results or the results from the function. + """ + + def decorator_avoid_empty_result(func: Callable): + @functools.wraps(func) + def wrapper(*args, **kwargs) -> List: + func_result = func(*args, **kwargs) + if isinstance(func_result, tuple): + # if all the results are empty, return the origin results. + if all([not bool(result) for result in func_result]): + return [args[index] for index in return_index] + if not bool(func_result): + return [args[index] for index in return_index] + else: + return func_result + + return wrapper + + return decorator_avoid_empty_result + + +@avoid_empty_result([0, 3]) +def filter_by_threshold(results, value, threshold, metadatas=None) -> Tuple[List, List]: + """ + Filter results by value's threshold. + + :param results: The result list to be filtered. + :param value: The value list to be filtered. + It must have the same length with results. + :param threshold: The threshold value. + :param metadatas: The metadata of each result. + :return: Filtered list of results and filtered list of metadatas. + Metadatas will be returned even if you did not give input metadatas. + :rtype: Tuple[List, List] + """ + if metadatas is None: + metadatas = [None] * len(results) + assert len(results) == len(value), "results and value must have the same length." + try: + filtered_results, _, filtered_metadatas = zip( + *filter(lambda x: x[1] <= threshold, zip(results, value, metadatas)) + ) + except ValueError: + return [], [] + return list(filtered_results), list(filtered_metadatas) + + +def validate_strategy_inputs( + results: List[pd.DataFrame], + columns: Iterable[str], + metadatas: Optional[List[Any]] = None, +): + if metadatas is None: + metadatas = [None] * len(results) + assert len(results) == len( + metadatas + ), "results and module_filename must have the same length." + assert all( + [isinstance(result, pd.DataFrame) for result in results] + ), "results must be pd.DataFrame." + assert all( + [column in result.columns for result in results for column in columns] + ), "columns must be in the columns of results." 
+ return results, columns, metadatas + + +def select_best( + results: List[pd.DataFrame], + columns: Iterable[str], + metadatas: Optional[List[Any]] = None, + strategy_name: str = "mean", +) -> Tuple[pd.DataFrame, Any]: + strategy_func_dict = { + "mean": select_best_average, + "rank": select_best_rr, + "normalize_mean": select_normalize_mean, + } + if strategy_name not in strategy_func_dict: + raise ValueError( + f"Input strategy name {strategy_name} is not in {strategy_func_dict.keys()}" + ) + + return strategy_func_dict[strategy_name](results, columns, metadatas) + + +def select_best_average( + results: List[pd.DataFrame], + columns: Iterable[str], + metadatas: Optional[List[Any]] = None, +) -> Tuple[pd.DataFrame, Any]: + """ + Select the best result by average value among given columns. + + :param results: The list of results. + Each result must be pd.DataFrame. + :param columns: Column names to be averaged. + Standard to select the best result. + :param metadatas: The metadata of each result. + It will select one metadata with the best result. + :return: The best result and the best metadata. + The metadata will be returned even if you did not give input 'metadatas' parameter. + :rtype: Tuple[pd.DataFrame, Any] + """ + results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas) + each_average = [df[columns].mean(axis=1).mean() for df in results] + best_index = each_average.index(max(each_average)) + return results[best_index], metadatas[best_index] + + +def select_best_rr( + results: List[pd.DataFrame], + columns: Iterable[str], + metadatas: Optional[List[Any]] = None, +) -> Tuple[pd.DataFrame, Any]: + results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas) + each_average_df = pd.DataFrame( + [df[columns].mean(axis=0).to_dict() for df in results] + ) + rank_df = each_average_df.rank(ascending=False) + rr_df = rank_df.map(lambda x: 1 / x) + best_index = np.array(rr_df.sum(axis=1)).argmax() + return results[best_index], metadatas[best_index] + + +def select_normalize_mean( + results: List[pd.DataFrame], + columns: Iterable[str], + metadatas: Optional[List[Any]] = None, +) -> Tuple[pd.DataFrame, Any]: + results, columns, metadatas = validate_strategy_inputs(results, columns, metadatas) + each_mean_df = pd.DataFrame([df[columns].mean(axis=0).to_dict() for df in results]) + normalized_means = (each_mean_df - each_mean_df.min()) / ( + each_mean_df.max() - each_mean_df.min() + ) + normalized_mean_sums = normalized_means.sum(axis=1) + best_index = normalized_mean_sums.argmax() + return results[best_index], metadatas[best_index] diff --git a/autorag-workspace/autorag/support.py b/autorag-workspace/autorag/support.py new file mode 100644 index 0000000..9ca884a --- /dev/null +++ b/autorag-workspace/autorag/support.py @@ -0,0 +1,216 @@ +import importlib +from typing import Callable, Dict + + +def dynamically_find_function(key: str, target_dict: Dict) -> Callable: + if key in target_dict: + module_path, func_name = target_dict[key] + module = importlib.import_module(module_path) + func = getattr(module, func_name) + return func + else: + raise KeyError(f"Input module or node {key} is not supported.") + + +def get_support_modules(module_name: str) -> Callable: + support_modules = { + # parse + "langchain_parse": ("autorag.data.parse", "langchain_parse"), + "clova": ("autorag.data.parse.clova", "clova_ocr"), + "llamaparse": ("autorag.data.parse.llamaparse", "llama_parse"), + "table_hybrid_parse": ( + "autorag.data.parse.table_hybrid_parse", + 
"table_hybrid_parse", + ), + # chunk + "llama_index_chunk": ("autorag.data.chunk", "llama_index_chunk"), + "langchain_chunk": ("autorag.data.chunk", "langchain_chunk"), + # query_expansion + "query_decompose": ("autorag.nodes.queryexpansion", "QueryDecompose"), + "hyde": ("autorag.nodes.queryexpansion", "HyDE"), + "pass_query_expansion": ( + "autorag.nodes.queryexpansion", + "PassQueryExpansion", + ), + "multi_query_expansion": ( + "autorag.nodes.queryexpansion", + "MultiQueryExpansion", + ), + "QueryDecompose": ("autorag.nodes.queryexpansion", "QueryDecompose"), + "HyDE": ("autorag.nodes.queryexpansion", "HyDE"), + "PassQueryExpansion": ( + "autorag.nodes.queryexpansion", + "PassQueryExpansion", + ), + "MultiQueryExpansion": ( + "autorag.nodes.queryexpansion", + "MultiQueryExpansion", + ), + # retrieval + "bm25": ("autorag.nodes.retrieval", "BM25"), + "BM25": ("autorag.nodes.retrieval", "BM25"), + "vectordb": ("autorag.nodes.retrieval", "VectorDB"), + "VectorDB": ("autorag.nodes.retrieval", "VectorDB"), + "hybrid_rrf": ("autorag.nodes.retrieval", "HybridRRF"), + "HybridRRF": ("autorag.nodes.retrieval", "HybridRRF"), + "hybrid_cc": ("autorag.nodes.retrieval", "HybridCC"), + "HybridCC": ("autorag.nodes.retrieval", "HybridCC"), + # passage_augmenter + "prev_next_augmenter": ( + "autorag.nodes.passageaugmenter", + "PrevNextPassageAugmenter", + ), + "PrevNextPassageAugmenter": ( + "autorag.nodes.passageaugmenter", + "PrevNextPassageAugmenter", + ), + "pass_passage_augmenter": ( + "autorag.nodes.passageaugmenter", + "PassPassageAugmenter", + ), + "PassPassageAugmenter": ( + "autorag.nodes.passageaugmenter", + "PassPassageAugmenter", + ), + # passage_reranker /' 250313 added dragonkue2 /' + "monot5": ("autorag.nodes.passagereranker", "MonoT5"), + "MonoT5": ("autorag.nodes.passagereranker", "MonoT5"), + "tart": ("autorag.nodes.passagereranker.tart", "Tart"), + "Tart": ("autorag.nodes.passagereranker.tart", "Tart"), + "upr": ("autorag.nodes.passagereranker", "Upr"), + "Upr": ("autorag.nodes.passagereranker", "Upr"), + "dragonkue2": ("autorag.nodes.passagereranker", "DragonKue2"), + "DragonKue2": ("autorag.nodes.passagereranker", "DragonKue2"), + "koreranker": ("autorag.nodes.passagereranker", "KoReranker"), + "KoReranker": ("autorag.nodes.passagereranker", "KoReranker"), + "pass_reranker": ("autorag.nodes.passagereranker", "PassReranker"), + "PassReranker": ("autorag.nodes.passagereranker", "PassReranker"), + "cohere_reranker": ("autorag.nodes.passagereranker", "CohereReranker"), + "CohereReranker": ("autorag.nodes.passagereranker", "CohereReranker"), + "rankgpt": ("autorag.nodes.passagereranker", "RankGPT"), + "RankGPT": ("autorag.nodes.passagereranker", "RankGPT"), + "jina_reranker": ("autorag.nodes.passagereranker", "JinaReranker"), + "JinaReranker": ("autorag.nodes.passagereranker", "JinaReranker"), + "colbert_reranker": ("autorag.nodes.passagereranker", "ColbertReranker"), + "ColbertReranker": ("autorag.nodes.passagereranker", "ColbertReranker"), + "sentence_transformer_reranker": ( + "autorag.nodes.passagereranker", + "SentenceTransformerReranker", + ), + "SentenceTransformerReranker": ( + "autorag.nodes.passagereranker", + "SentenceTransformerReranker", + ), + "flag_embedding_reranker": ( + "autorag.nodes.passagereranker", + "FlagEmbeddingReranker", + ), + "FlagEmbeddingReranker": ( + "autorag.nodes.passagereranker", + "FlagEmbeddingReranker", + ), + "flag_embedding_llm_reranker": ( + "autorag.nodes.passagereranker", + "FlagEmbeddingLLMReranker", + ), + "FlagEmbeddingLLMReranker": ( + 
"autorag.nodes.passagereranker", + "FlagEmbeddingLLMReranker", + ), + "time_reranker": ("autorag.nodes.passagereranker", "TimeReranker"), + "TimeReranker": ("autorag.nodes.passagereranker", "TimeReranker"), + "openvino_reranker": ("autorag.nodes.passagereranker", "OpenVINOReranker"), + "OpenVINOReranker": ("autorag.nodes.passagereranker", "OpenVINOReranker"), + "voyageai_reranker": ("autorag.nodes.passagereranker", "VoyageAIReranker"), + "VoyageAIReranker": ("autorag.nodes.passagereranker", "VoyageAIReranker"), + "mixedbreadai_reranker": ( + "autorag.nodes.passagereranker", + "MixedbreadAIReranker", + ), + "MixedbreadAIReranker": ( + "autorag.nodes.passagereranker", + "MixedbreadAIReranker", + ), + "flashrank_reranker": ("autorag.nodes.passagereranker", "FlashRankReranker"), + "FlashRankReranker": ("autorag.nodes.passagereranker", "FlashRankReranker"), + # passage_filter + "pass_passage_filter": ("autorag.nodes.passagefilter", "PassPassageFilter"), + "similarity_threshold_cutoff": ( + "autorag.nodes.passagefilter", + "SimilarityThresholdCutoff", + ), + "similarity_percentile_cutoff": ( + "autorag.nodes.passagefilter", + "SimilarityPercentileCutoff", + ), + "recency_filter": ("autorag.nodes.passagefilter", "RecencyFilter"), + "threshold_cutoff": ("autorag.nodes.passagefilter", "ThresholdCutoff"), + "percentile_cutoff": ("autorag.nodes.passagefilter", "PercentileCutoff"), + "PassPassageFilter": ("autorag.nodes.passagefilter", "PassPassageFilter"), + "SimilarityThresholdCutoff": ( + "autorag.nodes.passagefilter", + "SimilarityThresholdCutoff", + ), + "SimilarityPercentileCutoff": ( + "autorag.nodes.passagefilter", + "SimilarityPercentileCutoff", + ), + "RecencyFilter": ("autorag.nodes.passagefilter", "RecencyFilter"), + "ThresholdCutoff": ("autorag.nodes.passagefilter", "ThresholdCutoff"), + "PercentileCutoff": ("autorag.nodes.passagefilter", "PercentileCutoff"), + # passage_compressor + "tree_summarize": ("autorag.nodes.passagecompressor", "TreeSummarize"), + "pass_compressor": ("autorag.nodes.passagecompressor", "PassCompressor"), + "refine": ("autorag.nodes.passagecompressor", "Refine"), + "longllmlingua": ("autorag.nodes.passagecompressor", "LongLLMLingua"), + "TreeSummarize": ("autorag.nodes.passagecompressor", "TreeSummarize"), + "Refine": ("autorag.nodes.passagecompressor", "Refine"), + "LongLLMLingua": ("autorag.nodes.passagecompressor", "LongLLMLingua"), + "PassCompressor": ("autorag.nodes.passagecompressor", "PassCompressor"), + # prompt_maker + "fstring": ("autorag.nodes.promptmaker", "Fstring"), + "long_context_reorder": ("autorag.nodes.promptmaker", "LongContextReorder"), + "window_replacement": ("autorag.nodes.promptmaker", "WindowReplacement"), + "Fstring": ("autorag.nodes.promptmaker", "Fstring"), + "LongContextReorder": ("autorag.nodes.promptmaker", "LongContextReorder"), + "WindowReplacement": ("autorag.nodes.promptmaker", "WindowReplacement"), + # generator + "llama_index_llm": ("autorag.nodes.generator", "LlamaIndexLLM"), + "vllm": ("autorag.nodes.generator", "Vllm"), + "openai_llm": ("autorag.nodes.generator", "OpenAILLM"), + "vllm_api": ("autorag.nodes.generator", "VllmAPI"), + "LlamaIndexLLM": ("autorag.nodes.generator", "LlamaIndexLLM"), + "Vllm": ("autorag.nodes.generator", "Vllm"), + "OpenAILLM": ("autorag.nodes.generator", "OpenAILLM"), + "VllmAPI": ("autorag.nodes.generator", "VllmAPI"), + } + return dynamically_find_function(module_name, support_modules) + + +def get_support_nodes(node_name: str) -> Callable: + support_nodes = { + "query_expansion": ( + 
"autorag.nodes.queryexpansion.run", + "run_query_expansion_node", + ), + "retrieval": ("autorag.nodes.retrieval.run", "run_retrieval_node"), + "generator": ("autorag.nodes.generator.run", "run_generator_node"), + "prompt_maker": ("autorag.nodes.promptmaker.run", "run_prompt_maker_node"), + "passage_filter": ( + "autorag.nodes.passagefilter.run", + "run_passage_filter_node", + ), + "passage_compressor": ( + "autorag.nodes.passagecompressor.run", + "run_passage_compressor_node", + ), + "passage_reranker": ( + "autorag.nodes.passagereranker.run", + "run_passage_reranker_node", + ), + "passage_augmenter": ( + "autorag.nodes.passageaugmenter.run", + "run_passage_augmenter_node", + ), + } + return dynamically_find_function(node_name, support_nodes) diff --git a/autorag-workspace/autorag/utils/__init__.py b/autorag-workspace/autorag/utils/__init__.py new file mode 100644 index 0000000..dd156b0 --- /dev/null +++ b/autorag-workspace/autorag/utils/__init__.py @@ -0,0 +1,8 @@ +from .preprocess import ( + validate_qa_dataset, + validate_corpus_dataset, + cast_qa_dataset, + cast_corpus_dataset, + validate_qa_from_corpus_dataset, +) +from .util import fetch_contents, result_to_dataframe, sort_by_scores diff --git a/autorag-workspace/autorag/utils/preprocess.py b/autorag-workspace/autorag/utils/preprocess.py new file mode 100644 index 0000000..ce2e84c --- /dev/null +++ b/autorag-workspace/autorag/utils/preprocess.py @@ -0,0 +1,149 @@ +from datetime import datetime + +import numpy as np +import pandas as pd + +from autorag.utils.util import preprocess_text + + +def validate_qa_dataset(df: pd.DataFrame): + columns = ["qid", "query", "retrieval_gt", "generation_gt"] + assert set(columns).issubset( + df.columns + ), f"df must have columns {columns}, but got {df.columns}" + + +def validate_corpus_dataset(df: pd.DataFrame): + columns = ["doc_id", "contents", "metadata"] + assert set(columns).issubset( + df.columns + ), f"df must have columns {columns}, but got {df.columns}" + + +def cast_qa_dataset(df: pd.DataFrame): + def cast_retrieval_gt(gt): + if isinstance(gt, str): + return [[gt]] + elif isinstance(gt, list): + if isinstance(gt[0], str): + return [gt] + elif isinstance(gt[0], list): + return gt + elif isinstance(gt[0], np.ndarray): + return cast_retrieval_gt(list(map(lambda x: x.tolist(), gt))) + else: + raise ValueError( + f"retrieval_gt must be str or list, but got {type(gt[0])}" + ) + elif isinstance(gt, np.ndarray): + return cast_retrieval_gt(gt.tolist()) + else: + raise ValueError(f"retrieval_gt must be str or list, but got {type(gt)}") + + def cast_generation_gt(gt): + if isinstance(gt, str): + return [gt] + elif isinstance(gt, list): + return gt + elif isinstance(gt, np.ndarray): + return cast_generation_gt(gt.tolist()) + else: + raise ValueError(f"generation_gt must be str or list, but got {type(gt)}") + + df = df.reset_index(drop=True) + validate_qa_dataset(df) + assert df["qid"].apply(lambda x: isinstance(x, str)).sum() == len( + df + ), "qid must be string type." + assert df["query"].apply(lambda x: isinstance(x, str)).sum() == len( + df + ), "query must be string type." 
+ df["retrieval_gt"] = df["retrieval_gt"].apply(cast_retrieval_gt) + df["generation_gt"] = df["generation_gt"].apply(cast_generation_gt) + df["query"] = df["query"].apply(preprocess_text) + df["generation_gt"] = df["generation_gt"].apply( + lambda x: list(map(preprocess_text, x)) + ) + return df + + +def cast_corpus_dataset(df: pd.DataFrame): + df = df.reset_index(drop=True) + validate_corpus_dataset(df) + + # drop rows that have empty contents + df = df[~df["contents"].apply(lambda x: x is None or x.isspace())] + + def make_datetime_metadata(x): + if x is None or x == {}: + return {"last_modified_datetime": datetime.now()} + elif x.get("last_modified_datetime") is None: + return {**x, "last_modified_datetime": datetime.now()} + else: + return x + + df["metadata"] = df["metadata"].apply(make_datetime_metadata) + + # check every metadata have a datetime key + assert sum( + df["metadata"].apply(lambda x: x.get("last_modified_datetime") is not None) + ) == len(df), "Every metadata must have a datetime key." + + def make_prev_next_id_metadata(x, id_type: str): + if x is None or x == {}: + return {id_type: None} + elif x.get(id_type) is None: + return {**x, id_type: None} + else: + return x + + df["metadata"] = df["metadata"].apply( + lambda x: make_prev_next_id_metadata(x, "prev_id") + ) + df["metadata"] = df["metadata"].apply( + lambda x: make_prev_next_id_metadata(x, "next_id") + ) + + df["contents"] = df["contents"].apply(preprocess_text) + + def normalize_unicode_metadata(metadata: dict): + result = {} + for key, value in metadata.items(): + if isinstance(value, str): + result[key] = preprocess_text(value) + else: + result[key] = value + return result + + df["metadata"] = df["metadata"].apply(normalize_unicode_metadata) + + # check every metadata have a prev_id, next_id key + assert all( + "prev_id" in metadata for metadata in df["metadata"] + ), "Every metadata must have a prev_id key." + assert all( + "next_id" in metadata for metadata in df["metadata"] + ), "Every metadata must have a next_id key." + + return df + + +def validate_qa_from_corpus_dataset(qa_df: pd.DataFrame, corpus_df: pd.DataFrame): + qa_ids = [] + for retrieval_gt in qa_df["retrieval_gt"].tolist(): + if isinstance(retrieval_gt, list) and ( + retrieval_gt[0] != [] or any(bool(g) is True for g in retrieval_gt) + ): + for gt in retrieval_gt: + qa_ids.extend(gt) + elif isinstance(retrieval_gt, np.ndarray) and retrieval_gt[0].size > 0: + for gt in retrieval_gt: + qa_ids.extend(gt) + + no_exist_ids = list( + filter(lambda qa_id: corpus_df[corpus_df["doc_id"] == qa_id].empty, qa_ids) + ) + + assert ( + len(no_exist_ids) == 0 + ), f"{len(no_exist_ids)} doc_ids in retrieval_gt do not exist in corpus_df." 
diff --git a/autorag-workspace/autorag/utils/util.py b/autorag-workspace/autorag/utils/util.py new file mode 100644 index 0000000..c868e1f --- /dev/null +++ b/autorag-workspace/autorag/utils/util.py @@ -0,0 +1,751 @@ +import ast +import asyncio +import datetime +import functools +import glob +import inspect +import itertools +import json +import logging +import os +import re +import string +from copy import deepcopy +from json import JSONDecoder +from typing import List, Callable, Dict, Optional, Any, Collection, Iterable + +from asyncio import AbstractEventLoop +import emoji +import numpy as np +import pandas as pd +import tiktoken +import unicodedata + +import yaml +from llama_index.embeddings.openai import OpenAIEmbedding +from pydantic import BaseModel as BM +from pydantic.v1 import BaseModel + +logger = logging.getLogger("AutoRAG") + + +def fetch_contents( + corpus_data: pd.DataFrame, ids: List[List[str]], column_name: str = "contents" +) -> List[List[Any]]: + def fetch_contents_pure( + ids: List[str], corpus_data: pd.DataFrame, column_name: str + ): + return list(map(lambda x: fetch_one_content(corpus_data, x, column_name), ids)) + + result = flatten_apply( + fetch_contents_pure, ids, corpus_data=corpus_data, column_name=column_name + ) + return result + + +def fetch_one_content( + corpus_data: pd.DataFrame, + id_: str, + column_name: str = "contents", + id_column_name: str = "doc_id", +) -> Any: + if isinstance(id_, str): + if id_ in ["", ""]: + return None + fetch_result = corpus_data[corpus_data[id_column_name] == id_] + if fetch_result.empty: + raise ValueError(f"doc_id: {id_} not found in corpus_data.") + else: + return fetch_result[column_name].iloc[0] + else: + return None + + +def result_to_dataframe(column_names: List[str]): + """ + Decorator for converting results to pd.DataFrame. + """ + + def decorator_result_to_dataframe(func: Callable): + @functools.wraps(func) + def wrapper(*args, **kwargs) -> pd.DataFrame: + results = func(*args, **kwargs) + if len(column_names) == 1: + df_input = {column_names[0]: results} + else: + df_input = { + column_name: result + for result, column_name in zip(results, column_names) + } + result_df = pd.DataFrame(df_input) + return result_df + + return wrapper + + return decorator_result_to_dataframe + + +def load_summary_file( + summary_path: str, dict_columns: Optional[List[str]] = None +) -> pd.DataFrame: + """ + Load a summary file from summary_path. + + :param summary_path: The path of the summary file. + :param dict_columns: The columns that are dictionary type. + You must fill this parameter if you want to load summary file properly. + Default is ['module_params']. + :return: The summary dataframe. 
+ """ + if not os.path.exists(summary_path): + raise ValueError(f"summary.csv does not exist in {summary_path}.") + summary_df = pd.read_csv(summary_path) + if dict_columns is None: + dict_columns = ["module_params"] + + if any([col not in summary_df.columns for col in dict_columns]): + raise ValueError(f"{dict_columns} must be in summary_df.columns.") + + def convert_dict(elem): + try: + return ast.literal_eval(elem) + except: + # convert datetime or date to its object (recency filter) + date_object = convert_datetime_string(elem) + if date_object is None: + raise ValueError( + f"Malformed dict received : {elem}\nCan't convert to dict properly" + ) + return {"threshold": date_object} + + summary_df[dict_columns] = summary_df[dict_columns].map(convert_dict) + return summary_df + + +def convert_datetime_string(s): + # Regex to extract datetime arguments from the string + m = re.search(r"(datetime|date)(\((\d+)(,\s*\d+)*\))", s) + if m: + args = ast.literal_eval(m.group(2)) + if m.group(1) == "datetime": + return datetime.datetime(*args) + elif m.group(1) == "date": + return datetime.date(*args) + return None + + +def make_combinations(target_dict: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Make combinations from target_dict. + The target_dict key value must be a string, + and the value can be a list of values or single value. + If generates all combinations of values from target_dict, + which means generating dictionaries that contain only one value for each key, + and all dictionaries will be different from each other. + + :param target_dict: The target dictionary. + :return: The list of generated dictionaries. + """ + dict_with_lists = dict( + map( + lambda x: (x[0], x[1] if isinstance(x[1], list) else [x[1]]), + target_dict.items(), + ) + ) + + def delete_duplicate(x): + def is_hashable(obj): + try: + hash(obj) + return True + except TypeError: + return False + + if any([not is_hashable(elem) for elem in x]): + # TODO: add duplication check for unhashable objects + return x + else: + return list(set(x)) + + dict_with_lists = dict( + map(lambda x: (x[0], delete_duplicate(x[1])), dict_with_lists.items()) + ) + combination = list(itertools.product(*dict_with_lists.values())) + combination_dicts = [ + dict(zip(dict_with_lists.keys(), combo)) for combo in combination + ] + return combination_dicts + + +def explode(index_values: Collection[Any], explode_values: Collection[Collection[Any]]): + """ + Explode index_values and explode_values. + The index_values and explode_values must have the same length. + It will flatten explode_values and keep index_values as a pair. + + :param index_values: The index values. + :param explode_values: The exploded values. + :return: Tuple of exploded index_values and exploded explode_values. + """ + assert len(index_values) == len( + explode_values + ), "Index values and explode values must have same length" + df = pd.DataFrame({"index_values": index_values, "explode_values": explode_values}) + df = df.explode("explode_values") + return df["index_values"].tolist(), df["explode_values"].tolist() + + +def replace_value_in_dict(target_dict: Dict, key: str, replace_value: Any) -> Dict: + """ + Replace the value of a certain key in target_dict. + If there is no targeted key in target_dict, it will return target_dict. + + :param target_dict: The target dictionary. + :param key: The key is to replace. + :param replace_value: The value to replace. + :return: The replaced dictionary. 
+ """ + replaced_dict = deepcopy(target_dict) + if key not in replaced_dict: + return replaced_dict + replaced_dict[key] = replace_value + return replaced_dict + + +def normalize_string(s: str) -> str: + """ + Taken from the official evaluation script for v1.1 of the SQuAD dataset. + Lower text and remove punctuation, articles, and extra whitespace. + """ + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def convert_string_to_tuple_in_dict(d): + """Recursively converts strings that start with '(' and end with ')' to tuples in a dictionary.""" + for key, value in d.items(): + # If the value is a dictionary, recurse + if isinstance(value, dict): + convert_string_to_tuple_in_dict(value) + # If the value is a list, iterate through its elements + elif isinstance(value, list): + for i, item in enumerate(value): + # If an item in the list is a dictionary, recurse + if isinstance(item, dict): + convert_string_to_tuple_in_dict(item) + # If an item in the list is a string matching the criteria, convert it to a tuple + elif ( + isinstance(item, str) + and item.startswith("(") + and item.endswith(")") + ): + value[i] = ast.literal_eval(item) + # If the value is a string matching the criteria, convert it to a tuple + elif isinstance(value, str) and value.startswith("(") and value.endswith(")"): + d[key] = ast.literal_eval(value) + + return d + + +def convert_env_in_dict(d: Dict): + """ + Recursively converts environment variable string in a dictionary to actual environment variable. + + :param d: The dictionary to convert. + :return: The converted dictionary. + """ + env_pattern = re.compile(r".*?\${(.*?)}.*?") + + def convert_env(val: str): + matches = env_pattern.findall(val) + for match in matches: + val = val.replace(f"${{{match}}}", os.environ.get(match, "")) + return val + + for key, value in d.items(): + if isinstance(value, dict): + convert_env_in_dict(value) + elif isinstance(value, list): + for i, item in enumerate(value): + if isinstance(item, dict): + convert_env_in_dict(item) + elif isinstance(item, str): + value[i] = convert_env(item) + elif isinstance(value, str): + d[key] = convert_env(value) + return d + + +async def process_batch(tasks, batch_size: int = 64) -> List[Any]: + """ + Processes tasks in batches asynchronously. + + :param tasks: A list of no-argument functions or coroutines to be executed. + :param batch_size: The number of tasks to process in a single batch. + Default is 64. + :return: A list of results from the processed tasks. + """ + results = [] + + for i in range(0, len(tasks), batch_size): + batch = tasks[i : i + batch_size] + batch_results = await asyncio.gather(*batch) + results.extend(batch_results) + + return results + + +def make_batch(elems: List[Any], batch_size: int) -> List[List[Any]]: + """ + Make a batch of elems with batch_size. 
+ """ + return [elems[i : i + batch_size] for i in range(0, len(elems), batch_size)] + + +def save_parquet_safe(df: pd.DataFrame, filepath: str, upsert: bool = False): + output_file_dir = os.path.dirname(filepath) + if not os.path.isdir(output_file_dir): + raise NotADirectoryError(f"directory {output_file_dir} not found.") + if not filepath.endswith("parquet"): + raise NameError( + f'file path: {filepath} filename extension need to be ".parquet"' + ) + if os.path.exists(filepath) and not upsert: + raise FileExistsError( + f"file {filepath} already exists." + "Set upsert True if you want to overwrite the file." + ) + + df.to_parquet(filepath, index=False) + + +def openai_truncate_by_token( + texts: List[str], token_limit: int, model_name: str +) -> List[str]: + try: + tokenizer = tiktoken.encoding_for_model(model_name) + except KeyError: + # This is not a real OpenAI model + return texts + + def truncate_text(text: str, limit: int, tokenizer): + tokens = tokenizer.encode(text) + if len(tokens) <= limit: + return text + truncated_text = tokenizer.decode(tokens[:limit]) + return truncated_text + + return list(map(lambda x: truncate_text(x, token_limit, tokenizer), texts)) + + +def reconstruct_list(flat_list: List[Any], lengths: List[int]) -> List[List[Any]]: + result = [] + start = 0 + for length in lengths: + result.append(flat_list[start : start + length]) + start += length + return result + + +def flatten_apply( + func: Callable, nested_list: List[List[Any]], **kwargs +) -> List[List[Any]]: + """ + This function flattens the input list and applies the function to the elements. + After that, it reconstructs the list to the original shape. + Its speciality is that the first dimension length of the list can be different from each other. + + :param func: The function that applies to the flattened list. + :param nested_list: The nested list to be flattened. + :return: The list that is reconstructed after applying the function. + """ + df = pd.DataFrame({"col1": nested_list}) + df = df.explode("col1") + df["result"] = func(df["col1"].tolist(), **kwargs) + return df.groupby(level=0, sort=False)["result"].apply(list).tolist() + + +async def aflatten_apply( + func: Callable, nested_list: List[List[Any]], **kwargs +) -> List[List[Any]]: + """ + This function flattens the input list and applies the function to the elements. + After that, it reconstructs the list to the original shape. + Its speciality is that the first dimension length of the list can be different from each other. + + :param func: The function that applies to the flattened list. + :param nested_list: The nested list to be flattened. + :return: The list that is reconstructed after applying the function. + """ + df = pd.DataFrame({"col1": nested_list}) + df = df.explode("col1") + df["result"] = await func(df["col1"].tolist(), **kwargs) + return df.groupby(level=0, sort=False)["result"].apply(list).tolist() + + +def sort_by_scores(row, reverse=True): + """ + Sorts each row by 'scores' column. + The input column names must be 'contents', 'ids', and 'scores'. + And its elements must be list type. 
+ """ + results = sorted( + zip(row["contents"], row["ids"], row["scores"]), + key=lambda x: x[2], + reverse=reverse, + ) + reranked_contents, reranked_ids, reranked_scores = zip(*results) + return list(reranked_contents), list(reranked_ids), list(reranked_scores) + + +def select_top_k(df, column_names: List[str], top_k: int): + for column_name in column_names: + df[column_name] = df[column_name].apply(lambda x: x[:top_k]) + return df + + +def filter_dict_keys(dict_, keys: List[str]): + result = {} + for key in keys: + if key in dict_: + result[key] = dict_[key] + else: + raise KeyError(f"Key '{key}' not found in dictionary.") + return result + + +def split_dataframe(df, chunk_size): + num_chunks = ( + len(df) // chunk_size + 1 + if len(df) % chunk_size != 0 + else len(df) // chunk_size + ) + result = list( + map(lambda x: df[x * chunk_size : (x + 1) * chunk_size], range(num_chunks)) + ) + result = list(map(lambda x: x.reset_index(drop=True), result)) + return result + + +def find_trial_dir(project_dir: str) -> List[str]: + # Pattern to match directories named with numbers + pattern = os.path.join(project_dir, "[0-9]*") + all_entries = glob.glob(pattern) + + # Filter out only directories + trial_dirs = [ + entry + for entry in all_entries + if os.path.isdir(entry) and entry.split(os.sep)[-1].isdigit() + ] + + return trial_dirs + + +def find_node_summary_files(trial_dir: str) -> List[str]: + # Find all summary.csv files recursively + all_summary_files = glob.glob( + os.path.join(trial_dir, "**", "summary.csv"), recursive=True + ) + + # Filter out files that are at a lower directory level + filtered_files = [ + f for f in all_summary_files if f.count(os.sep) > trial_dir.count(os.sep) + 2 + ] + + return filtered_files + + +def preprocess_text(text: str) -> str: + return normalize_unicode(demojize(text)) + + +def demojize(text: str) -> str: + return emoji.demojize(text) + + +def normalize_unicode(text: str) -> str: + return unicodedata.normalize("NFC", text) + + +def dict_to_markdown(d, level=1): + """ + Convert a dictionary to a Markdown formatted string. 
+ + :param d: Dictionary to convert + :param level: Current level of heading (used for nested dictionaries) + :return: Markdown formatted string + """ + markdown = "" + for key, value in d.items(): + if isinstance(value, dict): + markdown += f"{'#' * level} {key}\n" + markdown += dict_to_markdown(value, level + 1) + elif isinstance(value, list): + markdown += f"{'#' * level} {key}\n" + for item in value: + if isinstance(item, dict): + markdown += dict_to_markdown(item, level + 1) + else: + markdown += f"- {item}\n" + else: + markdown += f"{'#' * level} {key}\n{value}\n" + return markdown + + +def dict_to_markdown_table(data, key_column_name: str, value_column_name: str): + # Check if the input is a dictionary + if not isinstance(data, dict): + raise ValueError("Input must be a dictionary") + + # Create the header of the table + header = f"| {key_column_name} | {value_column_name} |\n| :---: | :-----: |\n" + + # Create the rows of the table + rows = "" + for key, value in data.items(): + rows += f"| {key} | {value} |\n" + + # Combine header and rows + markdown_table = header + rows + return markdown_table + + +def embedding_query_content( + queries: List[str], + contents_list: List[List[str]], + embedding_model: Optional[str] = None, + batch: int = 128, +): + flatten_contents = list(itertools.chain.from_iterable(contents_list)) + + openai_embedding_limit = 8000 # all openai embedding model has 8000 max token input + if isinstance(embedding_model, OpenAIEmbedding): + queries = openai_truncate_by_token( + queries, openai_embedding_limit, embedding_model.model_name + ) + flatten_contents = openai_truncate_by_token( + flatten_contents, openai_embedding_limit, embedding_model.model_name + ) + + # Embedding using batch + embedding_model.embed_batch_size = batch + query_embeddings = embedding_model.get_text_embedding_batch(queries) + + content_lengths = list(map(len, contents_list)) + content_embeddings_flatten = embedding_model.get_text_embedding_batch( + flatten_contents + ) + content_embeddings = reconstruct_list(content_embeddings_flatten, content_lengths) + return query_embeddings, content_embeddings + + +def to_list(item): + """Recursively convert collections to Python lists.""" + if isinstance(item, np.ndarray): + # Convert numpy array to list and recursively process each element + return [to_list(sub_item) for sub_item in item.tolist()] + elif isinstance(item, pd.Series): + # Convert pandas Series to list and recursively process each element + return [to_list(sub_item) for sub_item in item.tolist()] + elif isinstance(item, Iterable) and not isinstance( + item, (str, bytes, BaseModel, BM) + ): + # Recursively process each element in other iterables + return [to_list(sub_item) for sub_item in item] + else: + return item + + +def convert_inputs_to_list(func): + """Decorator to convert all function inputs to Python lists.""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + new_args = [to_list(arg) for arg in args] + new_kwargs = {k: to_list(v) for k, v in kwargs.items()} + return func(*new_args, **new_kwargs) + + return wrapper + + +def get_best_row( + summary_df: pd.DataFrame, best_column_name: str = "is_best" +) -> pd.Series: + """ + From the summary dataframe, find the best result row by 'is_best' column and return it. + + :param summary_df: Summary dataframe created by AutoRAG. + :param best_column_name: The column name that indicates the best result. + Default is 'is_best'. + You don't have to change this unless the column name is different. 
+ :return: Best row pandas Series instance. + """ + bests = summary_df.loc[summary_df[best_column_name]] + assert len(bests) == 1, "There must be only one best result." + return bests.iloc[0] + + +def get_event_loop() -> AbstractEventLoop: + """ + Get asyncio event loop safely. + """ + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + +def find_key_values(data, target_key: str) -> List[Any]: + """ + Recursively find all values for a specific key in a nested dictionary or list. + + :param data: The dictionary or list to search. + :param target_key: The key to search for. + :return: A list of values associated with the target key. + """ + values = [] + + if isinstance(data, dict): + for key, value in data.items(): + if key == target_key: + values.append(value) + if isinstance(value, (dict, list)): + values.extend(find_key_values(value, target_key)) + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + values.extend(find_key_values(item, target_key)) + + return values + + +def pop_params(func: Callable, kwargs: Dict) -> Dict: + """ + Pop parameters from the given func and return them. + It automatically deletes the parameters like "self" or "cls". + + :param func: The function to pop parameters. + :param kwargs: kwargs to pop parameters. + :return: The popped parameters. + """ + ignore_params = ["self", "cls"] + target_params = list(inspect.signature(func).parameters.keys()) + target_params = list(filter(lambda x: x not in ignore_params, target_params)) + + init_params = {} + kwargs_keys = list(kwargs.keys()) + for key in kwargs_keys: + if key in target_params: + init_params[key] = kwargs.pop(key) + return init_params + + +def apply_recursive(func, data): + """ + Recursively apply a function to all elements in a list, tuple, set, np.ndarray, or pd.Series and return as List. + + :param func: Function to apply to each element. + :param data: List or nested list. + :return: List with the function applied to each element. + """ + if ( + isinstance(data, list) + or isinstance(data, tuple) + or isinstance(data, set) + or isinstance(data, np.ndarray) + or isinstance(data, pd.Series) + ): + return [apply_recursive(func, item) for item in data] + else: + return func(data) + + +def empty_cuda_cache(): + try: + import torch + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + except ImportError: + pass + + +def load_yaml_config(yaml_path: str) -> Dict: + """ + Load a YAML configuration file for AutoRAG. + It contains safe loading, converting string to tuple, and insert environment variables. + + :param yaml_path: The path of the YAML configuration file. + :return: The loaded configuration dictionary. + """ + if not os.path.exists(yaml_path): + raise ValueError(f"YAML file {yaml_path} does not exist.") + with open(yaml_path, "r", encoding="utf-8") as stream: + try: + yaml_dict = yaml.safe_load(stream) + except yaml.YAMLError as exc: + raise ValueError(f"YAML file {yaml_path} could not be loaded.") from exc + + yaml_dict = convert_string_to_tuple_in_dict(yaml_dict) + yaml_dict = convert_env_in_dict(yaml_dict) + return yaml_dict + + +def decode_multiple_json_from_bytes(byte_data: bytes) -> list: + """ + Decode multiple JSON objects from bytes received from SSE server. 
+ + Args: + byte_data: Bytes containing one or more JSON objects + + Returns: + List of decoded JSON objects + """ + # Decode bytes to string + try: + text_data = byte_data.decode("utf-8").strip() + except UnicodeDecodeError: + raise ValueError("Invalid byte data: Unable to decode as UTF-8") + + # Initialize decoder and result list + decoder = JSONDecoder() + result = [] + + # Keep track of position in string + pos = 0 + text_data = text_data.strip() + + while pos < len(text_data): + try: + # Try to decode next JSON object + json_obj, json_end = decoder.raw_decode(text_data[pos:]) + result.append(json_obj) + + # Move position to end of current JSON object + pos += json_end + + # Skip any whitespace + while pos < len(text_data) and text_data[pos].isspace(): + pos += 1 + + except json.JSONDecodeError: + # If we can't decode at current position, move forward one character + pos += 1 + + return result diff --git a/autorag-workspace/autorag/validator.py b/autorag-workspace/autorag/validator.py new file mode 100644 index 0000000..4fef6cd --- /dev/null +++ b/autorag-workspace/autorag/validator.py @@ -0,0 +1,98 @@ +import itertools +import logging +import os +import tempfile + +import pandas as pd + +from autorag.evaluator import Evaluator +from autorag.utils import ( + cast_qa_dataset, + cast_corpus_dataset, + validate_qa_from_corpus_dataset, +) + +logger = logging.getLogger("AutoRAG") + + +class Validator: + def __init__(self, qa_data_path: str, corpus_data_path: str): + """ + Initialize a Validator object. + + :param qa_data_path: The path to the QA dataset. + Must be parquet file. + :param corpus_data_path: The path to the corpus dataset. + Must be parquet file. + """ + # validate data paths + if not os.path.exists(qa_data_path): + raise ValueError(f"QA data path {qa_data_path} does not exist.") + if not os.path.exists(corpus_data_path): + raise ValueError(f"Corpus data path {corpus_data_path} does not exist.") + if not qa_data_path.endswith(".parquet"): + raise ValueError(f"QA data path {qa_data_path} is not a parquet file.") + if not corpus_data_path.endswith(".parquet"): + raise ValueError( + f"Corpus data path {corpus_data_path} is not a parquet file." + ) + self.qa_data = pd.read_parquet(qa_data_path, engine="pyarrow") + self.corpus_data = pd.read_parquet(corpus_data_path, engine="pyarrow") + self.qa_data = cast_qa_dataset(self.qa_data) + self.corpus_data = cast_corpus_dataset(self.corpus_data) + + def validate(self, yaml_path: str, qa_cnt: int = 5, random_state: int = 42): + # Determine the sample size and log a warning if qa_cnt is larger than available records + available_records = len(self.qa_data) + safe_sample_size = min(qa_cnt, available_records) # 먼저 safe_sample_size 계산 + + if safe_sample_size < qa_cnt: + logger.warning( + f"Minimal Requested sample size ({qa_cnt}) is larger than available records ({available_records}). " + f"Sampling will be limited to {safe_sample_size} records. 
" + ) + + # safe sample QA data + sample_qa_df = self.qa_data.sample( + n=safe_sample_size, random_state=random_state + ) + sample_qa_df.reset_index(drop=True, inplace=True) + + # get doc_id + temp_qa_df = sample_qa_df.copy(deep=True) + flatten_retrieval_gts = ( + temp_qa_df["retrieval_gt"] + .apply(lambda x: list(itertools.chain.from_iterable(x))) + .tolist() + ) + target_doc_ids = list(itertools.chain.from_iterable(flatten_retrieval_gts)) + + # make sample corpus data + sample_corpus_df = self.corpus_data.loc[ + self.corpus_data["doc_id"].isin(target_doc_ids) + ] + sample_corpus_df.reset_index(drop=True, inplace=True) + + validate_qa_from_corpus_dataset(sample_qa_df, sample_corpus_df) + + # start Evaluate at temp project directory + with ( + tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as qa_path, + tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as corpus_path, + tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_project_dir, + ): + sample_qa_df.to_parquet(qa_path.name, index=False) + sample_corpus_df.to_parquet(corpus_path.name, index=False) + + evaluator = Evaluator( + qa_data_path=qa_path.name, + corpus_data_path=corpus_path.name, + project_dir=temp_project_dir, + ) + evaluator.start_trial(yaml_path, skip_validation=True) + qa_path.close() + corpus_path.close() + os.unlink(qa_path.name) + os.unlink(corpus_path.name) + + logger.info("Validation complete.") diff --git a/autorag-workspace/autorag/vectordb/__init__.py b/autorag-workspace/autorag/vectordb/__init__.py new file mode 100644 index 0000000..e26fc63 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/__init__.py @@ -0,0 +1,75 @@ +import os +from typing import List + +from autorag.support import dynamically_find_function +from autorag.utils.util import load_yaml_config +from autorag.vectordb.base import BaseVectorStore + + +def get_support_vectordb(vectordb_name: str): + support_vectordb = { + "chroma": ("autorag.vectordb.chroma", "Chroma"), + "Chroma": ("autorag.vectordb.chroma", "Chroma"), + "milvus": ("autorag.vectordb.milvus", "Milvus"), + "Milvus": ("autorag.vectordb.milvus", "Milvus"), + "weaviate": ("autorag.vectordb.weaviate", "Weaviate"), + "Weaviate": ("autorag.vectordb.weaviate", "Weaviate"), + "pinecone": ("autorag.vectordb.pinecone", "Pinecone"), + "Pinecone": ("autorag.vectordb.pinecone", "Pinecone"), + "couchbase": ("autorag.vectordb.couchbase", "Couchbase"), + "Couchbase": ("autorag.vectordb.couchbase", "Couchbase"), + "qdrant": ("autorag.vectordb.qdrant", "Qdrant"), + "Qdrant": ("autorag.vectordb.qdrant", "Qdrant"), + } + return dynamically_find_function(vectordb_name, support_vectordb) + + +def load_vectordb(vectordb_name: str, **kwargs): + vectordb = get_support_vectordb(vectordb_name) + return vectordb(**kwargs) + + +def load_vectordb_from_yaml(yaml_path: str, vectordb_name: str, project_dir: str): + config_dict = load_yaml_config(yaml_path) + vectordb_list = config_dict.get("vectordb", []) + if len(vectordb_list) == 0 or vectordb_name == "default": + chroma_path = os.path.join(project_dir, "resources", "chroma") + return load_vectordb( + "chroma", + client_type="persistent", + embedding_model="openai", + collection_name="openai", + path=chroma_path, + ) + + target_dict = list(filter(lambda x: x["name"] == vectordb_name, vectordb_list)) + target_dict[0].pop("name") # delete a name key + target_vectordb_name = target_dict[0].pop("db_type") + target_vectordb_params = target_dict[0] + return load_vectordb(target_vectordb_name, **target_vectordb_params) + + +def 
load_all_vectordb_from_yaml( + yaml_path: str, project_dir: str +) -> List[BaseVectorStore]: + config_dict = load_yaml_config(yaml_path) + vectordb_list = config_dict.get("vectordb", []) + if len(vectordb_list) == 0: + chroma_path = os.path.join(project_dir, "resources", "chroma") + return [ + load_vectordb( + "chroma", + client_type="persistent", + embedding_model="openai", + collection_name="openai", + path=chroma_path, + ) + ] + + result_vectordbs = [] + for vectordb_dict in vectordb_list: + _ = vectordb_dict.pop("name") + vectordb_type = vectordb_dict.pop("db_type") + vectordb = load_vectordb(vectordb_type, **vectordb_dict) + result_vectordbs.append(vectordb) + return result_vectordbs diff --git a/autorag-workspace/autorag/vectordb/base.py b/autorag-workspace/autorag/vectordb/base.py new file mode 100644 index 0000000..ca89c17 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/base.py @@ -0,0 +1,66 @@ +from abc import abstractmethod +from typing import List, Tuple, Union + +from llama_index.embeddings.openai import OpenAIEmbedding + +from autorag.utils.util import openai_truncate_by_token +from autorag.embedding.base import EmbeddingModel + + +class BaseVectorStore: + support_similarity_metrics = ["l2", "ip", "cosine"] + + def __init__( + self, + embedding_model: Union[str, List[dict]], + similarity_metric: str = "cosine", + embedding_batch: int = 100, + ): + self.embedding = EmbeddingModel.load(embedding_model)() + self.embedding_batch = embedding_batch + self.embedding.embed_batch_size = embedding_batch + assert ( + similarity_metric in self.support_similarity_metrics + ), f"search method {similarity_metric} is not supported" + self.similarity_metric = similarity_metric + + @abstractmethod + async def add( + self, + ids: List[str], + texts: List[str], + ): + pass + + @abstractmethod + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + pass + + @abstractmethod + async def fetch(self, ids: List[str]) -> List[List[float]]: + """ + Fetch the embeddings of the ids. + """ + pass + + @abstractmethod + async def is_exist(self, ids: List[str]) -> List[bool]: + """ + Check if the ids exist in the Vector DB. 
+ """ + pass + + @abstractmethod + async def delete(self, ids: List[str]): + pass + + def truncated_inputs(self, inputs: List[str]) -> List[str]: + if isinstance(self.embedding, OpenAIEmbedding): + openai_embedding_limit = 8000 + results = openai_truncate_by_token( + inputs, openai_embedding_limit, self.embedding.model_name + ) + return results + return inputs diff --git a/autorag-workspace/autorag/vectordb/chroma.py b/autorag-workspace/autorag/vectordb/chroma.py new file mode 100644 index 0000000..0d9f523 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/chroma.py @@ -0,0 +1,117 @@ +from typing import List, Optional, Dict, Tuple, Union + +from chromadb import ( + EphemeralClient, + PersistentClient, + DEFAULT_TENANT, + DEFAULT_DATABASE, + CloudClient, + AsyncHttpClient, +) +from chromadb.api.models.AsyncCollection import AsyncCollection +from chromadb.api.types import IncludeEnum, QueryResult + +from autorag.utils.util import apply_recursive +from autorag.vectordb.base import BaseVectorStore + + +class Chroma(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + collection_name: str, + embedding_batch: int = 100, + client_type: str = "persistent", + similarity_metric: str = "cosine", + path: str = None, + host: str = "localhost", + port: int = 8000, + ssl: bool = False, + headers: Optional[Dict[str, str]] = None, + api_key: Optional[str] = None, + tenant: str = DEFAULT_TENANT, + database: str = DEFAULT_DATABASE, + ): + super().__init__(embedding_model, similarity_metric, embedding_batch) + if client_type == "ephemeral": + self.client = EphemeralClient(tenant=tenant, database=database) + elif client_type == "persistent": + assert path is not None, "path must be provided for persistent client" + self.client = PersistentClient(path=path, tenant=tenant, database=database) + elif client_type == "http": + self.client = AsyncHttpClient( + host=host, + port=port, + ssl=ssl, + headers=headers, + tenant=tenant, + database=database, + ) + elif client_type == "cloud": + self.client = CloudClient( + tenant=tenant, + database=database, + api_key=api_key, + ) + else: + raise ValueError( + f"client_type {client_type} is not supported\n" + "supported client types are: ephemeral, persistent, http, cloud" + ) + + self.collection = self.client.get_or_create_collection( + name=collection_name, + metadata={"hnsw:space": similarity_metric}, + ) + + async def add(self, ids: List[str], texts: List[str]): + texts = self.truncated_inputs(texts) + text_embeddings = await self.embedding.aget_text_embedding_batch(texts) + if isinstance(self.collection, AsyncCollection): + await self.collection.add(ids=ids, embeddings=text_embeddings) + else: + self.collection.add(ids=ids, embeddings=text_embeddings) + + async def fetch(self, ids: List[str]) -> List[List[float]]: + if isinstance(self.collection, AsyncCollection): + fetch_result = await self.collection.get( + ids, include=[IncludeEnum.embeddings] + ) + else: + fetch_result = self.collection.get(ids, include=[IncludeEnum.embeddings]) + fetch_embeddings = fetch_result["embeddings"] + return fetch_embeddings + + async def is_exist(self, ids: List[str]) -> List[bool]: + if isinstance(self.collection, AsyncCollection): + fetched_result = await self.collection.get(ids, include=[]) + else: + fetched_result = self.collection.get(ids, include=[]) + existed_ids = fetched_result["ids"] + return list(map(lambda x: x in existed_ids, ids)) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], 
List[List[float]]]: + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + if isinstance(self.collection, AsyncCollection): + query_result: QueryResult = await self.collection.query( + query_embeddings=query_embeddings, n_results=top_k + ) + else: + query_result: QueryResult = self.collection.query( + query_embeddings=query_embeddings, n_results=top_k + ) + ids = query_result["ids"] + scores = query_result["distances"] + scores = apply_recursive(lambda x: 1 - x, scores) + return ids, scores + + async def delete(self, ids: List[str]): + if isinstance(self.collection, AsyncCollection): + await self.collection.delete(ids) + else: + self.collection.delete(ids) diff --git a/autorag-workspace/autorag/vectordb/couchbase.py b/autorag-workspace/autorag/vectordb/couchbase.py new file mode 100644 index 0000000..f1ef135 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/couchbase.py @@ -0,0 +1,218 @@ +import logging + +from datetime import timedelta + +from couchbase.auth import PasswordAuthenticator +from couchbase.cluster import Cluster +from couchbase.options import ClusterOptions + +from typing import List, Tuple, Optional, Union + +from autorag.utils.util import make_batch +from autorag.vectordb import BaseVectorStore + +logger = logging.getLogger("AutoRAG") + + +class Couchbase(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + bucket_name: str, + scope_name: str, + collection_name: str, + index_name: str, + embedding_batch: int = 100, + connection_string: str = "", + username: str = "", + password: str = "", + ingest_batch: int = 100, + text_key: Optional[str] = "text", + embedding_key: Optional[str] = "embedding", + scoped_index: bool = True, + ): + super().__init__( + embedding_model=embedding_model, + similarity_metric="ip", + embedding_batch=embedding_batch, + ) + + self.index_name = index_name + self.bucket_name = bucket_name + self.scope_name = scope_name + self.collection_name = collection_name + self.scoped_index = scoped_index + self.text_key = text_key + self.embedding_key = embedding_key + self.ingest_batch = ingest_batch + + auth = PasswordAuthenticator(username, password) + self.cluster = Cluster(connection_string, ClusterOptions(auth)) + + # Wait until the cluster is ready for use. + self.cluster.wait_until_ready(timedelta(seconds=5)) + + # Check if the bucket exists + if not self._check_bucket_exists(): + raise ValueError( + f"Bucket {self.bucket_name} does not exist. " + " Please create the bucket before searching." + ) + + try: + self.bucket = self.cluster.bucket(self.bucket_name) + self.scope = self.bucket.scope(self.scope_name) + self.collection = self.scope.collection(self.collection_name) + except Exception as e: + raise ValueError( + "Error connecting to couchbase. " + "Please check the connection and credentials." + ) from e + + # Check if the index exists. 
Throws ValueError if it doesn't + try: + self._check_index_exists() + except Exception: + raise + + # Reinitialize to ensure a consistent state + self.bucket = self.cluster.bucket(self.bucket_name) + self.scope = self.bucket.scope(self.scope_name) + self.collection = self.scope.collection(self.collection_name) + + async def add(self, ids: List[str], texts: List[str]): + from couchbase.exceptions import DocumentExistsException + + texts = self.truncated_inputs(texts) + text_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(texts) + + documents_to_insert = [] + for _id, text, embedding in zip(ids, texts, text_embeddings): + doc = { + self.text_key: text, + self.embedding_key: embedding, + } + documents_to_insert.append({_id: doc}) + + batch_documents_to_insert = make_batch(documents_to_insert, self.ingest_batch) + + for batch in batch_documents_to_insert: + insert_batch = {} + for doc in batch: + insert_batch.update(doc) + try: + self.collection.upsert_multi(insert_batch) + except DocumentExistsException as e: + logger.debug(f"Document already exists: {e}") + + async def fetch(self, ids: List[str]) -> List[List[float]]: + # Fetch vectors by IDs + fetched_result = self.collection.get_multi(ids) + fetched_vectors = { + k: v.value[f"{self.embedding_key}"] + for k, v in fetched_result.results.items() + } + return list(map(lambda x: fetched_vectors[x], ids)) + + async def is_exist(self, ids: List[str]) -> List[bool]: + existed_result = self.collection.exists_multi(ids) + existed_ids = {k: v.exists for k, v in existed_result.results.items()} + return list(map(lambda x: existed_ids[x], ids)) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + import couchbase.search as search + from couchbase.options import SearchOptions + from couchbase.vector_search import VectorQuery, VectorSearch + + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + + ids, scores = [], [] + for query_embedding in query_embeddings: + # Create Search Request + search_req = search.SearchRequest.create( + VectorSearch.from_vector_query( + VectorQuery( + self.embedding_key, + query_embedding, + top_k, + ) + ) + ) + + # Search + if self.scoped_index: + search_iter = self.scope.search( + self.index_name, + search_req, + SearchOptions(limit=top_k), + ) + + else: + search_iter = self.cluster.search( + self.index_name, + search_req, + SearchOptions(limit=top_k), + ) + + # Parse the search results + # search_iter.rows() can only be iterated once. + id_list, score_list = [], [] + for result in search_iter.rows(): + id_list.append(result.id) + score_list.append(result.score) + + ids.append(id_list) + scores.append(score_list) + + return ids, scores + + async def delete(self, ids: List[str]): + self.collection.remove_multi(ids) + + def _check_bucket_exists(self) -> bool: + """Check if the bucket exists in the linked Couchbase cluster. + + Returns: + True if the bucket exists + """ + bucket_manager = self.cluster.buckets() + try: + bucket_manager.get_bucket(self.bucket_name) + return True + except Exception as e: + logger.debug("Error checking if bucket exists:", e) + return False + + def _check_index_exists(self) -> bool: + """Check if the Search index exists in the linked Couchbase cluster + Returns: + bool: True if the index exists, False otherwise. + Raises a ValueError if the index does not exist. 
+ """ + if self.scoped_index: + all_indexes = [ + index.name for index in self.scope.search_indexes().get_all_indexes() + ] + if self.index_name not in all_indexes: + raise ValueError( + f"Index {self.index_name} does not exist. " + " Please create the index before searching." + ) + else: + all_indexes = [ + index.name for index in self.cluster.search_indexes().get_all_indexes() + ] + if self.index_name not in all_indexes: + raise ValueError( + f"Index {self.index_name} does not exist. " + " Please create the index before searching." + ) + + return True diff --git a/autorag-workspace/autorag/vectordb/milvus.py b/autorag-workspace/autorag/vectordb/milvus.py new file mode 100644 index 0000000..6e64b62 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/milvus.py @@ -0,0 +1,168 @@ +import logging +from typing import Any, Dict, List, Tuple, Optional, Union + +from pymilvus import ( + DataType, + FieldSchema, + CollectionSchema, + connections, + Collection, + MilvusException, +) +from pymilvus.orm import utility + +from autorag.utils.util import apply_recursive +from autorag.vectordb import BaseVectorStore + + +logger = logging.getLogger("AutoRAG") + + +class Milvus(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + collection_name: str, + embedding_batch: int = 100, + similarity_metric: str = "cosine", + index_type: str = "IVF_FLAT", + uri: str = "http://localhost:19530", + db_name: str = "", + token: str = "", + user: str = "", + password: str = "", + timeout: Optional[float] = None, + params: Dict[str, Any] = {}, + ): + super().__init__(embedding_model, similarity_metric, embedding_batch) + + # Connect to Milvus server + connections.connect( + "default", + uri=uri, + token=token, + db_name=db_name, + user=user, + password=password, + ) + self.collection_name = collection_name + self.timeout = timeout + self.params = params + self.index_type = index_type + + # Set Collection + if not utility.has_collection(collection_name, timeout=timeout): + # Get the dimension of the embeddings + test_embedding_result: List[float] = self.embedding.get_query_embedding( + "test" + ) + dimension = len(test_embedding_result) + + pk = FieldSchema( + name="id", + dtype=DataType.VARCHAR, + max_length=128, + is_primary=True, + auto_id=False, + ) + field = FieldSchema( + name="vector", dtype=DataType.FLOAT_VECTOR, dim=dimension + ) + schema = CollectionSchema(fields=[pk, field]) + + self.collection = Collection(name=self.collection_name, schema=schema) + index_params = { + "metric_type": self.similarity_metric.upper(), + "index_type": self.index_type.upper(), + "params": self.params, + } + self.collection.create_index( + field_name="vector", index_params=index_params, timeout=self.timeout + ) + else: + self.collection = Collection(name=self.collection_name) + + async def add(self, ids: List[str], texts: List[str]): + texts = self.truncated_inputs(texts) + text_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(texts) + + # make data for insertion + data = list( + map(lambda _id, vector: {"id": _id, "vector": vector}, ids, text_embeddings) + ) + + # Insert data into the collection + res = self.collection.insert(data=data, timeout=self.timeout) + assert ( + res.insert_count == len(ids) + ), f"Insertion failed. Try to insert {len(ids)} but only {res['insert_count']} inserted." 
+ + self.collection.flush(timeout=self.timeout) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + + self.collection.load(timeout=self.timeout) + + # Perform similarity search + results = self.collection.search( + data=query_embeddings, + limit=top_k, + anns_field="vector", + param={"metric_type": self.similarity_metric.upper()}, + timeout=self.timeout, + **kwargs, + ) + + # Extract IDs and distances + ids = [[str(hit.id) for hit in result] for result in results] + distances = [[hit.distance for hit in result] for result in results] + + if self.similarity_metric in ["l2"]: + distances = apply_recursive(lambda x: -x, distances) + + return ids, distances + + async def fetch(self, ids: List[str]) -> List[List[float]]: + try: + self.collection.load(timeout=self.timeout) + except MilvusException as e: + logger.warning(f"Failed to load collection: {e}") + return [[]] * len(ids) + # Fetch vectors by IDs + results = self.collection.query( + expr=f"id in {ids}", output_fields=["id", "vector"], timeout=self.timeout + ) + id_vector_dict = {str(result["id"]): result["vector"] for result in results} + result = [id_vector_dict[_id] for _id in ids] + return result + + async def is_exist(self, ids: List[str]) -> List[bool]: + try: + self.collection.load(timeout=self.timeout) + except MilvusException: + return [False] * len(ids) + # Check the existence of IDs + results = self.collection.query( + expr=f"id in {ids}", output_fields=["id"], timeout=self.timeout + ) + # Determine existence + existing_ids = {str(result["id"]) for result in results} + return [str(_id) in existing_ids for _id in ids] + + async def delete(self, ids: List[str]): + # Delete entries by IDs + self.collection.delete(expr=f"id in {ids}", timeout=self.timeout) + + def delete_collection(self): + # Delete the collection + self.collection.release(timeout=self.timeout) + self.collection.drop_index(timeout=self.timeout) + self.collection.drop(timeout=self.timeout) diff --git a/autorag-workspace/autorag/vectordb/pinecone.py b/autorag-workspace/autorag/vectordb/pinecone.py new file mode 100644 index 0000000..a96c403 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/pinecone.py @@ -0,0 +1,119 @@ +import logging + +from pinecone.grpc import PineconeGRPC as Pinecone_client +from pinecone import ServerlessSpec + +from typing import List, Optional, Tuple, Union + +from autorag.utils.util import make_batch, apply_recursive +from autorag.vectordb import BaseVectorStore + +logger = logging.getLogger("AutoRAG") + + +class Pinecone(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + index_name: str, + embedding_batch: int = 100, + dimension: int = 1536, + similarity_metric: str = "cosine", # "cosine", "dotproduct", "euclidean" + cloud: Optional[str] = "aws", + region: Optional[str] = "us-east-1", + api_key: Optional[str] = None, + deletion_protection: Optional[str] = "disabled", # "enabled" or "disabled" + namespace: Optional[str] = "default", + ingest_batch: int = 200, + ): + super().__init__(embedding_model, similarity_metric, embedding_batch) + + self.index_name = index_name + self.namespace = namespace + self.ingest_batch = ingest_batch + + self.client = Pinecone_client(api_key=api_key) + + if similarity_metric == "ip": + similarity_metric = "dotproduct" + elif similarity_metric == "l2": + 
similarity_metric = "euclidean" + + if not self.client.has_index(index_name): + self.client.create_index( + name=index_name, + dimension=dimension, + metric=similarity_metric, + spec=ServerlessSpec( + cloud=cloud, + region=region, + ), + deletion_protection=deletion_protection, + ) + self.index = self.client.Index(index_name) + + async def add(self, ids: List[str], texts: List[str]): + texts = self.truncated_inputs(texts) + text_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(texts) + + vector_tuples = list(zip(ids, text_embeddings)) + batch_vectors = make_batch(vector_tuples, self.ingest_batch) + + async_res = [ + self.index.upsert( + vectors=batch_vector_tuples, + namespace=self.namespace, + async_req=True, + ) + for batch_vector_tuples in batch_vectors + ] + # Wait for the async requests to finish + [async_result.result() for async_result in async_res] + + async def fetch(self, ids: List[str]) -> List[List[float]]: + results = self.index.fetch(ids=ids, namespace=self.namespace) + id_vector_dict = { + str(key): val["values"] for key, val in results["vectors"].items() + } + result = [id_vector_dict[_id] for _id in ids] + return result + + async def is_exist(self, ids: List[str]) -> List[bool]: + fetched_result = self.index.fetch(ids=ids, namespace=self.namespace) + existed_ids = list(map(str, fetched_result.get("vectors", {}).keys())) + return list(map(lambda x: x in existed_ids, ids)) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + + ids, scores = [], [] + for query_embedding in query_embeddings: + response = self.index.query( + vector=query_embedding, + top_k=top_k, + include_values=True, + namespace=self.namespace, + ) + + ids.append([o.id for o in response.matches]) + scores.append([o.score for o in response.matches]) + + if self.similarity_metric in ["l2"]: + scores = apply_recursive(lambda x: -x, scores) + + return ids, scores + + async def delete(self, ids: List[str]): + # Delete entries by IDs + self.index.delete(ids=ids, namespace=self.namespace) + + def delete_index(self): + # Delete the index + self.client.delete_index(self.index_name) diff --git a/autorag-workspace/autorag/vectordb/qdrant.py b/autorag-workspace/autorag/vectordb/qdrant.py new file mode 100644 index 0000000..d54e35d --- /dev/null +++ b/autorag-workspace/autorag/vectordb/qdrant.py @@ -0,0 +1,153 @@ +import logging + +from qdrant_client import QdrantClient +from qdrant_client.models import ( + Distance, + VectorParams, + PointStruct, + PointIdsList, + HasIdCondition, + Filter, + SearchRequest, +) + +from typing import List, Tuple, Union + +from autorag.vectordb import BaseVectorStore + +logger = logging.getLogger("AutoRAG") + + +class Qdrant(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + collection_name: str, + embedding_batch: int = 100, + similarity_metric: str = "cosine", + client_type: str = "docker", + url: str = "http://localhost:6333", + host: str = "", + api_key: str = "", + dimension: int = 1536, + ingest_batch: int = 64, + parallel: int = 1, + max_retries: int = 3, + ): + super().__init__(embedding_model, similarity_metric, embedding_batch) + + self.collection_name = collection_name + self.ingest_batch = ingest_batch + self.parallel = parallel + self.max_retries = max_retries + + if similarity_metric == "cosine": 
+ distance = Distance.COSINE + elif similarity_metric == "ip": + distance = Distance.DOT + elif similarity_metric == "l2": + distance = Distance.EUCLID + else: + raise ValueError( + f"similarity_metric {similarity_metric} is not supported\n" + "supported similarity metrics are: cosine, ip, l2" + ) + + if client_type == "docker": + self.client = QdrantClient( + url=url, + ) + elif client_type == "cloud": + self.client = QdrantClient( + host=host, + api_key=api_key, + ) + else: + raise ValueError( + f"client_type {client_type} is not supported\n" + "supported client types are: docker, cloud" + ) + + if not self.client.collection_exists(collection_name): + self.client.create_collection( + collection_name, + vectors_config=VectorParams( + size=dimension, + distance=distance, + ), + ) + self.collection = self.client.get_collection(collection_name) + + async def add(self, ids: List[str], texts: List[str]): + texts = self.truncated_inputs(texts) + text_embeddings = await self.embedding.aget_text_embedding_batch(texts) + + points = list( + map(lambda x: PointStruct(id=x[0], vector=x[1]), zip(ids, text_embeddings)) + ) + + self.client.upload_points( + collection_name=self.collection_name, + points=points, + batch_size=self.ingest_batch, + parallel=self.parallel, + max_retries=self.max_retries, + wait=True, + ) + + async def fetch(self, ids: List[str]) -> List[List[float]]: + # Fetch vectors by IDs + fetched_results = self.client.retrieve( + collection_name=self.collection_name, + ids=ids, + with_vectors=True, + ) + return list(map(lambda x: x.vector, fetched_results)) + + async def is_exist(self, ids: List[str]) -> List[bool]: + existed_result = self.client.scroll( + collection_name=self.collection_name, + scroll_filter=Filter( + must=[ + HasIdCondition(has_id=ids), + ], + ), + ) + # existed_result is tuple. 
So we use existed_result[0] to get list of Record + existed_ids = list(map(lambda x: x.id, existed_result[0])) + return list(map(lambda x: x in existed_ids, ids)) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + + search_queries = list( + map( + lambda x: SearchRequest(vector=x, limit=top_k, with_vector=True), + query_embeddings, + ) + ) + + search_result = self.client.search_batch( + collection_name=self.collection_name, requests=search_queries + ) + + # Extract IDs and distances + ids = [[str(hit.id) for hit in result] for result in search_result] + scores = [[hit.score for hit in result] for result in search_result] + + return ids, scores + + async def delete(self, ids: List[str]): + self.client.delete( + collection_name=self.collection_name, + points_selector=PointIdsList(points=ids), + ) + + def delete_collection(self): + # Delete the collection + self.client.delete_collection(self.collection_name) diff --git a/autorag-workspace/autorag/vectordb/weaviate.py b/autorag-workspace/autorag/vectordb/weaviate.py new file mode 100644 index 0000000..a044889 --- /dev/null +++ b/autorag-workspace/autorag/vectordb/weaviate.py @@ -0,0 +1,167 @@ +import logging + +import weaviate +from weaviate.classes.init import Auth +from weaviate.classes.config import Property, DataType +import weaviate.classes as wvc +from weaviate.classes.query import MetadataQuery + +from typing import List, Optional, Tuple, Union + +from autorag.vectordb import BaseVectorStore + +logger = logging.getLogger("AutoRAG") + + +class Weaviate(BaseVectorStore): + def __init__( + self, + embedding_model: Union[str, List[dict]], + collection_name: str, + embedding_batch: int = 100, + similarity_metric: str = "cosine", + client_type: str = "docker", + host: str = "localhost", + port: int = 8080, + grpc_port: int = 50051, + url: Optional[str] = None, + api_key: Optional[str] = None, + text_key: str = "content", + ): + super().__init__(embedding_model, similarity_metric, embedding_batch) + + self.text_key = text_key + + if client_type == "docker": + self.client = weaviate.connect_to_local( + host=host, + port=port, + grpc_port=grpc_port, + ) + elif client_type == "cloud": + self.client = weaviate.connect_to_weaviate_cloud( + cluster_url=url, + auth_credentials=Auth.api_key(api_key), + ) + else: + raise ValueError( + f"client_type {client_type} is not supported\n" + "supported client types are: docker, cloud" + ) + if similarity_metric == "cosine": + distance_metric = wvc.config.VectorDistances.COSINE + elif similarity_metric == "ip": + distance_metric = wvc.config.VectorDistances.DOT + elif similarity_metric == "l2": + distance_metric = wvc.config.VectorDistances.L2_SQUARED + else: + raise ValueError( + f"similarity_metric {similarity_metric} is not supported\n" + "supported similarity metrics are: cosine, ip, l2" + ) + + if not self.client.collections.exists(collection_name): + self.client.collections.create( + collection_name, + properties=[ + Property( + name="content", data_type=DataType.TEXT, skip_vectorization=True + ), + ], + vectorizer_config=wvc.config.Configure.Vectorizer.none(), + vector_index_config=wvc.config.Configure.VectorIndex.hnsw( # hnsw, flat, dynamic, + distance_metric=distance_metric + ), + ) + self.collection = self.client.collections.get(collection_name) + self.collection_name = collection_name + + 
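# --- Illustrative configuration sketch (not part of this commit) -------------
# A hypothetical example of how a `vectordb:` entry in a project YAML could be
# resolved to the Weaviate store defined in this module. The file name, entry
# name, and connection settings below are assumptions for illustration only.
#
# vectordb:
#   - name: my_weaviate
#     db_type: weaviate             # resolved through get_support_vectordb
#     embedding_model: openai
#     collection_name: autorag_docs
#     client_type: docker
#     host: localhost
#     port: 8080
#     grpc_port: 50051

from autorag.vectordb import load_vectordb_from_yaml

# "name" and "db_type" are popped by load_vectordb_from_yaml; every remaining
# key is passed straight into Weaviate.__init__ as a keyword argument.
store = load_vectordb_from_yaml(
    yaml_path="config.yaml",        # hypothetical config file
    vectordb_name="my_weaviate",
    project_dir="./project",        # only used for the "default" chroma fallback
)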
async def add(self, ids: List[str], texts: List[str]): + texts = self.truncated_inputs(texts) + text_embeddings = await self.embedding.aget_text_embedding_batch(texts) + + with self.client.batch.dynamic() as batch: + for i, text in enumerate(texts): + data_properties = {self.text_key: text} + + batch.add_object( + collection=self.collection_name, + properties=data_properties, + uuid=ids[i], + vector=text_embeddings[i], + ) + + failed_objs = self.client.batch.failed_objects + for obj in failed_objs: + err_message = ( + f"Failed to add object: {obj.original_uuid}\nReason: {obj.message}" + ) + + logger.error(err_message) + + async def fetch(self, ids: List[str]) -> List[List[float]]: + # Fetch vectors by IDs + results = self.collection.query.fetch_objects( + filters=wvc.query.Filter.by_property("_id").contains_any(ids), + include_vector=True, + ) + id_vector_dict = { + str(object.uuid): object.vector["default"] for object in results.objects + } + result = [id_vector_dict[_id] for _id in ids] + return result + + async def is_exist(self, ids: List[str]) -> List[bool]: + fetched_result = self.collection.query.fetch_objects( + filters=wvc.query.Filter.by_property("_id").contains_any(ids), + ) + existed_ids = [str(result.uuid) for result in fetched_result.objects] + return list(map(lambda x: x in existed_ids, ids)) + + async def query( + self, queries: List[str], top_k: int, **kwargs + ) -> Tuple[List[List[str]], List[List[float]]]: + queries = self.truncated_inputs(queries) + query_embeddings: List[ + List[float] + ] = await self.embedding.aget_text_embedding_batch(queries) + + ids, scores = [], [] + for query_embedding in query_embeddings: + response = self.collection.query.near_vector( + near_vector=query_embedding, + limit=top_k, + return_metadata=MetadataQuery(distance=True), + ) + + ids.append([o.uuid for o in response.objects]) + scores.append( + [ + distance_to_score(o.metadata.distance, self.similarity_metric) + for o in response.objects + ] + ) + + return ids, scores + + async def delete(self, ids: List[str]): + filter = wvc.query.Filter.by_id().contains_any(ids) + self.collection.data.delete_many(where=filter) + + def delete_collection(self): + # Delete the collection + self.client.collections.delete(self.collection_name) + + +def distance_to_score(distance: float, similarity_metric) -> float: + if similarity_metric == "cosine": + return 1 - distance + elif similarity_metric == "ip": + return -distance + elif similarity_metric == "l2": + return -distance + else: + raise ValueError( + f"similarity_metric {similarity_metric} is not supported\n" + "supported similarity metrics are: cosine, ip, l2" + ) diff --git a/autorag-workspace/autorag/web.py b/autorag-workspace/autorag/web.py new file mode 100644 index 0000000..05f68c3 --- /dev/null +++ b/autorag-workspace/autorag/web.py @@ -0,0 +1,81 @@ +from typing import Optional + +import click +import streamlit as st + +from autorag.deploy import Runner + + +def get_runner( + yaml_path: Optional[str], project_dir: Optional[str], trial_path: Optional[str] +): + if not yaml_path and not trial_path: + raise ValueError("yaml_path or trial_path must be given.") + elif yaml_path and trial_path: + raise ValueError("yaml_path and trial_path cannot be given at the same time.") + elif yaml_path: + return Runner.from_yaml(yaml_path, project_dir=project_dir) + elif trial_path: + return Runner.from_trial_folder(trial_path) + + +def set_initial_state(): + if "messages" not in st.session_state: + st.session_state["messages"] = [ + { + "role": "assistant", + 
"content": "Welcome !", + } + ] + + +def set_page_config(): + st.set_page_config( + page_title="AutoRAG", + page_icon="🤖", + layout="wide", + initial_sidebar_state="expanded", + menu_items={ + "Get help": "https://github.com/Marker-Inc-Korea/AutoRAG/discussions", + "Report a bug": "https://github.com/Marker-Inc-Korea/AutoRAG/issues", + }, + ) + + +def set_page_header(): + st.header("📚 AutoRAG", anchor=False) + st.caption("Input a question and get an answer from the given documents. ") + + +def chat_box(runner: Runner): + if query := st.chat_input("How can I help?"): + # Add the user input to messages state + st.session_state["messages"].append({"role": "user", "content": query}) + with st.chat_message("user"): + st.markdown(query) + + # Generate llama-index stream with user input + with st.chat_message("assistant"): + with st.spinner("Processing..."): + response = st.write(runner.run(query)) + + # Add the final response to messages state + st.session_state["messages"].append({"role": "assistant", "content": response}) + + +@click.command() +@click.option("--yaml_path", type=str, help="Path to the YAML file.") +@click.option("--project_dir", type=str, help="Path to the project directory.") +@click.option("--trial_path", type=str, help="Path to the trial directory.") +def run_web_server( + yaml_path: Optional[str], project_dir: Optional[str], trial_path: Optional[str] +): + runner = get_runner(yaml_path, project_dir, trial_path) + set_initial_state() + set_page_config() + set_page_header() + chat_box(runner) + + +if __name__ == "__main__": + run_web_server() diff --git a/autorag-workspace/dashboard.sh b/autorag-workspace/dashboard.sh new file mode 100644 index 0000000..cbc2e58 --- /dev/null +++ b/autorag-workspace/dashboard.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export BOKEH_ALLOW_WS_ORIGIN="localhost:7690,172.16.9.208:7690" + +python -m autorag.cli dashboard \ + --trial_dir ../projects/daesan-dangjin_01/benchmark/1 + +echo "📊 AutoRAG 대시보드 http://localhost:7690/..." diff --git a/autorag-workspace/example/sample_config/chunk/chunk_full.yaml b/autorag-workspace/example/sample_config/chunk/chunk_full.yaml new file mode 100644 index 0000000..429daef --- /dev/null +++ b/autorag-workspace/example/sample_config/chunk/chunk_full.yaml @@ -0,0 +1,32 @@ +modules: + - module_type: llama_index_chunk + chunk_method: [ Token, Sentence ] + chunk_size: [ 1024, 512 ] + chunk_overlap: 24 + add_file_name: en + - module_type: llama_index_chunk + chunk_method: [ SentenceWindow ] + window_size: 3 + add_file_name: en + - module_type: llama_index_chunk + chunk_method: [ Semantic_llama_index ] + embed_model: openai + buffer_size: 1 + breakpoint_percentile_threshold: 95 + add_file_name: en + - module_type: llama_index_chunk + chunk_method: [ SemanticDoubleMerging ] + add_file_name: en + - module_type: llama_index_chunk + chunk_method: [ SimpleFile ] + add_file_name: en + - module_type: langchain_chunk + chunk_method: sentencetransformerstoken + - module_type: langchain_chunk + chunk_method: recursivecharacter + separators: [ " ", "\n" ] + - module_type: langchain_chunk + chunk_method: character + separator: ". 
" + - module_type: langchain_chunk + chunk_method: Konlpy diff --git a/autorag-workspace/example/sample_config/chunk/chunk_ko.yaml b/autorag-workspace/example/sample_config/chunk/chunk_ko.yaml new file mode 100644 index 0000000..322a971 --- /dev/null +++ b/autorag-workspace/example/sample_config/chunk/chunk_ko.yaml @@ -0,0 +1,19 @@ +modules: + - module_type: llama_index_chunk + chunk_method: [ Token, Sentence ] + chunk_size: [ 1024, 512 ] + chunk_overlap: 24 + add_file_name: ko + - module_type: llama_index_chunk + chunk_method: [ SentenceWindow ] + sentence_splitter: kiwi + add_file_name: ko + - module_type: llama_index_chunk + chunk_method: [ Semantic_llama_index ] + embed_model: openai + add_file_name: ko + - module_type: llama_index_chunk + chunk_method: [ SimpleFile ] + add_file_name: ko + - module_type: langchain_chunk + chunk_method: KonlpyTextSplitter diff --git a/autorag-workspace/example/sample_config/chunk/simple_chunk.yaml b/autorag-workspace/example/sample_config/chunk/simple_chunk.yaml new file mode 100644 index 0000000..3f65aeb --- /dev/null +++ b/autorag-workspace/example/sample_config/chunk/simple_chunk.yaml @@ -0,0 +1,3 @@ +modules: + - module_type: llama_index_chunk + chunk_method: Token diff --git a/autorag-workspace/example/sample_config/parse/all_files_full.yaml b/autorag-workspace/example/sample_config/parse/all_files_full.yaml new file mode 100644 index 0000000..9cdbf7e --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/all_files_full.yaml @@ -0,0 +1,25 @@ +# You can use only one of the following modules at a time. +modules: + # Use Directory Parse + - module_type: langchain_parse + file_type: all_files + parse_method: directory + # Use Unstructured + - module_type: langchain_parse + file_type: all_files + parse_method: unstructured + # Use Upsatge Document Parse + - module_type: langchain_parse + file_type: all_files + parse_method: upstagedocumentparse + # Use Naver Clova OCR + - module_type: clova + file_type: all_files + table_detection: true + # Use Llama Parse + - module_type: llamaparse + file_type: all_files + result_type: markdown + language: ko + use_vendor_multimodal_model: true + vendor_multimodal_model_name: openai-gpt-4o-mini diff --git a/autorag-workspace/example/sample_config/parse/file_types_full.yaml b/autorag-workspace/example/sample_config/parse/file_types_full.yaml new file mode 100644 index 0000000..180842b --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/file_types_full.yaml @@ -0,0 +1,26 @@ +modules: + # PDF + - module_type: langchain_parse + file_type: pdf + parse_method: pdfminer + # CSV + - module_type: langchain_parse + file_type: csv + parse_method: csv + # JSON + - module_type: langchain_parse + file_type: json + parse_method: json + jq_schema: .content + # Markdown + - module_type: langchain_parse + file_type: md + parse_method: unstructuredmarkdown + # HTML + - module_type: langchain_parse + file_type: html + parse_method: bshtml + # XML + - module_type: langchain_parse + file_type: xml + parse_method: unstructuredxml diff --git a/autorag-workspace/example/sample_config/parse/parse_hybird.yaml b/autorag-workspace/example/sample_config/parse/parse_hybird.yaml new file mode 100644 index 0000000..1ea4c7f --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/parse_hybird.yaml @@ -0,0 +1,12 @@ +modules: + - module_type: table_hybrid_parse + file_type: pdf + text_parse_module: langchain_parse + text_params: + parse_method: pdfplumber + table_parse_module: llamaparse + table_params: + result_type: 
markdown + language: ko + use_vendor_multimodal_model: true + vendor_multimodal_model_name: openai-gpt-4o-mini diff --git a/autorag-workspace/example/sample_config/parse/parse_ko.yaml b/autorag-workspace/example/sample_config/parse/parse_ko.yaml new file mode 100644 index 0000000..b216046 --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/parse_ko.yaml @@ -0,0 +1,11 @@ +modules: + - module_type: llama_parse + file_type: all_files + result_type: markdown + language: ko + - module_type: clova + file_type: all_files + table_detection: true + - module_type: langchain_parse + file_type: all_files + parse_method: upstagedocumentparse diff --git a/autorag-workspace/example/sample_config/parse/parse_multimodal.yaml b/autorag-workspace/example/sample_config/parse/parse_multimodal.yaml new file mode 100644 index 0000000..9ced86e --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/parse_multimodal.yaml @@ -0,0 +1,8 @@ +modules: + - module_type: llamaparse + file_type: all_files + result_type: markdown + language: ko + use_vendor_multimodal_model: true + vendor_multimodal_model_name: openai-gpt-4o-mini + use_own_key: true diff --git a/autorag-workspace/example/sample_config/parse/parse_ocr.yaml b/autorag-workspace/example/sample_config/parse/parse_ocr.yaml new file mode 100644 index 0000000..4c0b757 --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/parse_ocr.yaml @@ -0,0 +1,10 @@ +modules: + - module_type: langchain_parse + file_type: all_files + parse_method: upstagedocumentparse + - module_type: llama_parse + file_type: all_files + result_type: markdown + - module_type: clova + file_type: all_files + table_detection: true diff --git a/autorag-workspace/example/sample_config/parse/simple_parse.yaml b/autorag-workspace/example/sample_config/parse/simple_parse.yaml new file mode 100644 index 0000000..73ae58a --- /dev/null +++ b/autorag-workspace/example/sample_config/parse/simple_parse.yaml @@ -0,0 +1,4 @@ +modules: + - module_type: langchain_parse + file_type: pdf + parse_method: pdfminer diff --git a/autorag-workspace/example/sample_config/rag/english/gpu/compact_local.yaml b/autorag-workspace/example/sample_config/rag/english/gpu/compact_local.yaml new file mode 100644 index 0000000..6f66932 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu/compact_local.yaml @@ -0,0 +1,50 @@ +vectordb: + - name: mpnet_base_chroma + db_type: chroma + client_type: persistent + embedding_model: huggingface_all_mpnet_base_v2 + collection_name: huggingface_all_mpnet_base_v2 + path: ${PROJECT_DIR}/data/chroma +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + top_k: 20 + modules: + - module_type: bm25 + - module_type: vectordb + vectordb: mpnet_base_chroma + - module_type: hybrid_rrf + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + - node_type: passage_reranker + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + top_k: 3 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: upr +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: [bleu, meteor, rouge, sem_score] + generator_modules: + - module_type: vllm + llm: mistralai/Mistral-7B-Instruct-v0.2 + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: 
{retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: [bleu, meteor, rouge, sem_score] + modules: + - module_type: vllm + llm: mistralai/Mistral-7B-Instruct-v0.2 + temperature: [0.1, 0.5, 1.1] diff --git a/autorag-workspace/example/sample_config/rag/english/gpu/compact_openai.yaml b/autorag-workspace/example/sample_config/rag/english/gpu/compact_openai.yaml new file mode 100644 index 0000000..26f2309 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu/compact_openai.yaml @@ -0,0 +1,101 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: rankgpt + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." 
# Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/gpu/full.yaml b/autorag-workspace/example/sample_config/rag/english/gpu/full.yaml new file mode 100644 index 0000000..dd1da67 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu/full.yaml @@ -0,0 +1,154 @@ +vectordb: + - name: chroma_bge_m3 + db_type: chroma + client_type: persistent + embedding_model: huggingface_bge_m3 + collection_name: openai + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: chroma_bge_m3 + modules: + - module_type: pass_query_expansion + - module_type: query_decompose + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + max_token: 64 + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + - module_type: vectordb + vectordb: chroma_bge_m3 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: rankgpt + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - 
module_type: openvino_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: recency_filter + threshold_datetime: 2015-01-01 3:45:07 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"] + - module_type: long_context_reorder + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - module_type: window_replacement + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - node_type: generator + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval # LLM Judge Metric. 
Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0, 1.5] + - module_type: openai_llm + llm: gpt-4o-mini + temperature: 0.8 diff --git a/autorag-workspace/example/sample_config/rag/english/gpu/half.yaml b/autorag-workspace/example/sample_config/rag/english/gpu/half.yaml new file mode 100644 index 0000000..9f25628 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu/half.yaml @@ -0,0 +1,121 @@ +vectordb: + - name: chroma_bge_m3 + db_type: chroma + client_type: persistent + embedding_model: huggingface_bge_m3 + collection_name: openai + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: chroma_bge_m3 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: rankgpt + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. 
\n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/gpu_api/compact.yaml b/autorag-workspace/example/sample_config/rag/english/gpu_api/compact.yaml new file mode 100644 index 0000000..fc286b9 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu_api/compact.yaml @@ -0,0 +1,105 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: cohere_reranker + - module_type: rankgpt + - module_type: jina_reranker + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: voyageai_reranker + - module_type: mixedbreadai_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - 
metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/gpu_api/full.yaml b/autorag-workspace/example/sample_config/rag/english/gpu_api/full.yaml new file mode 100644 index 0000000..a7435af --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu_api/full.yaml @@ -0,0 +1,151 @@ +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + modules: + - module_type: pass_query_expansion + - module_type: query_decompose + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + max_token: 64 + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [retrieval_f1, 
retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: cohere_reranker + - module_type: rankgpt + - module_type: jina_reranker + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: voyageai_reranker + - module_type: mixedbreadai_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: recency_filter + threshold_datetime: 2015-01-01 3:45:07 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"] + - module_type: long_context_reorder + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - module_type: window_replacement + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - node_type: generator + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval # LLM Judge Metric. 
Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0, 1.5] + - module_type: openai_llm + llm: gpt-4o-mini + temperature: 0.8 diff --git a/autorag-workspace/example/sample_config/rag/english/gpu_api/half.yaml b/autorag-workspace/example/sample_config/rag/english/gpu_api/half.yaml new file mode 100644 index 0000000..75e1a0c --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/gpu_api/half.yaml @@ -0,0 +1,118 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: cohere_reranker + - module_type: rankgpt + - module_type: jina_reranker + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: voyageai_reranker + - module_type: mixedbreadai_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. 
Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/compact.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/compact.yaml new file mode 100644 index 0000000..63ad754 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/compact.yaml @@ -0,0 +1,83 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." 
# concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/full.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/full.yaml new file mode 100644 index 0000000..ba18f58 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/full.yaml @@ -0,0 +1,129 @@ +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + modules: + - module_type: pass_query_expansion + - module_type: query_decompose + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + max_token: 64 + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + - module_type: vectordb + embedding_model: openai + embedding_batch: 256 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: recency_filter + threshold_datetime: 2015-01-01 3:45:07 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini 
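+ # --- Editor's note (not part of the original sample config) ---
+ # This config references "vectordb: default" in the query_expansion node without declaring a top-level vectordb block.
+ # AutoRAG appears to fall back to a built-in default collection in that case; if you prefer to declare it explicitly,
+ # the commented sketch below mirrors the declaration used by rag/extracted_sample.yaml later in this workspace.
+ # The exact keys and path are assumptions -- adjust them to your project layout.
+ # vectordb:
+ #   - name: default
+ #     db_type: chroma
+ #     client_type: persistent
+ #     embedding_model: openai
+ #     collection_name: openai
+ #     path: ${PROJECT_DIR}/data/chroma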
+- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"] + - module_type: long_context_reorder + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - module_type: window_replacement + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - node_type: generator + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval # LLM Judge Metric. Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0, 1.5] + - module_type: openai_llm + llm: gpt-4o-mini + temperature: 0.8 diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/half.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/half.yaml new file mode 100644 index 0000000..19aaf7a --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/half.yaml @@ -0,0 +1,95 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + - module_type: refine + llm: openai + model: gpt-4o-mini +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - 
metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - module_type: long_context_reorder + prompt: + - "Answer to given questions with the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:" + - "There is a passages related to user question. Please response carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step." # Zero-shot CoT prompt + - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise." # concise prompt + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_bedrock.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_bedrock.yaml new file mode 100644 index 0000000..e59fb63 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_bedrock.yaml @@ -0,0 +1,33 @@ +vectordb: + - name: mpnet_base_chroma + db_type: chroma + client_type: persistent + embedding_model: huggingface_all_mpnet_base_v2 + collection_name: huggingface_all_mpnet_base_v2 + path: ${PROJECT_DIR}/data/chroma +node_lines: + - node_line_name: retrieve_node_line + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + top_k: 3 + modules: + - module_type: vectordb + vectordb: mpnet_base_chroma + - node_line_name: post_retrieve_node_line + nodes: + - node_type: prompt_maker + strategy: + metrics: [ meteor, rouge, bert_score ] + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. 
\n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + - node_type: generator + strategy: + metrics: [ bleu, rouge, bert_score ] + modules: + - module_type: llama_index_llm + llm: bedrock + model: amazon.titan-text-express-v1 + profile_name: your_profile_name # Please replace this with your profile name diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_local.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_local.yaml new file mode 100644 index 0000000..f366b66 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_local.yaml @@ -0,0 +1,31 @@ +vectordb: + - name: baai_chroma + db_type: chroma + client_type: persistent + embedding_model: huggingface_baai_bge_small + collection_name: huggingface_baai_bge_small + path: ${PROJECT_DIR}/data/chroma +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + top_k: 3 + modules: + - module_type: vectordb + vectordb: baai_chroma +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: [ meteor, rouge, bert_score ] + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + - node_type: generator + strategy: + metrics: [ bleu, rouge, bert_score ] + modules: + - module_type: vllm + llm: mistralai/Mistral-7B-Instruct-v0.2 diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_ollama.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_ollama.yaml new file mode 100644 index 0000000..32eb8ca --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_ollama.yaml @@ -0,0 +1,34 @@ +vectordb: + - name: mpnet_base_chroma + db_type: chroma + client_type: persistent + embedding_model: huggingface_all_mpnet_base_v2 + collection_name: huggingface_all_mpnet_base_v2 + path: ${PROJECT_DIR}/data/chroma +node_lines: + - node_line_name: retrieve_node_line + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + top_k: 3 + modules: + - module_type: vectordb + vectordb: mpnet_base_chroma + - node_line_name: post_retrieve_node_line + nodes: + - node_type: prompt_maker + strategy: + metrics: [ meteor, rouge, bert_score ] + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. 
\n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + - node_type: generator + strategy: + metrics: [ bleu, rouge, bert_score ] + modules: + - module_type: llama_index_llm + llm: ollama + model: llama3 + batch: 1 + request_timeout: 100 # You can increase this value if your model is big (slow) diff --git a/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_openai.yaml b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_openai.yaml new file mode 100644 index 0000000..923c490 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/english/non_gpu/simple_openai.yaml @@ -0,0 +1,25 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + top_k: 3 + modules: + - module_type: vectordb + vectordb: default +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: [bleu, meteor, rouge] + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + - node_type: generator + strategy: + metrics: [bleu, rouge] + modules: + - module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] diff --git a/autorag-workspace/example/sample_config/rag/extracted_sample.yaml b/autorag-workspace/example/sample_config/rag/extracted_sample.yaml new file mode 100644 index 0000000..01f48ed --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/extracted_sample.yaml @@ -0,0 +1,47 @@ +vectordb: + - name: default + db_type: chroma + client_type: persistent + embedding_model: openai + collection_name: openai + path: ${PROJECT_DIR}/data/chroma +node_lines: +- node_line_name: retrieve_node_line + nodes: + - node_type: retrieval + modules: + - module_type: vectordb + vectordb: default + top_k: 3 + strategy: + metrics: + - retrieval_f1 + - retrieval_recall + - retrieval_precision +- node_line_name: post_retrieve_node_line + nodes: + - node_type: prompt_maker + modules: + - module_type: fstring + prompt: "Read the passages and answer the given question. 
\n Question: {query} \n Passage: {retrieved_contents} \n Answer : " + strategy: + generator_modules: + - batch: 2 + llm: openai + module_type: llama_index_llm + metrics: + - bleu + - meteor + - rouge + - node_type: generator + modules: + - batch: 2 + llm: openai + model: gpt-3.5-turbo-16k + module_type: llama_index_llm + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - embedding_model: openai + metric_name: sem_score diff --git a/autorag-workspace/example/sample_config/rag/full.yaml b/autorag-workspace/example/sample_config/rag/full.yaml new file mode 100644 index 0000000..92fd748 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/full.yaml @@ -0,0 +1,159 @@ +vectordb: + - name: chroma_large + db_type: chroma + client_type: persistent + embedding_model: openai_embed_3_large + collection_name: openai_embed_3_large + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, ko_kiwi, space, gpt2, ko_okt, ko_kkma, sudachipy ] + - module_type: vectordb + vectordb: chroma_large + modules: + - module_type: pass_query_expansion + - module_type: query_decompose + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-3.5-turbo-16k, gpt-3.5-turbo-1106 ] + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-3.5-turbo-16k ] + max_token: 64 + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + - module_type: vectordb + vectordb: chroma_large + embedding_batch: 256 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 5 + modules: + - module_type: pass_reranker + - module_type: tart + - module_type: monot5 + - module_type: upr + - module_type: cohere_reranker + - module_type: rankgpt + - module_type: jina_reranker + - module_type: colbert_reranker + - module_type: sentence_transformer_reranker + - module_type: flag_embedding_reranker + - module_type: flag_embedding_llm_reranker + - module_type: time_reranker + - module_type: openvino_reranker + - module_type: voyageai_reranker + - module_type: mixedbreadai_reranker + - module_type: flashrank_reranker + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - 
module_type: recency_filter + threshold_datetime: 2015-01-01 3:45:07 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-3.5-turbo-16k + - module_type: refine + llm: openai + model: gpt-3.5-turbo-16k + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-3.5-turbo-16k, gpt-3.5-turbo-1106] + modules: + - module_type: fstring + prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"] + - module_type: long_context_reorder + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - module_type: window_replacement + prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}", + "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ] + - node_type: generator + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + - metric_name: g_eval + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-3.5-turbo-16k, gpt-3.5-turbo-1106] + temperature: [0.5, 1.0, 1.5] + - module_type: openai_llm + llm: gpt-3.5-turbo + temperature: 0.8 diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu/compact_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu/compact_korean.yaml new file mode 100644 index 0000000..3bcc84b --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu/compact_korean.yaml @@ -0,0 +1,93 @@ +vectordb: + - name: chroma_bge_m3 + db_type: chroma + client_type: persistent + embedding_model: huggingface_bge_m3 + collection_name: openai + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: chroma_bge_m3 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: pass_reranker + 
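+ # Editor's note (assumption, inferred from how the strategy blocks are used throughout these sample configs):
+ # each reranker module listed above is evaluated on the metrics declared in the strategy below, and the
+ # best-scoring module is kept at top_k 3. Other rerankers used in the English sample configs could presumably
+ # be added here in the same way, hardware and language support permitting, e.g.:
+ # - module_type: flag_embedding_reranker
+ # - module_type: upr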
strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu/full_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu/full_korean.yaml new file mode 100644 index 0000000..70bc4b2 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu/full_korean.yaml @@ -0,0 +1,157 @@ +vectordb: + - name: chroma_bge_m3 + db_type: chroma + client_type: persistent + embedding_model: huggingface_bge_m3 + collection_name: openai + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: chroma_bge_m3 + modules: + - module_type: pass_query_expansion + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + max_token: 64 + prompt: "질문에 답하기 위한 단락을 작성해 주세요." + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] + prompt: | + 당신은 인공지능 언어 모델 어시스턴트입니다. + 주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다. + 주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다. + 다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오. 
+ 원래 질문: {query} +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: chroma_bge_m3 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: pass_reranker + strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + {context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. + 정제된 답변: + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + - metric_name: g_eval # LLM Judge Metric. 
Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu/half_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu/half_korean.yaml new file mode 100644 index 0000000..a20c356 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu/half_korean.yaml @@ -0,0 +1,126 @@ +vectordb: + - name: chroma_bge_m3 + db_type: chroma + client_type: persistent + embedding_model: huggingface_bge_m3 + collection_name: openai + path: ${PROJECT_DIR}/resources/chroma +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: chroma_bge_m3 + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: pass_reranker + strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + {context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. 
+ 정제된 답변: + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu_api/compact_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu_api/compact_korean.yaml new file mode 100644 index 0000000..c60094d --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu_api/compact_korean.yaml @@ -0,0 +1,87 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: cohere_reranker # Set Environment Variable: COHERE_API_KEY + - module_type: pass_reranker + strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: 
{retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu_api/full_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu_api/full_korean.yaml new file mode 100644 index 0000000..5fe9f68 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu_api/full_korean.yaml @@ -0,0 +1,151 @@ +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + modules: + - module_type: pass_query_expansion + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai # + model: [ gpt-4o-mini ] # + max_token: 64 + prompt: "질문에 답하기 위한 단락을 작성해 주세요." + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] + prompt: | + 당신은 인공지능 언어 모델 어시스턴트입니다. + 주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다. + 주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다. + 다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오. + 원래 질문: {query} +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] # ko_kiwi, ko_okt + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: cohere_reranker # Set Environment Variable: COHERE_API_KEY + - module_type: pass_reranker + strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + 
{context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. + 정제된 답변: + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + - metric_name: g_eval # LLM Judge Metric. Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/gpu_api/half_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/gpu_api/half_korean.yaml new file mode 100644 index 0000000..965e9eb --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/gpu_api/half_korean.yaml @@ -0,0 +1,120 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_reranker + modules: + - module_type: koreranker + - module_type: flag_embedding_llm_reranker # Requires enough GPU resources + - module_type: cohere_reranker # Set Environment Variable: COHERE_API_KEY + - module_type: pass_reranker + strategy: + metrics: [ retrieval_recall, retrieval_precision, retrieval_map ] + top_k: 3 + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, 
retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + {context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. + 정제된 답변: + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/non_gpu/compact_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/non_gpu/compact_korean.yaml new file mode 100644 index 0000000..e10bba6 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/non_gpu/compact_korean.yaml @@ -0,0 +1,78 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + 
speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/non_gpu/full_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/non_gpu/full_korean.yaml new file mode 100644 index 0000000..e6b6a65 --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/non_gpu/full_korean.yaml @@ -0,0 +1,142 @@ +node_lines: +- node_line_name: pre_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: query_expansion + strategy: + metrics: [retrieval_f1, retrieval_recall, retrieval_precision] + speed_threshold: 10 + top_k: 10 + retrieval_modules: + - module_type: bm25 + bm25_tokenizer: [ porter_stemmer, space, gpt2 ] + - module_type: vectordb + vectordb: default + modules: + - module_type: pass_query_expansion + - module_type: hyde + generator_module_type: llama_index_llm + llm: openai + model: [ gpt-4o-mini ] + max_token: 64 + prompt: "질문에 답하기 위한 단락을 작성해 주세요." + - module_type: multi_query_expansion + generator_module_type: llama_index_llm + llm: openai + temperature: [ 0.2, 1.0 ] + prompt: | + 당신은 인공지능 언어 모델 어시스턴트입니다. + 주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다. + 주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다. + 다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오. 
+ 원래 질문: {query} +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + {context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. + 정제된 답변: + - module_type: longllmlingua +- node_line_name: post_retrieve_node_line # Arbitrary node line name + nodes: + - node_type: prompt_maker + strategy: + metrics: + - metric_name: bleu + - metric_name: meteor + - metric_name: rouge + - metric_name: sem_score + embedding_model: openai + speed_threshold: 10 + generator_modules: + - module_type: llama_index_llm + llm: openai + model: [gpt-4o-mini] + modules: + - module_type: fstring + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - module_type: long_context_reorder + prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"] + - node_type: generator + strategy: + metrics: + - metric_name: rouge + - embedding_model: openai + metric_name: sem_score + - metric_name: bert_score + lang: ko + - metric_name: g_eval # LLM Judge Metric. 
Default Model: gpt-4-turbo + speed_threshold: 10 + modules: + - module_type: llama_index_llm + llm: [openai] + model: [gpt-4o-mini] + temperature: [0.5, 1.0] diff --git a/autorag-workspace/example/sample_config/rag/korean/non_gpu/half_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/non_gpu/half_korean.yaml new file mode 100644 index 0000000..18d098f --- /dev/null +++ b/autorag-workspace/example/sample_config/rag/korean/non_gpu/half_korean.yaml @@ -0,0 +1,111 @@ +node_lines: +- node_line_name: retrieve_node_line # Arbitrary node line name + nodes: + - node_type: retrieval + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision, + retrieval_ndcg, retrieval_map, retrieval_mrr ] + speed_threshold: 10 + top_k: 10 + modules: + - module_type: bm25 + bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ] + - module_type: vectordb + vectordb: default + - module_type: hybrid_rrf + weight_range: (4,80) + - module_type: hybrid_cc + normalize_method: [ mm, tmm, z, dbsf ] + weight_range: (0.0, 1.0) + test_weight_size: 101 + - node_type: passage_augmenter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + top_k: 5 + embedding_model: openai + modules: + - module_type: pass_passage_augmenter + - module_type: prev_next_augmenter + mode: next + - node_type: passage_filter + strategy: + metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ] + speed_threshold: 5 + modules: + - module_type: pass_passage_filter + - module_type: similarity_threshold_cutoff + threshold: 0.85 + - module_type: similarity_percentile_cutoff + percentile: 0.6 + - module_type: threshold_cutoff + threshold: 0.85 + - module_type: percentile_cutoff + percentile: 0.6 + - node_type: passage_compressor + strategy: + metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] + speed_threshold: 10 + modules: + - module_type: pass_compressor + - module_type: tree_summarize + llm: openai + model: gpt-4o-mini + prompt: | + 여러 문맥 정보는 다음과 같습니다.\n + ---------------------\n + {context_str}\n + ---------------------\n + 사전 지식이 아닌 여러 정보가 주어졌습니다, + 질문에 대답하세요.\n + 질문: {query_str}\n + 답변: + - module_type: refine + llm: openai + model: gpt-4o-mini + prompt: | + 원래 질문은 다음과 같습니다: {query_str} + 기존 답변은 다음과 같습니다: {existing_answer} + 아래에서 기존 답변을 정제할 수 있는 기회가 있습니다. + (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다. + ------------ + {context_msg} + ------------ + 새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다. + 맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요. 
+            정제된 답변:
+        - module_type: longllmlingua
+- node_line_name: post_retrieve_node_line  # Arbitrary node line name
+  nodes:
+    - node_type: prompt_maker
+      strategy:
+        metrics:
+          - metric_name: bleu
+          - metric_name: meteor
+          - metric_name: rouge
+          - metric_name: sem_score
+            embedding_model: openai
+        speed_threshold: 10
+        generator_modules:
+          - module_type: llama_index_llm
+            llm: openai
+            model: [gpt-4o-mini]
+      modules:
+        - module_type: fstring
+          prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
+        - module_type: long_context_reorder
+          prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
+    - node_type: generator
+      strategy:
+        metrics:
+          - metric_name: rouge
+          - embedding_model: openai
+            metric_name: sem_score
+          - metric_name: bert_score
+            lang: ko
+        speed_threshold: 10
+      modules:
+        - module_type: llama_index_llm
+          llm: [openai]
+          model: [gpt-4o-mini]
+          temperature: [0.5, 1.0]
diff --git a/autorag-workspace/example/sample_config/rag/korean/non_gpu/simple_korean.yaml b/autorag-workspace/example/sample_config/rag/korean/non_gpu/simple_korean.yaml
new file mode 100644
index 0000000..465baf2
--- /dev/null
+++ b/autorag-workspace/example/sample_config/rag/korean/non_gpu/simple_korean.yaml
@@ -0,0 +1,30 @@
+node_lines:
+  - node_line_name: retrieve_node_line  # Arbitrary node line name
+    nodes:
+      - node_type: retrieval
+        strategy:
+          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
+          top_k: 3
+        modules:
+          - module_type: vectordb
+            vectordb: default
+  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
+    nodes:
+      - node_type: prompt_maker
+        strategy:
+          metrics: [ bleu, meteor, rouge ]
+        modules:
+          - module_type: fstring
+            prompt: "주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
+      - node_type: generator
+        strategy:
+          metrics:
+            - metric_name: rouge
+            - embedding_model: openai
+              metric_name: sem_score
+            - metric_name: bert_score
+              lang: ko
+        modules:
+          - module_type: llama_index_llm
+            llm: openai
+            model: [ gpt-4o-mini ]
diff --git a/autorag-workspace/example/sample_dataset/README.md b/autorag-workspace/example/sample_dataset/README.md
new file mode 100644
index 0000000..c249ad7
--- /dev/null
+++ b/autorag-workspace/example/sample_dataset/README.md
@@ -0,0 +1,25 @@
+# sample_dataset handling
+
+The sample_dataset folder does not include the `qa.parquet` and `corpus.parquet` files, which are too large to be uploaded directly to Git.
+
+To prepare and use the datasets available in the sample_dataset folder, specifically `triviaqa`, `hotpotqa`, `msmarco` and `eli5`, you can follow the methods outlined below.
+
+## Usage
+
+The example provided uses `triviaqa`, but the same approach applies to `msmarco`, `eli5` and `hotpotqa`.
+
+### 1. Run with a specified save path
+To execute the Python script from the terminal and save the dataset to a specified path, use the command:
+
+```bash
+python ./sample_dataset/triviaqa/load_triviaqa_dataset.py --save_path /path/to/save/dataset
+```
+This runs the `load_triviaqa_dataset.py` script located in the `./sample_dataset/triviaqa/` directory,
+using the `--save_path` argument to specify the dataset's save location.
+
+### 2.
Run without specifying a save path +If you run the script without the `--save_path` argument, the dataset will be saved to a default location, which is the directory containing the `load_triviaqa_dataset.py` file, essentially `./sample_dataset/triviaqa/`: +```bash +python ./sample_dataset/triviaqa/load_triviaqa_dataset.py +``` +This behavior allows for a straightforward execution without needing to specify a path, making it convenient for quick tests or when working directly within the target directory. diff --git a/autorag-workspace/example/sample_dataset/eli5/load_eli5_dataset.py b/autorag-workspace/example/sample_dataset/eli5/load_eli5_dataset.py new file mode 100644 index 0000000..69a07d0 --- /dev/null +++ b/autorag-workspace/example/sample_dataset/eli5/load_eli5_dataset.py @@ -0,0 +1,35 @@ +import os +import pathlib + +import click +from datasets import load_dataset + + +@click.command() +@click.option( + "--save_path", + type=str, + default=pathlib.PurePath(__file__).parent, + help="Path to save sample eli5 dataset.", +) +def load_eli5_dataset(save_path): + # set file path + file_path = "MarkrAI/eli5_sample_autorag" + + # load dataset + corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas() + qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas() + qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas() + + # save data + if os.path.exists(os.path.join(save_path, "corpus.parquet")) is True: + raise ValueError("corpus.parquet already exists") + if os.path.exists(os.path.join(save_path, "qa.parquet")) is True: + raise ValueError("qa.parquet already exists") + corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet")) + qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet")) + qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet")) + + +if __name__ == "__main__": + load_eli5_dataset() diff --git a/autorag-workspace/example/sample_dataset/hotpotqa/load_hotpotqa_dataset.py b/autorag-workspace/example/sample_dataset/hotpotqa/load_hotpotqa_dataset.py new file mode 100644 index 0000000..973d61c --- /dev/null +++ b/autorag-workspace/example/sample_dataset/hotpotqa/load_hotpotqa_dataset.py @@ -0,0 +1,35 @@ +import os +import pathlib + +import click +from datasets import load_dataset + + +@click.command() +@click.option( + "--save_path", + type=str, + default=pathlib.PurePath(__file__).parent, + help="Path to save sample hotpotqa dataset.", +) +def load_hotpotqa_dataset(save_path): + # set file path + file_path = "gnekt/hotpotqa_small_sample_autorag" + + # load dataset + corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas() + qa_validation_dataset = load_dataset(file_path, "qa")["validation"].to_pandas() + + # save corpus data + if os.path.exists(os.path.join(save_path, "corpus.parquet")) is True: + raise ValueError("corpus.parquet already exists") + if os.path.exists(os.path.join(save_path, "qa.parquet")) is True: + raise ValueError("qa.parquet already exists") + corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False) + qa_validation_dataset.to_parquet( + os.path.join(save_path, "qa_validation.parquet"), index=False + ) + + +if __name__ == "__main__": + load_hotpotqa_dataset() diff --git a/autorag-workspace/example/sample_dataset/msmarco/load_msmarco_dataset.py b/autorag-workspace/example/sample_dataset/msmarco/load_msmarco_dataset.py new file mode 100644 index 0000000..8a8abae --- /dev/null +++ 
b/autorag-workspace/example/sample_dataset/msmarco/load_msmarco_dataset.py
@@ -0,0 +1,37 @@
+import os
+import pathlib
+
+import click
+from datasets import load_dataset
+
+
+@click.command()
+@click.option(
+    "--save_path",
+    type=str,
+    default=pathlib.PurePath(__file__).parent,
+    help="Path to save sample msmarco dataset.",
+)
+def load_msmarco_dataset(save_path):
+    # set file path
+    file_path = "MarkrAI/msmarco_sample_autorag"
+
+    # load dataset
+    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
+    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
+    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()
+
+    # save corpus data
+    if os.path.exists(os.path.join(save_path, "corpus.parquet")) is True:
+        raise ValueError("corpus.parquet already exists")
+    if os.path.exists(os.path.join(save_path, "qa.parquet")) is True:
+        raise ValueError("qa.parquet already exists")
+    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
+    qa_train_dataset.to_parquet(
+        os.path.join(save_path, "qa_train.parquet"), index=False
+    )
+    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)
+
+
+if __name__ == "__main__":
+    load_msmarco_dataset()
diff --git a/autorag-workspace/example/sample_dataset/triviaqa/load_triviaqa_dataset.py b/autorag-workspace/example/sample_dataset/triviaqa/load_triviaqa_dataset.py
new file mode 100644
index 0000000..1067c17
--- /dev/null
+++ b/autorag-workspace/example/sample_dataset/triviaqa/load_triviaqa_dataset.py
@@ -0,0 +1,37 @@
+import os
+import pathlib
+
+import click
+from datasets import load_dataset
+
+
+@click.command()
+@click.option(
+    "--save_path",
+    type=str,
+    default=pathlib.PurePath(__file__).parent,
+    help="Path to save sample triviaqa dataset.",
+)
+def load_triviaqa_dataset(save_path):
+    # set file path
+    file_path = "MarkrAI/triviaqa_sample_autorag"
+
+    # load dataset
+    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
+    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
+    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()
+
+    # save corpus data
+    if os.path.exists(os.path.join(save_path, "corpus.parquet")) is True:
+        raise ValueError("corpus.parquet already exists")
+    if os.path.exists(os.path.join(save_path, "qa.parquet")) is True:
+        raise ValueError("qa.parquet already exists")
+    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
+    qa_train_dataset.to_parquet(
+        os.path.join(save_path, "qa_train.parquet"), index=False
+    )
+    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)
+
+
+if __name__ == "__main__":
+    load_triviaqa_dataset()
diff --git a/autorag-workspace/main.py b/autorag-workspace/main.py
new file mode 100644
index 0000000..4ddea57
--- /dev/null
+++ b/autorag-workspace/main.py
@@ -0,0 +1,35 @@
+import os
+import autorag
+import click
+from autorag.evaluator import Evaluator
+from dotenv import load_dotenv
+
+from llama_index.llms.ollama import Ollama
+
+data_path = '../projects/daesan-dangjin_01'  # check that this points at the right project folder
+
+# Register each Ollama model under its own key; reassigning the same "ollama" key would keep only the last model.
+OLLAMA_BASE_URL = "http://autorag-ollama:11434"  # include the scheme so the Ollama client can reach the container
+autorag.generator_models["ollama_phi4"] = autorag.LazyInit(Ollama, base_url=OLLAMA_BASE_URL, model="phi4", request_timeout=300, num_gpus=1)
+autorag.generator_models["ollama_gemma3"] = autorag.LazyInit(Ollama, base_url=OLLAMA_BASE_URL, model="gemma3:12b", request_timeout=300, num_gpus=1)
+autorag.generator_models["ollama_deepseek_r1"] = autorag.LazyInit(Ollama, base_url=OLLAMA_BASE_URL, model="deepseek-r1:14b", request_timeout=300, num_gpus=1)
+autorag.generator_models["ollama_aya_expanse"] = autorag.LazyInit(Ollama, base_url=OLLAMA_BASE_URL, model="aya-expanse:8b", request_timeout=300, num_gpus=1)
+
+# An embedding model was added in autorag/embedding/base.py
+
+@click.command()
+@click.option('--config', type=click.Path(exists=True), default=os.path.join(data_path, 'config.yaml'))
+@click.option('--qa_data_path', type=click.Path(exists=True), default=os.path.join(data_path, 'qa.parquet'))
+@click.option('--corpus_data_path', type=click.Path(exists=True), default=os.path.join(data_path, 'corpus.parquet'))
+@click.option('--project_dir', type=click.Path(exists=False), default=os.path.join(data_path, 'benchmark'))
+def main(config, qa_data_path, corpus_data_path, project_dir):
+    load_dotenv()
+    if os.getenv('OPENAI_API_KEY') is None:
+        raise ValueError('OPENAI_API_KEY environment variable is not set')
+    if not os.path.exists(project_dir):
+        os.makedirs(project_dir)
+    evaluator = Evaluator(qa_data_path, corpus_data_path, project_dir=project_dir)
+    evaluator.start_trial(config)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/autorag-workspace/making.sh b/autorag-workspace/making.sh
new file mode 100644
index 0000000..adf450f
--- /dev/null
+++ b/autorag-workspace/making.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+PROJECT_DIR="../projects/daesan-dangjin_01"
+CONFIG_DIR="$PROJECT_DIR/config"
+RAW_DATA_DIR="$PROJECT_DIR/raw_data"
+
+# TIMESTAMP=$(date +"%Y%m%d_%H%M")
+PARSE_DIR="$PROJECT_DIR/parse"
+CHUNK_DIR="$PROJECT_DIR/chunk"
+
+mkdir -p "$PARSE_DIR" "$CHUNK_DIR";
+
+# ----------------------------------------------------------------------- #
+echo "1️⃣ Parsing PDF 문서 시작..."
+python3 -c "
+from autorag.parser import Parser
+parser = Parser(data_path_glob='$RAW_DATA_DIR/*.pdf', project_dir='$PARSE_DIR')
+parser.start_parsing('$CONFIG_DIR/parse.yaml')
+"
+echo "✅ Parse 데이터 생성 완료"
+# ----------------------------------------------------------------------- #
+echo "2️⃣ Chunking 데이터 생성 시작..."
+python3 -c "
+from autorag.chunker import Chunker
+chunker = Chunker.from_parquet(parsed_data_path='$PARSE_DIR/parsed_result.parquet', project_dir='$CHUNK_DIR')
+chunker.start_chunking('$CONFIG_DIR/chunk.yaml')
+"
+echo "✅ Chunk 데이터 생성 완료"
+# ----------------------------------------------------------------------- #
+QA_SIZE=20
+echo "3️⃣ QA 데이터 생성 시작..."
+python3 -c "
+import os
+import pandas as pd
+from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
+from autorag.data.qa.generation_gt.llama_index_gen_gt import (
+    make_basic_gen_gt,
+    make_concise_gen_gt,
+)
+from autorag.data.qa.query.llama_gen_query import factoid_query_gen
+from autorag.data.qa.sample import random_single_hop
+from autorag.data.qa.schema import Raw, Corpus
+from dotenv import load_dotenv
+from llama_index.llms.openai import OpenAI

+load_dotenv()
+print('OPENAI_API_KEY loaded:', os.getenv('OPENAI_API_KEY') is not None)  # check presence without printing the key

+llm = OpenAI(model='gpt-4o-mini')

+initial_raw = Raw(pd.read_parquet('$PARSE_DIR/parsed_result.parquet', engine='pyarrow'))
+initial_corpus = Corpus(pd.read_parquet('$CHUNK_DIR/0.parquet', engine='pyarrow'), initial_raw)

+qa = (
+    initial_corpus
+    .sample(random_single_hop, n=$QA_SIZE)
+    .map(lambda df: df.reset_index(drop=True))
+    .make_retrieval_gt_contents()
+    .batch_apply(factoid_query_gen, llm=llm, lang='ko')
+    .batch_apply(make_basic_gen_gt, llm=llm, lang='ko')
+    .batch_apply(make_concise_gen_gt, llm=llm, lang='ko')
+    .filter(dontknow_filter_rule_based, lang='ko')
+)

+qa.to_parquet('$PROJECT_DIR/qa.parquet', '$PROJECT_DIR/corpus.parquet')
+"
+echo "✅ QA 데이터 생성 완료"
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..dd74931
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,45 @@
+services:
+  autorag-api:
+    image: autorag-base
+    container_name: autorag-api
+    environment:
+      - CUDA_VISIBLE_DEVICES=0
+      - OPENAI_API_KEY=${OPENAI_API_KEY}  # read from the host environment or a local .env; do not commit the raw key
+      - OLLAMA_API_BASE_URL=http://autorag-ollama:11434  # connect to the Ollama container
+    volumes:
+      - ~/.cache/huggingface:/root/.cache/huggingface
+      - ./:/usr/src/app/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    stdin_open: true
+    tty: true
+    working_dir: /usr/src/app
+    depends_on:
+      - autorag-ollama  # make sure the Ollama container starts first
+    networks:
+      - autorag_network
+
+  autorag-ollama:
+    image: ollama/ollama
+    container_name: autorag-ollama
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    ports:
+      - "11434:11434"  # expose the Ollama API port
+    networks:
+      - autorag_network
+    restart: always
+
+networks:
+  autorag_network:
+    driver: bridge
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..44d9989
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,149 @@
+[build-system]
+requires = ["setuptools", "setuptools-scm"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "AutoRAG"
+authors = [
+    { name = "Marker-Inc", email = "vkehfdl1@gmail.com" }
+]
+description = 'Automatically Evaluate RAG pipelines with your own data. Find optimal structure for new RAG product.'
+readme = "README.md" +requires-python = ">=3.10" +keywords = ['RAG', 'AutoRAG', 'autorag', 'rag-evaluation', 'evaluation', 'rag-auto', 'AutoML', 'AutoML-RAG'] +license = { file = "LICENSE" } +classifiers = [ + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Software Development :: Libraries :: Python Modules", +] +urls = { Homepage = "https://github.com/Marker-Inc-Korea/AutoRAG" } +dynamic = ["version", "dependencies"] + +[tool.poetry] +name = "AutoRAG" +version = "0.0.2" #initial version +description = "Automatically Evaluate RAG pipelines with your own data. Find optimal structure for new RAG product." +authors = ["Marker-Inc "] + +[tool.setuptools.dynamic] +version = { file = ["autorag/VERSION"] } +dependencies = { file = ["requirements.txt"] } + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +where = ["."] +include = ["autorag*"] +exclude = ["tests"] + +[tool.pytest.ini_options] +pythonpath = ["."] +testpaths = ["tests"] +addopts = ["--import-mode=importlib"] # default is prepend + +[project.optional-dependencies] +ko = ["kiwipiepy >= 0.18.0", "konlpy"] +dev = ["ruff", "pre-commit"] +parse = ["PyMuPDF", "pdfminer.six", "pdfplumber", "unstructured", "jq", "unstructured[pdf]", "PyPDF2<3.0", "pdf2image"] +ja = ["sudachipy>=0.6.8", "sudachidict_core"] +gpu = ["torch", "sentencepiece", "bert_score", "optimum[openvino,nncf]", "peft", "llmlingua", "FlagEmbedding", + "sentence-transformers", "transformers", "llama-index-llms-ollama", "llama-index-embeddings-huggingface", + "llama-index-llms-huggingface", "onnxruntime"] +all = ["AutoRAG[gpu]", "AutoRAG[ko]", "AutoRAG[dev]", "AutoRAG[parse]", "AutoRAG[ja]"] + +[project.entry-points.console_scripts] +autorag = "autorag.cli:cli" + +[tool.ruff] +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + +# Same as Black. +line-length = 88 +indent-width = 4 + +# Assume Python 3.9 +target-version = "py39" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F"] +ignore = ["E722", "F821"] + +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = ["B"] + +# Allow unused variables when underscore-prefixed. +dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] +"**/{docs}/*" = ["E402", "F401"] +"test_*.py" = ["F401", "F811"] +"*_test.py" = ["F401", "F811"] +"resources/parse_data/*" = ["W292"] + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "tab" + +# Like Black, respect magic trailing commas. 
+skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" + +# Enable auto-formatting of code examples in docstrings. Markdown, +# reStructuredText code/literal blocks and doctests are all supported. +# +# This is currently disabled by default, but it is planned for this +# to be opt-out in the future. +docstring-code-format = true + +# Set the line length limit used when formatting code snippets in +# docstrings. +# +# This only has an effect when the `docstring-code-format` setting is +# enabled. +docstring-code-line-length = "dynamic" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6378e06 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,65 @@ +pydantic<2.10.0 # incompatible with llama index +numpy<2.0.0 # temporal not using numpy 2.0.0 +pandas>=2.1.0 +tqdm +tiktoken>=0.7.0 # for counting token +openai>=1.0.0 +rank_bm25 # for bm25 retrieval +pyyaml # for yaml file +pyarrow # for pandas with parquet +fastparquet # for pandas with parquet +sacrebleu # for bleu score +evaluate # for meteor and other scores +rouge_score # for rouge score +rich # for pretty logging +click # for cli +cohere>=5.8.0 # for cohere services +tokenlog>=0.0.2 # for token logging +aiohttp # for async http requests +voyageai # for voyageai reranker +mixedbread-ai # for mixedbread-ai reranker +llama-index-llms-bedrock +scikit-learn +emoji + +### Vector DB ### +pymilvus>=2.3.0 # for using milvus vectordb +chromadb>=0.5.0 # for chroma vectordb +weaviate-client # for weaviate vectordb +pinecone[grpc] # for pinecone vectordb +couchbase # for couchbase vectordb +qdrant-client # for qdrant vectordb + +### API server ### +quart +pyngrok +### LlamaIndex ### +llama-index>=0.11.0 +llama-index-core>=0.11.0 +# readers +llama-index-readers-file +# Embeddings +llama-index-embeddings-openai +llama-index-embeddings-ollama +# LLMs +llama-index-llms-openai>=0.2.7 +llama-index-llms-openai-like +# Retriever +llama-index-retrievers-bm25 + +# WebUI +streamlit +gradio + +### Langchain ### +langchain-core==0.3.0 +langchain-unstructured>=0.1.5 +langchain-upstage +langchain-community>=0.3.0 + +# autorag dashboard +panel +seaborn +ipykernel +ipywidgets +ipywidgets_bokeh \ No newline at end of file diff --git a/requirements_custom.txt b/requirements_custom.txt new file mode 100644 index 0000000..35bda56 --- /dev/null +++ b/requirements_custom.txt @@ -0,0 +1,9 @@ +# added library - 김용연 +llama_index.llms.ollama +llama_index.embeddings.huggingface +pdfplumber +pypdfium2 +pymupdf +AutoRAG[gpu] +AutoRAG[ko] +konlpy \ No newline at end of file