Fix Dockerfile build issue
autorag/evaluation/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .retrieval import evaluate_retrieval
from .generation import evaluate_generation
from .retrieval_contents import evaluate_retrieval_contents
autorag/evaluation/generation.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import functools
import warnings
from typing import List, Callable, Union, Dict

import pandas as pd

from autorag.evaluation.metric.generation import (
    bleu,
    meteor,
    rouge,
    sem_score,
    g_eval,
    bert_score,
    deepeval_faithfulness,
)
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput


GENERATION_METRIC_FUNC_DICT = {
    func.__name__: func
    for func in [
        bleu,
        meteor,
        rouge,
        sem_score,
        g_eval,
        bert_score,
        deepeval_faithfulness,
    ]
}


def evaluate_generation(
    metric_inputs: List[MetricInput], metrics: Union[List[str], List[Dict]]
):
    def decorator_evaluate_generation(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> pd.DataFrame:
            generation_result = func(*args, **kwargs)
            if type(generation_result) is tuple:
                assert (
                    type(generation_result[0]) is list
                    and type(generation_result[0][0]) is str
                ), "Input func must return string list as generated answer at the first return value."
                generated_str = generation_result[0]
            elif type(generation_result) is list:
                assert (
                    type(generation_result[0]) is str
                ), "Input func must return string list as generated answer at the first return value."
                generated_str = generation_result
            else:
                raise ValueError(
                    "Input func must return string list as generated answer at the first return value."
                )
            for metric_input, generated_text in zip(metric_inputs, generated_str):
                metric_input.generated_texts = generated_text

            metric_scores = {}
            metric_names, metric_params = cast_metrics(metrics)

            for metric_name, metric_param in zip(metric_names, metric_params):
                if metric_name not in GENERATION_METRIC_FUNC_DICT:
                    warnings.warn(
                        f"metric {metric_name} is not in supported metrics: {GENERATION_METRIC_FUNC_DICT.keys()} "
                        f"{metric_name} will be ignored."
                    )
                else:
                    metric_scores[metric_name] = GENERATION_METRIC_FUNC_DICT[
                        metric_name
                    ](
                        metric_inputs=metric_inputs,
                        **metric_param,
                    )

            metric_result_df = pd.DataFrame(metric_scores)
            execution_result_df = pd.DataFrame({"generated_texts": generated_str})
            if type(generation_result) is tuple:
                execution_result_df["generated_tokens"] = generation_result[1]
                execution_result_df["generated_log_probs"] = generation_result[2]

            result_df = pd.concat([execution_result_df, metric_result_df], axis=1)
            return result_df

        return wrapper

    return decorator_evaluate_generation
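A minimal usage sketch (not part of this commit) of the evaluate_generation decorator, assuming MetricInput accepts its fields as keyword arguments; the query and ground-truth strings are placeholders:

from autorag.evaluation import evaluate_generation
from autorag.schema.metricinput import MetricInput

metric_inputs = [
    MetricInput(generation_gt=["AutoRAG is an AutoML tool for RAG pipelines."]),
]

@evaluate_generation(metric_inputs=metric_inputs, metrics=["bleu", "rouge"])
def run_generator():
    # Must return a list of generated strings (optionally a tuple that also
    # carries generated tokens and log probs as the 2nd and 3rd elements).
    return ["AutoRAG is an automated tool for optimizing RAG pipelines."]

result_df = run_generator()  # columns: generated_texts, bleu, rouge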
autorag/evaluation/metric/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
from .generation import (
    bleu,
    meteor,
    rouge,
    sem_score,
    g_eval,
    bert_score,
    deepeval_faithfulness,
)
from .retrieval import (
    retrieval_f1,
    retrieval_recall,
    retrieval_precision,
    retrieval_mrr,
    retrieval_ndcg,
    retrieval_map,
)
from .retrieval_contents import (
    retrieval_token_f1,
    retrieval_token_precision,
    retrieval_token_recall,
)
autorag/evaluation/metric/deepeval_prompt.py (new file, 322 lines)
@@ -0,0 +1,322 @@
|
||||
class FaithfulnessTemplate:
|
||||
@staticmethod
|
||||
def generate_claims(text, lang: str = "en"):
|
||||
if lang == "en":
|
||||
return f"""Based on the given text, please generate a comprehensive list of FACTUAL claims that can inferred from the provided text.
|
||||
|
||||
Example:
|
||||
Example Text:
|
||||
"Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
|
||||
|
||||
Example JSON:
|
||||
{{
|
||||
"claims": [
|
||||
"Einstein won the noble prize for his discovery of the photoelectric effect.",
|
||||
"Einstein won the noble prize in 1968."
|
||||
]
|
||||
}}
|
||||
===== END OF EXAMPLE ======
|
||||
|
||||
**
|
||||
IMPORTANT: Please make sure to only return in JSON format, with the "claims" key as a list of strings. No words or explanation is needed.
|
||||
Only include claims that are factual, and the claims you extract should include the full context it was presented in, NOT cherry picked facts.
|
||||
You should NOT include any prior knowledge, and take the text at face value when extracting claims.
|
||||
**
|
||||
|
||||
Text:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ko":
|
||||
return f"""주어진 텍스트에서 찾을 수 있는 사실적 정보들의 목록을 생성하세요.
|
||||
|
||||
예시:
|
||||
예시 텍스트:
|
||||
"아인슈타인은 1968년에 광전 효과 발견으로 노벨상을 수상했다."
|
||||
|
||||
예시 JSON:
|
||||
{{
|
||||
"claims": [
"아인슈타인은 광전 효과 발견으로 노벨상을 수상했다.",
"아인슈타인은 1968년에 노벨상을 수상했다."
|
||||
]
|
||||
}}
|
||||
===== 예시 끝 ======
|
||||
|
||||
**
|
||||
중요: 오직 JSON 형식으로 "claims" 키가 문자열 목록으로 반환되도록 해야 합니다. 다른 단어나 설명은 필요하지 않습니다.
|
||||
사실에 기반한 주장만 포함하며, 추출한 주장은 전체 맥락을 유지해야 하며, 부분적으로 선택된 사실을 포함하지 않아야 합니다.
|
||||
사전 지식은 포함하지 말고, 텍스트에만 기초해 주장들을 추출해야 합니다.
|
||||
**
|
||||
|
||||
텍스트:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ja":
|
||||
return f"""与えられたテキストに基づいて、そこから推測できる事実に基づく主張のリストを生成してください。
|
||||
|
||||
例:
|
||||
例のテキスト:
|
||||
「アインシュタインは1968年に光電効果の発見でノーベル賞を受賞しました。」
|
||||
|
||||
例のJSON:
|
||||
{{
|
||||
"claims": [
|
||||
"アインシュタインは光電効果の発見でノーベル賞を受賞しました。",
|
||||
"アインシュタインは1968年にノーベル賞を受賞しました。"
|
||||
]
|
||||
}}
|
||||
===== 例の終わり ======
|
||||
|
||||
**
|
||||
重要: 必ずJSON形式で"claims"キーが文字列のリストとして返されるようにしてください。説明や余計な言葉は不要です。
|
||||
事実に基づく主張のみを含め、抽出された主張は提示された文脈全体を含むものでなければなりません。一部の事実のみを抜粋することは避けてください。
|
||||
事前知識を使用せず、テキストに基づいて主張を抽出してください。
|
||||
**
|
||||
|
||||
テキスト:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
else:
|
||||
raise ValueError(f"Language {lang} is not supported.")
|
||||
|
||||
@staticmethod
|
||||
def generate_truths(text, lang: str = "en"):
|
||||
if lang == "en":
|
||||
return f"""Based on the given text, please generate a comprehensive list of FACTUAL, undisputed truths that can inferred from the provided text.
|
||||
|
||||
Example:
|
||||
Example Text:
|
||||
"Einstein won the noble prize in 1968 for his discovery of the photoelectric effect."
|
||||
|
||||
Example JSON:
|
||||
{{
|
||||
"truths": [
|
||||
"Einstein won the noble prize for his discovery of the photoelectric effect.",
|
||||
"Einstein won the noble prize in 1968."
|
||||
]
|
||||
}}
|
||||
===== END OF EXAMPLE ======
|
||||
|
||||
**
|
||||
IMPORTANT: Please make sure to only return in JSON format, with the "truths" key as a list of strings. No words or explanation is needed.
|
||||
Only include truths that are factual.
|
||||
**
|
||||
|
||||
Text:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ko":
|
||||
return f"""주어진 텍스트에서 추출할 수 있는 사실적이고 논란이 없는 진실들의 목록을 생성하세요.
|
||||
|
||||
예시:
|
||||
예시 텍스트:
|
||||
"아인슈타인은 1968년에 광전 효과 발견으로 노벨상을 수상했다."
|
||||
|
||||
예시 JSON:
|
||||
{{
|
||||
"truths": [
|
||||
"아인슈타인은 광전 효과 발견으로 노벨상을 수상했다.",
|
||||
"아인슈타인은 1968년에 노벨상을 수상했다."
|
||||
]
|
||||
}}
|
||||
===== 예시 끝 ======
|
||||
|
||||
**
|
||||
중요: 오직 JSON 형식으로 "truths" 키가 문자열 목록으로 반환되도록 해야 합니다. 다른 단어나 설명은 필요하지 않습니다.
|
||||
사실에 기반한 진실만 포함해야 합니다.
|
||||
**
|
||||
|
||||
텍스트:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ja":
|
||||
return f"""与えられたテキストに基づいて、そこから推測できる事実で議論の余地のない真実のリストを生成してください。
|
||||
|
||||
例:
|
||||
例のテキスト:
|
||||
「アインシュタインは1968年に光電効果の発見でノーベル賞を受賞しました。」
|
||||
|
||||
例のJSON:
|
||||
{{
|
||||
"truths": [
|
||||
"アインシュタインは光電効果の発見でノーベル賞を受賞しました。",
|
||||
"アインシュタインは1968年にノーベル賞を受賞しました。"
|
||||
]
|
||||
}}
|
||||
===== 例の終わり ======
|
||||
|
||||
**
|
||||
重要: 必ずJSON形式で"truths"キーが文字列のリストとして返されるようにしてください。説明や余計な言葉は不要です。
|
||||
事実に基づく真実のみを含めてください。
|
||||
**
|
||||
|
||||
テキスト:
|
||||
{text}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
else:
|
||||
raise ValueError(f"Language {lang} is not supported.")
|
||||
|
||||
@staticmethod
|
||||
def generate_verdicts(claims, retrieval_context, lang: str = "en"):
|
||||
if lang == "en":
|
||||
return f"""Based on the given claims, which is a list of strings, generate a list of JSON objects to indicate whether EACH claim contradicts any facts in the retrieval context. The JSON will have 2 fields: 'verdict' and 'reason'.
|
||||
The 'verdict' key should STRICTLY be either 'yes', 'no', or 'idk', which states whether the given claim agrees with the context.
|
||||
Provide a 'reason' ONLY if the answer is 'no'.
|
||||
The provided claim is drawn from the actual output. Try to provide a correction in the reason using the facts in the retrieval context.
|
||||
|
||||
**
|
||||
IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects.
|
||||
Example retrieval contexts: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. Einstein won the Nobel Prize in 1968. Einstein is a German Scientist."
|
||||
Example claims: ["Barack Obama is a caucasian male.", "Zurich is a city in London", "Einstein won the Nobel Prize for the discovery of the photoelectric effect which may have contributed to his fame.", "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect.", "Einstein was a Germen chef."]
|
||||
|
||||
Example:
|
||||
{{
|
||||
"verdicts": [
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "yes"
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "The actual output claims Einstein won the Nobel Prize in 1969, which is untrue as the retrieval context states it is 1968 instead."
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "The actual output claims Einstein is a Germen chef, which is not correct as the retrieval context states he was a German scientist instead."
|
||||
}},
|
||||
]
|
||||
}}
|
||||
===== END OF EXAMPLE ======
|
||||
|
||||
The length of 'verdicts' SHOULD BE STRICTLY EQUAL to that of claims.
|
||||
You DON'T have to provide a reason if the answer is 'yes' or 'idk'.
|
||||
ONLY provide a 'no' answer if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGEMENT.
|
||||
Claims made using vague, suggestive, speculative language such as 'may have', 'possibility due to', does NOT count as a contradiction.
|
||||
Claims that is not backed up due to a lack of information/is not mentioned in the retrieval contexts MUST be answered 'idk', otherwise I WILL DIE.
|
||||
**
|
||||
|
||||
Retrieval Contexts:
|
||||
{retrieval_context}
|
||||
|
||||
Claims:
|
||||
{claims}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ko":
|
||||
return f"""주어진 주장에 대해, 각 주장이 주어진 문맥의 사실들과 모순되는지를 나타내는 JSON 객체 목록을 생성하세요. JSON은 두 개의 필드인 'verdict'와 'reason'으로 구성됩니다.
|
||||
'verdict'는 'yes', 'no', 또는 'idk' 중 하나여야 하며, 주어진 주장이 문맥과 일치하는지를 나타냅니다.
|
||||
'verdict'가 'no'인 경우에만 'reason'을 제공하세요. 'reason'에는 문맥에 따라 주장을 수정하는 내용이 포함되어야 합니다.
|
||||
|
||||
**
|
||||
중요: 오직 JSON 형식으로 'verdicts' 키가 JSON 객체 목록으로 반환되도록 해야 합니다.
|
||||
예시 문맥: "아인슈타인은 광전 효과 발견으로 노벨상을 수상했다. 아인슈타인은 1968년에 노벨상을 수상했다. 아인슈타인은 독일 과학자이다."
|
||||
예시 주장: ["버락 오바마는 백인 남성이다.", "취리히는 런던에 있는 도시이다.", "아인슈타인은 광전 효과 발견으로 노벨상을 수상했으며, 이는 그의 명성에 기여했을 것이다.", "아인슈타인은 1969년에 광전 효과 발견으로 노벨상을 수상했다.", "아인슈타인은 독일 요리사였다."]
|
||||
|
||||
예시:
|
||||
{{
|
||||
"verdicts": [
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "yes"
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "실제 출력은 아인슈타인이 1969년에 노벨상을 수상했다고 주장하지만, 문맥에서는 1968년이라고 명시되어 있습니다."
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "실제 출력은 아인슈타인이 독일 요리사라고 주장하지만, 문맥에서는 그가 독일 과학자라고 명시되어 있습니다."
|
||||
}},
|
||||
]
|
||||
}}
|
||||
===== 예시 끝 ======
|
||||
|
||||
'verdicts' 리스트의 길이는 반드시 주장들의 길이와 같아야 합니다.
|
||||
'yes' 또는 'idk'일 경우 'reason'을 제공할 필요가 없습니다.
|
||||
검색된 문맥과 직접적으로 모순되는 경우에만 'no' 답변을 제공하세요. 절대로 선험적인 지식을 사용하지 마세요.
|
||||
'~일 수 있다', '가능성이 있다'와 같은 모호한 표현은 모순으로 간주하지 마세요.
|
||||
문맥에 대한 정보 부족으로 뒷받침되지 않거나 언급되지 않은 주장은 반드시 'idk'로 답변하세요, 그렇지 않으면 내가 죽습니다.
|
||||
**
|
||||
|
||||
주어진 문맥:
|
||||
{retrieval_context}
|
||||
|
||||
주장:
|
||||
{claims}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
elif lang == "ja":
|
||||
return f"""与えられた主張について、それぞれの主張が取得された文脈の事実と矛盾しているかどうかを示すJSONオブジェクトのリストを生成してください。JSONには2つのフィールド、'verdict'と'reason'があります。
|
||||
'verdict'フィールドは、主張が文脈に一致するかどうかを示すため、厳密に'yes', 'no', 'idk'のいずれかを使用します。
|
||||
'verdict'が'no'の場合にのみ、'reason'を提供してください。'reason'には、文脈に基づいて主張を修正する内容が含まれている必要があります。
|
||||
|
||||
**
|
||||
重要: 必ずJSON形式で'verdicts'キーがJSONオブジェクトのリストとして返されるようにしてください。
|
||||
例の文脈:「アインシュタインは光電効果の発見でノーベル賞を受賞しました。アインシュタインは1968年にノーベル賞を受賞しました。アインシュタインはドイツの科学者です。」
|
||||
例の主張: ["バラク・オバマは白人男性です。", "チューリッヒはロンドンにある都市です。", "アインシュタインは光電効果の発見でノーベル賞を受賞し、これが彼の名声に貢献したかもしれません。", "アインシュタインは1969年に光電効果の発見でノーベル賞を受賞しました。", "アインシュタインはドイツのシェフでした。"]
|
||||
|
||||
例のJSON:
|
||||
{{
|
||||
"verdicts": [
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "idk"
|
||||
}},
|
||||
{{
|
||||
"verdict": "yes"
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "実際の出力は、アインシュタインが1969年にノーベル賞を受賞したと主張していますが、文脈では1968年と述べられています。"
|
||||
}},
|
||||
{{
|
||||
"verdict": "no",
|
||||
"reason": "実際の出力は、アインシュタインがドイツのシェフだと主張していますが、文脈では彼がドイツの科学者であると述べられています。"
|
||||
}},
|
||||
]
|
||||
}}
|
||||
===== 例の終わり ======
|
||||
|
||||
'verdicts'のリストの長さは、主張のリストの長さと必ず等しくなければなりません。
|
||||
'yes'または'idk'の場合、'reason'を提供する必要はありません。
|
||||
文脈と直接矛盾する場合にのみ、'no'を提供してください。決して事前知識を使用しないでください。
|
||||
「〜かもしれない」や「〜の可能性がある」といった曖昧な表現は矛盾とは見なされません。
|
||||
情報が不足している、または文脈で言及されていない主張には必ず'idk'で答えてください。さもないと私は死んでしまいます。
|
||||
**
|
||||
|
||||
文脈:
|
||||
{retrieval_context}
|
||||
|
||||
主張:
|
||||
{claims}
|
||||
|
||||
JSON:
|
||||
"""
|
||||
else:
|
||||
raise ValueError(f"Language {lang} is not supported.")
|
||||
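A short sketch (not part of this commit) of how these templates are consumed: deepeval_faithfulness in metric/generation.py builds one truths prompt per retrieval context, one claims prompt per generated answer, and then a verdicts prompt from the two LLM responses. The context and answer strings below are placeholders:

from autorag.evaluation.metric.deepeval_prompt import FaithfulnessTemplate

context = "Einstein won the Nobel Prize in 1921 for the photoelectric effect."
answer = "Einstein received the Nobel Prize for discovering the photoelectric effect."

truth_prompt = FaithfulnessTemplate.generate_truths(context, lang="en")
claim_prompt = FaithfulnessTemplate.generate_claims(answer, lang="en")
# After an LLM answers both prompts with JSON, the extracted lists feed the verdict prompt.
verdict_prompt = FaithfulnessTemplate.generate_verdicts(
    claims="<claims returned by the LLM>",
    retrieval_context="<truths returned by the LLM>",
    lang="en",
)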
autorag/evaluation/metric/g_eval_prompts/coh_detailed.txt (new file, 32 lines)
@@ -0,0 +1,32 @@
You will be given one summary written for a news article.

Your task is to rate the summary on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby "the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information about a topic."

Evaluation Steps:

1. Read the news article carefully and identify the main topic and key points.
2. Read the summary and compare it to the news article. Check if the summary covers the main topic and key points of the news article, and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.


Example:


Source Text:

{{Document}}

Summary:

{{Summary}}


Evaluation Form (scores ONLY):

- Coherence:
autorag/evaluation/metric/g_eval_prompts/con_detailed.txt (new file, 33 lines)
@@ -0,0 +1,33 @@
You will be given a news article. You will then be given one summary written for this article.

Your task is to rate the summary on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.


Evaluation Criteria:

Consistency (1-5) - the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts.

Evaluation Steps:

1. Read the news article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.


Example:


Source Text:

{{Document}}

Summary:

{{Summary}}


Evaluation Form (scores ONLY):

- Consistency:
autorag/evaluation/metric/g_eval_prompts/flu_detailed.txt (new file, 26 lines)
@@ -0,0 +1,26 @@
You will be given one summary written for a news article.

Your task is to rate the summary on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.


Evaluation Criteria:

Fluency (1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.

- 1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
- 2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
- 3: Good. The summary has few or no errors and is easy to read and follow.


Example:

Summary:

{{Summary}}


Evaluation Form (scores ONLY):

- Fluency (1-3):
autorag/evaluation/metric/g_eval_prompts/rel_detailed.txt (new file, 33 lines)
@@ -0,0 +1,33 @@
You will be given one summary written for a news article.

Your task is to rate the summary on one metric.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

Relevance (1-5) - selection of important content from the source. The summary should include only important information from the source document. Annotators were instructed to penalize summaries which contained redundancies and excess information.

Evaluation Steps:

1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.


Example:


Source Text:

{{Document}}

Summary:

{{Summary}}


Evaluation Form (scores ONLY):

- Relevance:
autorag/evaluation/metric/generation.py (new file, 504 lines)
@@ -0,0 +1,504 @@
|
||||
import asyncio
|
||||
import itertools
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
import evaluate
|
||||
import nltk
|
||||
import pandas as pd
|
||||
from llama_index.core.embeddings import BaseEmbedding
|
||||
from llama_index.embeddings.openai import OpenAIEmbedding
|
||||
from openai import AsyncOpenAI
|
||||
from pydantic import BaseModel
|
||||
from rouge_score import tokenizers
|
||||
from rouge_score.rouge_scorer import RougeScorer
|
||||
from sacrebleu.metrics.bleu import BLEU
|
||||
|
||||
from autorag.embedding.base import embedding_models
|
||||
from autorag.evaluation.metric.deepeval_prompt import FaithfulnessTemplate
|
||||
from autorag.evaluation.metric.util import (
|
||||
autorag_metric_loop,
|
||||
calculate_cosine_similarity,
|
||||
)
|
||||
from autorag.nodes.generator import OpenAILLM
|
||||
from autorag.nodes.generator.base import BaseGenerator
|
||||
from autorag.schema.metricinput import MetricInput
|
||||
from autorag.support import get_support_modules
|
||||
from autorag.utils.util import (
|
||||
get_event_loop,
|
||||
process_batch,
|
||||
openai_truncate_by_token,
|
||||
convert_inputs_to_list,
|
||||
pop_params,
|
||||
empty_cuda_cache,
|
||||
)
|
||||
|
||||
|
||||
@convert_inputs_to_list
|
||||
def huggingface_evaluate(
|
||||
instance, key: str, metric_inputs: List[MetricInput], **kwargs
|
||||
) -> List[float]:
|
||||
"""
|
||||
Compute huggingface evaluate metric.
|
||||
|
||||
:param instance: The instance of huggingface evaluates metric.
|
||||
:param key: The key to retrieve result score from huggingface evaluate result.
|
||||
:param metric_inputs: A list of MetricInput schema
|
||||
:param kwargs: The additional arguments for metric function.
|
||||
:return: The list of scores.
|
||||
"""
|
||||
|
||||
def compute_score(gt: List[str], pred: str) -> float:
|
||||
return max(
|
||||
list(
|
||||
map(
|
||||
lambda x: instance.compute(
|
||||
predictions=[pred], references=[x], **kwargs
|
||||
)[key],
|
||||
gt,
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
result = list(
|
||||
map(lambda x: compute_score(x.generation_gt, x.generated_texts), metric_inputs)
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def make_generator_instance(generator_module_type: str, llm: str, **kwargs):
|
||||
llm_class = get_support_modules(generator_module_type)
|
||||
init_params = pop_params(llm_class.__init__, kwargs)
|
||||
return llm_class(project_dir="", llm=llm, **init_params)
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["retrieval_gt_contents", "generated_texts"])
|
||||
def deepeval_faithfulness(
|
||||
metric_inputs: List[MetricInput],
|
||||
generator_module_type: str = "openai_llm",
|
||||
lang: str = "en",
|
||||
llm: str = "gpt-4o-2024-08-06",
|
||||
batch: int = 16,
|
||||
**kwargs,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Compute deepeval faithfulness metric.
|
||||
Its default model is gpt-4o-2024-08-06.
|
||||
Since it uses OpenAI model, please be aware of the expensive cost.
|
||||
|
||||
:param metric_inputs: The list of MetricInput schema (Required Field -> "retrieval_gt_contents", "generated_texts")
|
||||
:param generator_module_type: Generator module type.
|
||||
The default is "openai_llm".
|
||||
You can use like "llama_index_llm" or "vllm".
|
||||
:param lang: The prompt language that you want to use.
|
||||
"en", "ko" and "ja" are supported.
|
||||
The Korean and Japanese prompts are not officially supported by DeepEval; they were translated by the AutoRAG developers.
|
||||
Default is "en".
|
||||
:param llm: The model name to use for generation.
|
||||
Or llm if using llama_index_llm.
|
||||
The default is "gpt-4o-2024-08-06".
|
||||
:param batch: The batch size for processing.
|
||||
Default is 16.
|
||||
:param kwargs: The extra parameters for initializing the llm instance.
|
||||
:return: The metric scores.
|
||||
"""
|
||||
|
||||
class Truth(BaseModel):
|
||||
truths: List[str]
|
||||
|
||||
class Claim(BaseModel):
|
||||
claims: List[str]
|
||||
|
||||
class Verdict(BaseModel):
|
||||
verdict: str
|
||||
reason: Optional[str]
|
||||
|
||||
class FaithfulnessVerdicts(BaseModel):
|
||||
verdicts: List[Verdict]
|
||||
|
||||
def calculate_score(verdicts: List[Verdict]) -> float:
|
||||
number_of_verdicts = len(verdicts)
|
||||
if number_of_verdicts == 0:
|
||||
return 1
|
||||
|
||||
faithfulness_count = 0
|
||||
for verdict in verdicts:
|
||||
if verdict.verdict.strip().lower() != "no":
|
||||
faithfulness_count += 1
|
||||
|
||||
score = faithfulness_count / number_of_verdicts
|
||||
return score
|
||||
|
||||
retrieval_contexts = list(map(lambda x: x.retrieval_gt_contents, metric_inputs))
|
||||
truth_prompts = list(
|
||||
map(lambda x: FaithfulnessTemplate.generate_truths(x, lang), retrieval_contexts)
|
||||
)
|
||||
|
||||
generated_texts = list(map(lambda x: x.generated_texts, metric_inputs))
|
||||
claim_prompts = list(
|
||||
map(lambda x: FaithfulnessTemplate.generate_claims(x, lang), generated_texts)
|
||||
)
|
||||
|
||||
generator: BaseGenerator = make_generator_instance(
|
||||
generator_module_type, llm=llm, batch=batch, **kwargs
|
||||
)
|
||||
if isinstance(generator, OpenAILLM): # Because of the event loop error at the httpx
|
||||
# TODO: Fix the httpx APIConnectionError at the many repetitive request to the OpenAILLM on the same instance
|
||||
truth_responses: List[Truth] = generator.structured_output(truth_prompts, Truth)
|
||||
claim_responses: List[Claim] = make_generator_instance(
|
||||
generator_module_type, llm=llm, batch=batch, **kwargs
|
||||
).structured_output(claim_prompts, Claim)
|
||||
verdict_prompts = list(
|
||||
map(
|
||||
lambda claim, truth: FaithfulnessTemplate.generate_verdicts(
|
||||
"\n\n".join(claim.claims), "\n\n".join(truth.truths), lang
|
||||
),
|
||||
claim_responses,
|
||||
truth_responses,
|
||||
)
|
||||
)
|
||||
verdict_responses: List[FaithfulnessVerdicts] = make_generator_instance(
|
||||
generator_module_type, llm=llm, batch=batch, **kwargs
|
||||
).structured_output(verdict_prompts, FaithfulnessVerdicts)
|
||||
else:
|
||||
truth_responses: List[Truth] = generator.structured_output(truth_prompts, Truth)
|
||||
claim_responses: List[Claim] = generator.structured_output(claim_prompts, Claim)
|
||||
verdict_prompts = list(
|
||||
map(
|
||||
lambda claim, truth: FaithfulnessTemplate.generate_verdicts(
|
||||
"\n\n".join(claim.claims), "\n\n".join(truth.truths), lang
|
||||
),
|
||||
claim_responses,
|
||||
truth_responses,
|
||||
)
|
||||
)
|
||||
verdict_responses: List[FaithfulnessVerdicts] = generator.structured_output(
|
||||
verdict_prompts, FaithfulnessVerdicts
|
||||
)
|
||||
|
||||
result = list(map(lambda x: calculate_score(x.verdicts), verdict_responses))
|
||||
return result
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def bleu(
|
||||
metric_inputs: List[MetricInput],
|
||||
tokenize: Optional[str] = None,
|
||||
smooth_method: str = "exp",
|
||||
smooth_value: Optional[float] = None,
|
||||
max_ngram_order: int = 4,
|
||||
trg_lang: str = "",
|
||||
effective_order: bool = True,
|
||||
**kwargs,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Computes the BLEU metric given pred and ground-truth.
|
||||
|
||||
:param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts")
|
||||
:param tokenize: The tokenizer to use. If None, defaults to language-specific tokenizers with '13a' as the fallback default. check #https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/metrics/bleu.py
|
||||
:param smooth_method: The smoothing method to use ('floor', 'add-k', 'exp' or 'none').
|
||||
:param smooth_value: The smoothing value for `floor` and `add-k` methods. `None` falls back to default value.
|
||||
:param max_ngram_order: If given, it overrides the maximum n-gram order (default: 4) when computing precisions.
|
||||
:param trg_lang: An optional language code to raise potential tokenizer warnings.
|
||||
:param effective_order: If `True`, stop including n-gram orders for which precision is 0. This should be
|
||||
`True`, if sentence-level BLEU will be computed.
|
||||
"""
|
||||
bleu_instance = BLEU(
|
||||
tokenize=tokenize,
|
||||
smooth_method=smooth_method,
|
||||
smooth_value=smooth_value,
|
||||
max_ngram_order=max_ngram_order,
|
||||
trg_lang=trg_lang,
|
||||
effective_order=effective_order,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
result = list(
|
||||
map(
|
||||
lambda x: bleu_instance.sentence_score(
|
||||
x.generated_texts, x.generation_gt
|
||||
).score,
|
||||
metric_inputs,
|
||||
)
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def meteor(
|
||||
metric_inputs: List[MetricInput],
|
||||
alpha: float = 0.9,
|
||||
beta: float = 3.0,
|
||||
gamma: float = 0.5,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Compute meteor score for generation.
|
||||
|
||||
:param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts")
|
||||
:param alpha: Parameter for controlling relative weights of precision and recall.
|
||||
Default is 0.9.
|
||||
:param beta: Parameter for controlling the shape of the penalty as a function of fragmentation.
|
||||
Default is 3.0.
|
||||
:param gamma: Relative weight assigned to fragmentation penalty.
|
||||
Default is 0.5.
|
||||
:return: A list of computed metric scores.
|
||||
"""
|
||||
nltk.download("punkt", quiet=True)
|
||||
meteor_instance = evaluate.load("meteor")
|
||||
result = huggingface_evaluate(
|
||||
meteor_instance,
|
||||
"meteor",
|
||||
metric_inputs,
|
||||
alpha=alpha,
|
||||
beta=beta,
|
||||
gamma=gamma,
|
||||
)
|
||||
del meteor_instance
|
||||
return result
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def rouge(
|
||||
metric_inputs: List[MetricInput],
|
||||
rouge_type: Optional[str] = "rougeL",
|
||||
use_stemmer: bool = False,
|
||||
split_summaries: bool = False,
|
||||
batch: int = os.cpu_count(),
|
||||
) -> List[float]:
|
||||
"""
|
||||
Compute rouge score for generation.
|
||||
|
||||
:param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts")
|
||||
:param rouge_type: A rouge type to use for evaluation.
|
||||
Default is 'RougeL'.
|
||||
Choose between rouge1, rouge2, rougeL, and rougeLSum.
|
||||
- rouge1: unigram (1-gram) based scoring.
|
||||
- rouge2: bigram (2-gram) based scoring.
|
||||
- rougeL: Longest Common Subsequence based scoring.
|
||||
- rougeLSum: splits text using "\n"
|
||||
:param use_stemmer: Bool indicating whether Porter stemmer should be used to
|
||||
strip word suffixes to improve matching. This arg is used in the
|
||||
DefaultTokenizer, but other tokenizers might or might not choose to
|
||||
use this. Default is False.
|
||||
:param split_summaries: Whether to add newlines between sentences for rougeLsum.
|
||||
Default is False.
|
||||
:param batch: The batch size for processing.
|
||||
Default is your cpu count.
|
||||
:return: A list of computed metric scores.
|
||||
"""
|
||||
rouge_instance = RougeScorer(
|
||||
rouge_types=[rouge_type],
|
||||
use_stemmer=use_stemmer,
|
||||
split_summaries=split_summaries,
|
||||
tokenizer=tokenizers.DefaultTokenizer(use_stemmer),
|
||||
)
|
||||
|
||||
async def compute(gt: List[str], pred: str) -> float:
|
||||
return rouge_instance.score_multi(targets=gt, prediction=pred)[
|
||||
rouge_type
|
||||
].fmeasure
|
||||
|
||||
tasks = [
|
||||
compute(metric_input.generation_gt, metric_input.generated_texts)
|
||||
for metric_input in metric_inputs
|
||||
]
|
||||
loop = get_event_loop()
|
||||
result = loop.run_until_complete(process_batch(tasks, batch_size=batch))
|
||||
|
||||
del rouge_instance
|
||||
return result
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def sem_score(
|
||||
metric_inputs: List[MetricInput],
|
||||
embedding_model: Optional[BaseEmbedding] = None,
|
||||
batch: int = 128,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Compute sem score between generation gt and pred with cosine similarity.
|
||||
|
||||
:param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts")
|
||||
:param embedding_model: Embedding model to use for compute cosine similarity.
|
||||
Default is all-mpnet-base-v2 embedding model.
|
||||
The paper used this embedding model.
|
||||
:param batch: The batch size for processing.
|
||||
Default is 128
|
||||
:return: A list of computed metric scores.
|
||||
"""
|
||||
generations = [metric_input.generated_texts for metric_input in metric_inputs]
|
||||
generation_gt = [metric_input.generation_gt for metric_input in metric_inputs]
|
||||
if embedding_model is None:
|
||||
embedding_model = embedding_models.get("huggingface_all_mpnet_base_v2")()
|
||||
|
||||
embedding_model.embed_batch_size = batch
|
||||
|
||||
openai_embedding_max_length = 8000
|
||||
if isinstance(embedding_model, OpenAIEmbedding):
|
||||
generations = openai_truncate_by_token(
|
||||
generations, openai_embedding_max_length, embedding_model.model_name
|
||||
)
|
||||
|
||||
embedded_pred: List[List[float]] = embedding_model.get_text_embedding_batch(
|
||||
generations, show_progress=True
|
||||
)
|
||||
gt_lengths = list(map(len, generation_gt))
|
||||
flatten_gt = list(itertools.chain.from_iterable(generation_gt))
|
||||
if isinstance(embedding_model, OpenAIEmbedding):
|
||||
flatten_gt = openai_truncate_by_token(
|
||||
flatten_gt, openai_embedding_max_length, embedding_model.model_name
|
||||
)
|
||||
embedded_gt_flatten = embedding_model.get_text_embedding_batch(
|
||||
flatten_gt, show_progress=True
|
||||
)
|
||||
# re-group embedded_gt_flatten with gt_lengths
|
||||
iterator = iter(embedded_gt_flatten)
|
||||
embedded_gt: List[List[List[float]]] = [
|
||||
list(itertools.islice(iterator, length)) for length in gt_lengths
|
||||
]
|
||||
|
||||
result = []
|
||||
for gt, pred in zip(embedded_gt, embedded_pred):
|
||||
similarity_scores: List[float] = list(
|
||||
map(lambda x: calculate_cosine_similarity(x, pred), gt)
|
||||
)
|
||||
result.append(max(similarity_scores))
|
||||
|
||||
del embedding_model
|
||||
empty_cuda_cache()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def g_eval(
|
||||
metric_inputs: List[MetricInput],
|
||||
metrics: Optional[List[str]] = None,
|
||||
model: str = "gpt-4-0125-preview",
|
||||
batch_size: int = 8,
|
||||
) -> List[float]:
|
||||
"""
|
||||
Calculate G-Eval score.
|
||||
G-eval is a metric that uses high-performance LLM model to evaluate generation performance.
|
||||
It evaluates the generation result by coherence, consistency, fluency, and relevance.
|
||||
It uses only 'openai' model, and we recommend to use gpt-4 for evaluation accuracy.
|
||||
|
||||
:param metric_inputs: A list of MetricInput schema (Required Field -> "generation_gt", "generated_texts")
|
||||
:param metrics: A list of metrics to use for evaluation.
|
||||
Default is all metrics, which is ['coherence', 'consistency', 'fluency', 'relevance'].
|
||||
:param model: OpenAI model name.
|
||||
Default is 'gpt-4-0125-preview'.
|
||||
:param batch_size: The batch size for processing.
|
||||
Default is 8.
|
||||
:return: G-Eval score.
|
||||
"""
|
||||
generations = [metric_input.generated_texts for metric_input in metric_inputs]
|
||||
generation_gt = [metric_input.generation_gt for metric_input in metric_inputs]
|
||||
loop = get_event_loop()
|
||||
tasks = [
|
||||
async_g_eval(gt, pred, metrics, model)
|
||||
for gt, pred in zip(generation_gt, generations)
|
||||
]
|
||||
result = loop.run_until_complete(process_batch(tasks, batch_size=batch_size))
|
||||
return result
|
||||
|
||||
|
||||
async def async_g_eval(
|
||||
generation_gt: List[str],
|
||||
pred: str,
|
||||
metrics: Optional[List[str]] = None,
|
||||
model: str = "gpt-4-0125-preview",
|
||||
) -> float:
|
||||
available_metrics = ["coherence", "consistency", "fluency", "relevance"]
|
||||
if metrics is None:
|
||||
metrics = available_metrics
|
||||
else:
|
||||
assert len(metrics) > 0, "metrics must be a list of string"
|
||||
metrics = [metric for metric in metrics if metric in available_metrics]
|
||||
|
||||
current_path = os.path.dirname(os.path.realpath(__file__))
|
||||
prompt_path = os.path.join(current_path, "g_eval_prompts")
|
||||
g_eval_prompts = {
|
||||
"coherence": open(os.path.join(prompt_path, "coh_detailed.txt")).read(),
|
||||
"consistency": open(os.path.join(prompt_path, "con_detailed.txt")).read(),
|
||||
"fluency": open(os.path.join(prompt_path, "flu_detailed.txt")).read(),
|
||||
"relevance": open(os.path.join(prompt_path, "rel_detailed.txt")).read(),
|
||||
}
|
||||
|
||||
client = AsyncOpenAI()
|
||||
|
||||
async def g_eval_score(prompt: str, gen_gt: List[str], pred: str):
|
||||
scores = []
|
||||
for gt in gen_gt:
|
||||
input_prompt = prompt.replace("{{Document}}", gt).replace(
|
||||
"{{Summary}}", pred
|
||||
)
|
||||
response = await client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[{"role": "system", "content": input_prompt}],
|
||||
logprobs=True,
|
||||
top_logprobs=5,
|
||||
temperature=0,
|
||||
max_tokens=2,
|
||||
frequency_penalty=0,
|
||||
presence_penalty=0,
|
||||
stop=None,
|
||||
n=20,
|
||||
)
|
||||
if "(1-3):" in prompt:
|
||||
scores.append(get_g_eval_score(response, max_score=3))
|
||||
else:
|
||||
scores.append(get_g_eval_score(response))
|
||||
return max(scores)
|
||||
|
||||
def get_g_eval_score(responses, max_score: int = 5) -> int:
|
||||
target_tokens = {str(i): 0 for i in range(1, max_score + 1)}
|
||||
for choice in responses.choices:
|
||||
first_top_log_probs = choice.logprobs.content[0].top_logprobs
|
||||
for i, top_log_prob in enumerate(
|
||||
list(map(lambda x: x.token, first_top_log_probs))
|
||||
):
|
||||
if top_log_prob in target_tokens:
|
||||
target_tokens[top_log_prob] += 5 - i
|
||||
|
||||
return int(max(target_tokens, key=target_tokens.get))
|
||||
|
||||
g_eval_scores = await asyncio.gather(
|
||||
*(g_eval_score(g_eval_prompts[x], generation_gt, pred) for x in metrics)
|
||||
)
|
||||
return sum(g_eval_scores) / len(g_eval_scores)
|
||||
|
||||
|
||||
@autorag_metric_loop(fields_to_check=["generation_gt", "generated_texts"])
|
||||
def bert_score(
|
||||
metric_inputs: List[MetricInput],
|
||||
lang: str = "en",
|
||||
batch: int = 128,
|
||||
n_threads: int = os.cpu_count(),
|
||||
) -> List[float]:
|
||||
generations = [metric_input.generated_texts for metric_input in metric_inputs]
|
||||
generation_gt = [metric_input.generation_gt for metric_input in metric_inputs]
|
||||
evaluator = evaluate.load("bertscore")
|
||||
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"reference": generation_gt,
|
||||
"prediction": generations,
|
||||
"lang": lang,
|
||||
}
|
||||
)
|
||||
|
||||
df = df.explode("reference", ignore_index=False)
|
||||
df["bert_score"] = evaluator.compute(
|
||||
predictions=df["prediction"].tolist(),
|
||||
references=df["reference"].tolist(),
|
||||
lang=lang,
|
||||
nthreads=n_threads,
|
||||
batch_size=batch,
|
||||
)["f1"]
|
||||
|
||||
del evaluator
|
||||
empty_cuda_cache()
|
||||
|
||||
return df.groupby(level=0)["bert_score"].max().tolist()
|
||||
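Because of the autorag_metric_loop decorator, each generation metric takes the whole list of MetricInput objects and returns one score per input, with None where a required field is missing. A hedged sketch (not part of this commit), assuming MetricInput accepts keyword arguments:

from autorag.evaluation.metric.generation import bleu
from autorag.schema.metricinput import MetricInput

inputs = [
    MetricInput(
        generation_gt=["Paris is the capital of France."],
        generated_texts="The capital of France is Paris.",
    ),
    MetricInput(generation_gt=["Berlin is the capital of Germany."]),  # no generated_texts
]

scores = bleu(metric_inputs=inputs)
# scores[0] is a float BLEU score; scores[1] is None because "generated_texts" is missing.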
autorag/evaluation/metric/retrieval.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import itertools
import math

from autorag.evaluation.metric.util import autorag_metric
from autorag.schema.metricinput import MetricInput


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_f1(metric_input: MetricInput):
    """
    Compute the F1 score for retrieval.

    :param metric_input: The MetricInput schema for AutoRAG metric.
    :return: The F1 score.
    """
    recall_score = retrieval_recall.__wrapped__(metric_input)
    precision_score = retrieval_precision.__wrapped__(metric_input)
    if recall_score + precision_score == 0:
        return 0
    else:
        return 2 * (recall_score * precision_score) / (recall_score + precision_score)


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_recall(metric_input: MetricInput) -> float:
    gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids

    gt_sets = [frozenset(g) for g in gt]
    pred_set = set(pred)
    hits = sum(any(pred_id in gt_set for pred_id in pred_set) for gt_set in gt_sets)
    recall = hits / len(gt) if len(gt) > 0 else 0.0
    return recall


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_precision(metric_input: MetricInput) -> float:
    gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids

    gt_sets = [frozenset(g) for g in gt]
    pred_set = set(pred)
    hits = sum(any(pred_id in gt_set for gt_set in gt_sets) for pred_id in pred_set)
    precision = hits / len(pred) if len(pred) > 0 else 0.0
    return precision


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_ndcg(metric_input: MetricInput) -> float:
    gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids

    gt_sets = [frozenset(g) for g in gt]
    pred_set = set(pred)
    relevance_scores = {
        pred_id: 1 if any(pred_id in gt_set for gt_set in gt_sets) else 0
        for pred_id in pred_set
    }

    dcg = sum(
        (2 ** relevance_scores[doc_id] - 1) / math.log2(i + 2)
        for i, doc_id in enumerate(pred)
    )

    len_flatten_gt = len(list(itertools.chain.from_iterable(gt)))
    len_pred = len(pred)
    ideal_pred = [1] * min(len_flatten_gt, len_pred) + [0] * max(
        0, len_pred - len_flatten_gt
    )
    idcg = sum(relevance / math.log2(i + 2) for i, relevance in enumerate(ideal_pred))

    ndcg = dcg / idcg if idcg > 0 else 0
    return ndcg


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_mrr(metric_input: MetricInput) -> float:
    """
    Reciprocal Rank (RR) is the reciprocal of the rank of the first relevant item.
    The mean of RR over all queries is MRR.
    """
    gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids

    # Flatten the ground truth list of lists into a single set of relevant documents
    gt_sets = [frozenset(g) for g in gt]

    rr_list = []
    for gt_set in gt_sets:
        for i, pred_id in enumerate(pred):
            if pred_id in gt_set:
                rr_list.append(1.0 / (i + 1))
                break
    return sum(rr_list) / len(gt_sets) if rr_list else 0.0


@autorag_metric(fields_to_check=["retrieval_gt", "retrieved_ids"])
def retrieval_map(metric_input: MetricInput) -> float:
    """
    Mean Average Precision (MAP) is the mean of Average Precision (AP) over all queries.
    """
    gt, pred = metric_input.retrieval_gt, metric_input.retrieved_ids

    gt_sets = [frozenset(g) for g in gt]

    ap_list = []

    for gt_set in gt_sets:
        pred_hits = [1 if pred_id in gt_set else 0 for pred_id in pred]
        precision_list = [
            sum(pred_hits[: i + 1]) / (i + 1)
            for i, hit in enumerate(pred_hits)
            if hit == 1
        ]
        ap_list.append(
            sum(precision_list) / len(precision_list) if precision_list else 0.0
        )

    return sum(ap_list) / len(gt_sets) if ap_list else 0.0
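A worked toy example (not part of this commit) showing how these functions treat retrieval_gt as a list of ground-truth groups: recall counts the fraction of groups that were hit, precision the fraction of retrieved ids found in any group. Field names follow the code above; the doc ids are placeholders:

from autorag.evaluation.metric.retrieval import (
    retrieval_f1,
    retrieval_precision,
    retrieval_recall,
)
from autorag.schema.metricinput import MetricInput

inputs = [
    MetricInput(
        retrieval_gt=[["doc-1", "doc-2"], ["doc-7"]],  # two ground-truth groups
        retrieved_ids=["doc-2", "doc-9", "doc-7"],
    )
]

print(retrieval_recall(metric_inputs=inputs))     # [1.0]   both groups were hit
print(retrieval_precision(metric_inputs=inputs))  # [0.666...] 2 of 3 retrieved ids are relevant
print(retrieval_f1(metric_inputs=inputs))         # [0.8]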
autorag/evaluation/metric/retrieval_contents.py (new file, 65 lines)
@@ -0,0 +1,65 @@
"""
This file contains the retrieval contents metrics,
which compute scores based on the contents of the retrieved items.
"""

import itertools
from collections import Counter

import numpy as np

from autorag.evaluation.metric.util import autorag_metric
from autorag.schema.metricinput import MetricInput
from autorag.utils.util import normalize_string


def single_token_f1(ground_truth: str, prediction: str):
    prediction_tokens = normalize_string(prediction).split()
    ground_truth_tokens = normalize_string(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0, 0, 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return precision, recall, f1


@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"])
def retrieval_token_f1(metric_input: MetricInput):
    pred = metric_input.retrieved_contents
    gt = itertools.chain.from_iterable(metric_input.retrieval_gt_contents)

    calculated_results = list(
        map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt)))
    )
    _, _, result = zip(*calculated_results)
    result_np = np.array(list(result)).reshape(len(pred), -1)
    return result_np.max(axis=1).mean()


@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"])
def retrieval_token_precision(metric_input: MetricInput):
    pred = metric_input.retrieved_contents
    gt = itertools.chain.from_iterable(metric_input.retrieval_gt_contents)

    calculated_results = list(
        map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt)))
    )
    result, _, _ = zip(*calculated_results)
    result_np = np.array(list(result)).reshape(len(pred), -1)
    return result_np.max(axis=1).mean()


@autorag_metric(fields_to_check=["retrieved_contents", "retrieval_gt_contents"])
def retrieval_token_recall(metric_input: MetricInput):
    pred = metric_input.retrieved_contents
    gt = itertools.chain.from_iterable(metric_input.retrieval_gt_contents)

    calculated_results = list(
        map(lambda x: single_token_f1(x[1], x[0]), list(itertools.product(pred, gt)))
    )
    _, result, _ = zip(*calculated_results)
    result_np = np.array(list(result)).reshape(len(pred), -1)
    return result_np.max(axis=1).mean()
autorag/evaluation/metric/util.py (new file, 88 lines)
@@ -0,0 +1,88 @@
import functools
from typing import List

import numpy as np

from autorag.schema.metricinput import MetricInput
from autorag.utils.util import convert_inputs_to_list


def calculate_cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity


def calculate_l2_distance(a, b):
    return np.linalg.norm(a - b)


def calculate_inner_product(a, b):
    return np.dot(a, b)


def autorag_metric(fields_to_check: List[str]):
    def decorator_autorag_metric(func):
        @functools.wraps(func)
        @convert_inputs_to_list
        def wrapper(metric_inputs: List[MetricInput], **kwargs) -> List[float]:
            """
            Run each metric input in a for loop,
            putting a single metric input into the metric function at a time.

            :param metric_inputs: A list of MetricInput schema for AutoRAG metric.
            :param kwargs: The additional arguments for the metric function.
            :return: A list of computed metric scores.
            """
            results = []
            for metric_input in metric_inputs:
                if metric_input.is_fields_notnone(fields_to_check=fields_to_check):
                    results.append(func(metric_input, **kwargs))
                else:
                    results.append(None)
            return results

        return wrapper

    return decorator_autorag_metric


def autorag_metric_loop(fields_to_check: List[str]):
    def decorator_autorag_generation_metric(func):
        @functools.wraps(func)
        @convert_inputs_to_list
        def wrapper(metric_inputs: List[MetricInput], **kwargs) -> List[float]:
            """
            Put the whole list of metric inputs into the metric function at once.

            :param metric_inputs: A list of MetricInput schema for AutoRAG metric.
            :param kwargs: The additional arguments for the metric function.
            :return: A list of computed metric scores.
            """
            bool_list = [
                metric_input.is_fields_notnone(fields_to_check=fields_to_check)
                for metric_input in metric_inputs
            ]
            valid_inputs = [
                metric_input
                for metric_input, is_valid in zip(metric_inputs, bool_list)
                if is_valid
            ]

            results = [None] * len(metric_inputs)
            if valid_inputs:
                processed_valid = func(valid_inputs, **kwargs)

                valid_index = 0
                for i, is_valid in enumerate(bool_list):
                    if is_valid:
                        results[i] = processed_valid[valid_index]
                        valid_index += 1

            return results

        return wrapper

    return decorator_autorag_generation_metric
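A sketch (not part of this commit) of defining a custom per-sample metric with autorag_metric; autorag_metric_loop works the same way but hands the whole valid batch to the wrapped function instead of one MetricInput at a time. The exact_match metric below is illustrative only:

from autorag.evaluation.metric.util import autorag_metric
from autorag.schema.metricinput import MetricInput


@autorag_metric(fields_to_check=["generation_gt", "generated_texts"])
def exact_match(metric_input: MetricInput) -> float:
    # Scores a single sample; the decorator maps it over the input list and
    # fills in None for samples whose required fields are not set.
    return float(metric_input.generated_texts in metric_input.generation_gt)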
autorag/evaluation/retrieval.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import functools
import warnings
from typing import List, Callable, Any, Tuple, Union, Dict

import pandas as pd

from autorag.evaluation.metric import (
    retrieval_recall,
    retrieval_precision,
    retrieval_f1,
    retrieval_ndcg,
    retrieval_mrr,
    retrieval_map,
)
from autorag.evaluation.util import cast_metrics
from autorag.schema.metricinput import MetricInput

RETRIEVAL_METRIC_FUNC_DICT = {
    func.__name__: func
    for func in [
        retrieval_recall,
        retrieval_precision,
        retrieval_f1,
        retrieval_ndcg,
        retrieval_mrr,
        retrieval_map,
    ]
}


def evaluate_retrieval(
    metric_inputs: List[MetricInput],
    metrics: Union[List[str], List[Dict]],
):
    def decorator_evaluate_retrieval(
        func: Callable[
            [Any], Tuple[List[List[str]], List[List[str]], List[List[float]]]
        ],
    ):
        """
        Decorator for evaluating retrieval results.
        You can apply this decorator to any method that returns (contents, ids, scores),
        which is the output of conventional retrieval modules.

        :param func: Must return (contents, ids, scores)
        :return: wrapper function that returns pd.DataFrame, which is the evaluation result.
        """

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> pd.DataFrame:
            contents, pred_ids, scores = func(*args, **kwargs)
            for metric_input, pred_id in zip(metric_inputs, pred_ids):
                metric_input.retrieved_ids = pred_id

            metric_scores = {}
            metric_names, metric_params = cast_metrics(metrics)

            for metric_name, metric_param in zip(metric_names, metric_params):
                if metric_name in RETRIEVAL_METRIC_FUNC_DICT:
                    metric_func = RETRIEVAL_METRIC_FUNC_DICT[metric_name]
                    metric_scores[metric_name] = metric_func(
                        metric_inputs=metric_inputs, **metric_param
                    )
                else:
                    warnings.warn(
                        f"metric {metric_name} is not in supported metrics: {RETRIEVAL_METRIC_FUNC_DICT.keys()} "
                        f"{metric_name} will be ignored."
                    )

            metric_result_df = pd.DataFrame(metric_scores)
            execution_result_df = pd.DataFrame(
                {
                    "retrieved_contents": contents,
                    "retrieved_ids": pred_ids,
                    "retrieve_scores": scores,
                }
            )
            result_df = pd.concat([execution_result_df, metric_result_df], axis=1)
            return result_df

        return wrapper

    return decorator_evaluate_retrieval
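A minimal sketch (not part of this commit) of decorating a retrieval call with evaluate_retrieval; the returned triple must be (contents, ids, scores) with one inner list per query, and the doc ids are placeholders:

from autorag.evaluation import evaluate_retrieval
from autorag.schema.metricinput import MetricInput

metric_inputs = [MetricInput(retrieval_gt=[["doc-2"]])]

@evaluate_retrieval(metric_inputs=metric_inputs, metrics=["retrieval_recall", "retrieval_precision"])
def run_retrieval():
    # Must return (contents, ids, scores), one inner list per query.
    return [["passage text ..."]], [["doc-2", "doc-9"]], [[0.91, 0.47]]

result_df = run_retrieval()  # retrieved_* columns plus retrieval_recall / retrieval_precision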
autorag/evaluation/retrieval_contents.py (new file, 65 lines)
@@ -0,0 +1,65 @@
import functools
from typing import List, Callable, Any, Tuple

import pandas as pd

from autorag.evaluation.metric import (
    retrieval_token_f1,
    retrieval_token_precision,
    retrieval_token_recall,
)
from autorag.schema.metricinput import MetricInput


def evaluate_retrieval_contents(metric_inputs: List[MetricInput], metrics: List[str]):
    def decorator_evaluate_retrieval_contents(
        func: Callable[
            [Any], Tuple[List[List[str]], List[List[str]], List[List[float]]]
        ],
    ):
        """
        Decorator for evaluating retrieval contents.
        You can apply this decorator to any method that returns (contents, ids, scores),
        which is the output of conventional retrieval modules.

        :param func: Must return (contents, ids, scores)
        :return: pd.DataFrame, which is the evaluation result and function result.
        """

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> pd.DataFrame:
            contents, pred_ids, scores = func(*args, **kwargs)
            metric_funcs = {
                retrieval_token_recall.__name__: retrieval_token_recall,
                retrieval_token_precision.__name__: retrieval_token_precision,
                retrieval_token_f1.__name__: retrieval_token_f1,
            }
            for metric_input, content in zip(metric_inputs, contents):
                metric_input.retrieved_contents = content

            metrics_scores = {}
            for metric in metrics:
                if metric not in metric_funcs:
                    raise ValueError(
                        f"metric {metric} is not in supported metrics: {metric_funcs.keys()}"
                    )
                else:
                    metric_func = metric_funcs[metric]
                    # Extract each required field from all payloads
                    metric_scores = metric_func(metric_inputs=metric_inputs)
                    metrics_scores[metric] = metric_scores

            metric_result_df = pd.DataFrame(metrics_scores)
            execution_result_df = pd.DataFrame(
                {
                    "retrieved_contents": contents,
                    "retrieved_ids": pred_ids,
                    "retrieve_scores": scores,
                }
            )
            result_df = pd.concat([execution_result_df, metric_result_df], axis=1)
            return result_df

        return wrapper

    return decorator_evaluate_retrieval_contents
autorag/evaluation/util.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from copy import deepcopy
from typing import Union, List, Dict, Tuple, Any

from autorag.embedding.base import EmbeddingModel


def cast_metrics(
    metrics: Union[List[str], List[Dict]],
) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Turn metrics into a list of metric names and a list of metric parameter dicts.

    :param metrics: List of string or dictionary.
    :return: The list of metric names and dictionary list of metric parameters.
    """
    metrics_copy = deepcopy(metrics)
    if not isinstance(metrics_copy, list):
        raise ValueError("metrics must be a list of string or dictionary.")
    if isinstance(metrics_copy[0], str):
        return metrics_copy, [{} for _ in metrics_copy]
    elif isinstance(metrics_copy[0], dict):
        # pop 'metric_name' key from dictionary
        metric_names = list(map(lambda x: x.pop("metric_name"), metrics_copy))
        metric_params = [
            dict(
                map(
                    lambda x, y: cast_embedding_model(x, y),
                    metric.keys(),
                    metric.values(),
                )
            )
            for metric in metrics_copy
        ]
        return metric_names, metric_params
    else:
        raise ValueError("metrics must be a list of string or dictionary.")


def cast_embedding_model(key, value):
    if key == "embedding_model":
        return key, EmbeddingModel.load(value)()
    else:
        return key, value
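A short sketch (not part of this commit) of the two metric spec styles cast_metrics accepts; the "openai" embedding key is illustrative and assumes such a name is registered with EmbeddingModel.load:

from autorag.evaluation.util import cast_metrics

names, params = cast_metrics(["bleu", "rouge"])
# names == ["bleu", "rouge"], params == [{}, {}]

names, params = cast_metrics([
    {"metric_name": "sem_score", "embedding_model": "openai"},
    {"metric_name": "bleu", "max_ngram_order": 2},
])
# names == ["sem_score", "bleu"]; the "embedding_model" value is resolved to an
# embedding instance via EmbeddingModel.load, every other key passes through unchanged.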