Fix Dockerfile build issue

commit 9323aa254a (parent 6814230bfb)
Author: kyy
Date: 2025-03-18 16:41:12 +09:00

228 changed files with 467 additions and 3488 deletions


@@ -0,0 +1,16 @@
from typing import Dict


def add_gen_gt(row: Dict, new_gen_gt: str) -> Dict:
    # Append the new ground-truth answer to an existing list, or wrap an
    # existing string answer into a list together with the new one.
    if "generation_gt" in row:
        if isinstance(row["generation_gt"], list):
            row["generation_gt"].append(new_gen_gt)
        elif isinstance(row["generation_gt"], str):
            row["generation_gt"] = [row["generation_gt"], new_gen_gt]
        else:
            raise ValueError(
                "generation_gt should be either a string or a list of strings."
            )
        return row
    # No previous answer: start a fresh list.
    row["generation_gt"] = [new_gen_gt]
    return row
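
A quick usage sketch (illustrative, not part of the diff): add_gen_gt appends to an existing list, wraps an existing string into a list, and otherwise creates the list.

# Hypothetical example row; only the "generation_gt" handling matters here.
row = {"query": "What is AutoRAG?"}
row = add_gen_gt(row, "an AutoML tool for RAG")
assert row["generation_gt"] == ["an AutoML tool for RAG"]
row = add_gen_gt(row, "a RAG optimization framework")
assert row["generation_gt"] == [
    "an AutoML tool for RAG",
    "a RAG optimization framework",
]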


@@ -0,0 +1,41 @@
import itertools
from typing import Dict

from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.types import MessageRole, ChatMessage

from autorag.data.qa.generation_gt.base import add_gen_gt
from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT


async def make_gen_gt_llama_index(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict:
    # Flatten the nested retrieval ground-truth passages into a single list.
    retrieval_gt_contents = list(
        itertools.chain.from_iterable(row["retrieval_gt_contents"])
    )
    query = row["query"]
    passage_str = "\n".join(retrieval_gt_contents)
    user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"
    response = await llm.achat(
        messages=[
            ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
            ChatMessage(role=MessageRole.USER, content=user_prompt),
        ],
        temperature=0.0,
    )
    return add_gen_gt(row, response.message.content)


async def make_concise_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict:
    return await make_gen_gt_llama_index(
        row, llm, GEN_GT_SYSTEM_PROMPT["concise"][lang]
    )


async def make_basic_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict:
    return await make_gen_gt_llama_index(row, llm, GEN_GT_SYSTEM_PROMPT["basic"][lang])


async def make_custom_gen_gt(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict:
    return await make_gen_gt_llama_index(row, llm, system_prompt)
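
A minimal usage sketch, assuming an OpenAI-backed llama_index LLM (the model name and row contents here are illustrative, not from the commit):

import asyncio
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o-mini")  # any llama_index BaseLLM subclass works
row = {
    "query": "Who wrote the passage?",
    "retrieval_gt_contents": [["The passage was written by Jane Doe."]],
}
row = asyncio.run(make_basic_gen_gt(row, llm))
print(row["generation_gt"])  # e.g. ["Jane Doe"]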


@@ -0,0 +1,84 @@
import itertools
from typing import Dict

from openai import AsyncClient
from pydantic import BaseModel

from autorag.data.qa.generation_gt.base import add_gen_gt
from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT


class Response(BaseModel):
    answer: str


async def make_gen_gt_openai(
    row: Dict,
    client: AsyncClient,
    system_prompt: str,
    model_name: str = "gpt-4o-2024-08-06",
):
    # Flatten the nested retrieval ground-truth passages into a single list.
    retrieval_gt_contents = list(
        itertools.chain.from_iterable(row["retrieval_gt_contents"])
    )
    query = row["query"]
    passage_str = "\n".join(retrieval_gt_contents)
    user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"
    # Structured Outputs: the completion is parsed into the Response schema.
    completion = await client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        response_format=Response,
    )
    response: Response = completion.choices[0].message.parsed
    return add_gen_gt(row, response.answer)


async def make_concise_gen_gt(
    row: Dict,
    client: AsyncClient,
    model_name: str = "gpt-4o-2024-08-06",
    lang: str = "en",
):
    """
    Generate a concise generation_gt using OpenAI Structured Outputs to prevent
    formatting errors. The answer is concise, generally a single word or phrase.

    :param row: The input row of the QA dataframe.
    :param client: The OpenAI async client.
    :param model_name: The model name; it has to support structured output,
        i.e. "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18".
    :param lang: The language code of the prompt. Default is "en".
    :return: The output row of the QA dataframe with "generation_gt" added.
    """
    return await make_gen_gt_openai(
        row, client, GEN_GT_SYSTEM_PROMPT["concise"][lang], model_name
    )


async def make_basic_gen_gt(
    row: Dict,
    client: AsyncClient,
    model_name: str = "gpt-4o-2024-08-06",
    lang: str = "en",
):
    """
    Generate a basic generation_gt using OpenAI Structured Outputs to prevent
    formatting errors. It uses a simple prompt and produces a plain answer.

    :param row: The input row of the QA dataframe.
    :param client: The OpenAI async client.
    :param model_name: The model name; it has to support structured output,
        i.e. "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18".
    :param lang: The language code of the prompt. Default is "en".
    :return: The output row of the QA dataframe with "generation_gt" added.
    """
    return await make_gen_gt_openai(
        row, client, GEN_GT_SYSTEM_PROMPT["basic"][lang], model_name
    )
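
A similar sketch for the OpenAI path (row contents are illustrative; assumes OPENAI_API_KEY is set in the environment):

import asyncio
from openai import AsyncClient

async def main():
    client = AsyncClient()
    row = {
        "query": "In which year was the library released?",
        "retrieval_gt_contents": [["The library was first released in 2024."]],
    }
    # Structured Outputs guarantees the reply parses into the Response schema.
    row = await make_concise_gen_gt(row, client, lang="en")
    print(row["generation_gt"])  # e.g. ["2024"]

asyncio.run(main())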


@@ -0,0 +1,27 @@
GEN_GT_SYSTEM_PROMPT = {
    "concise": {
        "en": """You are an AI assistant that answers the given question from the provided evidence text.
You can find the evidence about the question in the given text, and you have to write a proper answer to the given question.
Your answer has to be concise and relevant to the question.
Do not write a verbose answer; make it super clear.
It does not have to be a full sentence. The answer can be a word or a paraphrase.""",
        "ko": """당신은 주어진 질문에 대해 제공된 Text 내에서 답을 찾는 AI 비서입니다.
질문에 대한 답을 Text에서 찾아 적절한 답변을 작성하세요.
답변은 간결하고 질문에 관련된 내용만 포함해야 합니다.
불필요하게 길게 답변하지 말고, 명확하게 작성하세요.
완전한 문장이 아니어도 되며, 답은 단어나 요약일 수 있습니다.""",
        "ja": """あなたは与えられた質問に対して提供されたText内で答えを探すAI秘書です。
質問に対する答えをTextで探して適切な答えを作成しましょう。
回答は簡潔で、質問に関連する内容のみを含める必要があります。
不必要に長く答えず、明確に作成しましょう。
完全な文章でなくてもいいし、答えは単語や要約かもしれません。""",
    },
    "basic": {
        "en": """You are an AI assistant that answers the given question from the provided evidence text.
You can find the evidence about the question in the given text, and you have to write a proper answer to the given question.""",
        "ko": "당신은 주어진 질문에 대한 답을 제공된 Text 내에서 찾는 AI 비서입니다. 질문과 관련된 증거를 Text에서 찾아 적절한 답변을 작성하세요.",
        "ja": "あなたは与えられた質問に対する答えを提供されたText内で探すAI秘書です。 質問に関する証拠をTextで探して適切な回答を作成しましょう。",
    },
}
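
One note on the prompt table, as used by the functions above: prompts are looked up by style and then language code, so an unsupported lang simply raises a KeyError.

GEN_GT_SYSTEM_PROMPT["concise"]["ko"]  # Korean concise prompt
GEN_GT_SYSTEM_PROMPT["basic"]["fr"]    # raises KeyError: 'fr'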