Fix Dockerfile build issue

commit 9323aa254a (parent 6814230bfb)
Author: kyy
Date: 2025-03-18 16:41:12 +09:00

228 changed files with 467 additions and 3488 deletions


@@ -0,0 +1,16 @@
from typing import Dict


def add_gen_gt(row: Dict, new_gen_gt: str) -> Dict:
    # Append the new ground-truth answer to an existing list, or wrap an
    # existing string answer into a list together with the new one.
    if "generation_gt" in row:
        if isinstance(row["generation_gt"], list):
            row["generation_gt"].append(new_gen_gt)
        elif isinstance(row["generation_gt"], str):
            row["generation_gt"] = [row["generation_gt"], new_gen_gt]
        else:
            raise ValueError(
                "generation_gt should be either a string or a list of strings."
            )
        return row
    # No previous answer: start a fresh list.
    row["generation_gt"] = [new_gen_gt]
    return row
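
A quick usage sketch (illustrative, not part of the diff): add_gen_gt appends to an existing list, wraps an existing string into a list, and otherwise creates the list.

# Hypothetical example row; only the "generation_gt" handling matters here.
row = {"query": "What is AutoRAG?"}
row = add_gen_gt(row, "an AutoML tool for RAG")
assert row["generation_gt"] == ["an AutoML tool for RAG"]
row = add_gen_gt(row, "a RAG optimization framework")
assert row["generation_gt"] == [
    "an AutoML tool for RAG",
    "a RAG optimization framework",
]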


@@ -0,0 +1,41 @@
import itertools
from typing import Dict

from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.types import MessageRole, ChatMessage

from autorag.data.qa.generation_gt.base import add_gen_gt
from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT


async def make_gen_gt_llama_index(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict:
    # Flatten the nested retrieval ground-truth passages into a single list.
    retrieval_gt_contents = list(
        itertools.chain.from_iterable(row["retrieval_gt_contents"])
    )
    query = row["query"]
    passage_str = "\n".join(retrieval_gt_contents)
    user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"
    response = await llm.achat(
        messages=[
            ChatMessage(role=MessageRole.SYSTEM, content=system_prompt),
            ChatMessage(role=MessageRole.USER, content=user_prompt),
        ],
        temperature=0.0,
    )
    return add_gen_gt(row, response.message.content)


async def make_concise_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict:
    return await make_gen_gt_llama_index(
        row, llm, GEN_GT_SYSTEM_PROMPT["concise"][lang]
    )


async def make_basic_gen_gt(row: Dict, llm: BaseLLM, lang: str = "en") -> Dict:
    return await make_gen_gt_llama_index(row, llm, GEN_GT_SYSTEM_PROMPT["basic"][lang])


async def make_custom_gen_gt(row: Dict, llm: BaseLLM, system_prompt: str) -> Dict:
    return await make_gen_gt_llama_index(row, llm, system_prompt)
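
A minimal usage sketch, assuming an OpenAI-backed llama_index LLM (the model name and row contents here are illustrative, not from the commit):

import asyncio
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4o-mini")  # any llama_index BaseLLM subclass works
row = {
    "query": "Who wrote the passage?",
    "retrieval_gt_contents": [["The passage was written by Jane Doe."]],
}
row = asyncio.run(make_basic_gen_gt(row, llm))
print(row["generation_gt"])  # e.g. ["Jane Doe"]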


@@ -0,0 +1,84 @@
import itertools
from typing import Dict

from openai import AsyncClient
from pydantic import BaseModel

from autorag.data.qa.generation_gt.base import add_gen_gt
from autorag.data.qa.generation_gt.prompt import GEN_GT_SYSTEM_PROMPT


class Response(BaseModel):
    answer: str


async def make_gen_gt_openai(
    row: Dict,
    client: AsyncClient,
    system_prompt: str,
    model_name: str = "gpt-4o-2024-08-06",
):
    # Flatten the nested retrieval ground-truth passages into a single list.
    retrieval_gt_contents = list(
        itertools.chain.from_iterable(row["retrieval_gt_contents"])
    )
    query = row["query"]
    passage_str = "\n".join(retrieval_gt_contents)
    user_prompt = f"Text:\n<|text_start|>\n{passage_str}\n<|text_end|>\n\nQuestion:\n{query}\n\nAnswer:"
    # Structured Outputs: the completion is parsed into the Response schema.
    completion = await client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        response_format=Response,
    )
    response: Response = completion.choices[0].message.parsed
    return add_gen_gt(row, response.answer)


async def make_concise_gen_gt(
    row: Dict,
    client: AsyncClient,
    model_name: str = "gpt-4o-2024-08-06",
    lang: str = "en",
):
    """
    Generate a concise generation_gt using OpenAI Structured Outputs to prevent
    formatting errors. The answer is concise, generally a single word or phrase.

    :param row: The input row of the QA dataframe.
    :param client: The OpenAI async client.
    :param model_name: The model name; it has to support structured output,
        i.e. "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18".
    :param lang: The language code of the prompt. Default is "en".
    :return: The output row of the QA dataframe with "generation_gt" added.
    """
    return await make_gen_gt_openai(
        row, client, GEN_GT_SYSTEM_PROMPT["concise"][lang], model_name
    )


async def make_basic_gen_gt(
    row: Dict,
    client: AsyncClient,
    model_name: str = "gpt-4o-2024-08-06",
    lang: str = "en",
):
    """
    Generate a basic generation_gt using OpenAI Structured Outputs to prevent
    formatting errors. It uses a simple prompt and produces a plain answer.

    :param row: The input row of the QA dataframe.
    :param client: The OpenAI async client.
    :param model_name: The model name; it has to support structured output,
        i.e. "gpt-4o-2024-08-06" or "gpt-4o-mini-2024-07-18".
    :param lang: The language code of the prompt. Default is "en".
    :return: The output row of the QA dataframe with "generation_gt" added.
    """
    return await make_gen_gt_openai(
        row, client, GEN_GT_SYSTEM_PROMPT["basic"][lang], model_name
    )
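
A similar sketch for the OpenAI path (row contents are illustrative; assumes OPENAI_API_KEY is set in the environment):

import asyncio
from openai import AsyncClient

async def main():
    client = AsyncClient()
    row = {
        "query": "In which year was the library released?",
        "retrieval_gt_contents": [["The library was first released in 2024."]],
    }
    # Structured Outputs guarantees the reply parses into the Response schema.
    row = await make_concise_gen_gt(row, client, lang="en")
    print(row["generation_gt"])  # e.g. ["2024"]

asyncio.run(main())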


@@ -0,0 +1,27 @@
GEN_GT_SYSTEM_PROMPT = {
    "concise": {
        "en": """You are an AI assistant that answers the given question from the provided evidence text.
You can find the evidence about the question in the given text, and you have to write a proper answer to the given question.
Your answer has to be concise and relevant to the question.
Do not write a verbose answer; make it super clear.
It does not have to be a full sentence. The answer can be a word or a paraphrase.""",
        "ko": """당신은 주어진 질문에 대해 제공된 Text 내에서 답을 찾는 AI 비서입니다.
질문에 대한 답을 Text에서 찾아 적절한 답변을 작성하세요.
답변은 간결하고 질문에 관련된 내용만 포함해야 합니다.
불필요하게 길게 답변하지 말고, 명확하게 작성하세요.
완전한 문장이 아니어도 되며, 답은 단어나 요약일 수 있습니다.""",
        "ja": """あなたは与えられた質問に対して提供されたText内で答えを探すAI秘書です。
質問に対する答えをTextで探して適切な答えを作成しましょう。
回答は簡潔で、質問に関連する内容のみを含める必要があります。
不必要に長く答えず、明確に作成しましょう。
完全な文章でなくてもいいし、答えは単語や要約かもしれません。""",
    },
    "basic": {
        "en": """You are an AI assistant that answers the given question from the provided evidence text.
You can find the evidence about the question in the given text, and you have to write a proper answer to the given question.""",
        "ko": "당신은 주어진 질문에 대한 답을 제공된 Text 내에서 찾는 AI 비서입니다. 질문과 관련된 증거를 Text에서 찾아 적절한 답변을 작성하세요.",
        "ja": "あなたは与えられた質問に対する答えを提供されたText内で探すAI秘書です。 質問に関する証拠をTextで探して適切な回答を作成しましょう。",
    },
}
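
One note on the prompt table, as used by the functions above: prompts are looked up by style and then language code, so an unsupported lang simply raises a KeyError.

GEN_GT_SYSTEM_PROMPT["concise"]["ko"]  # Korean concise prompt
GEN_GT_SYSTEM_PROMPT["basic"]["fr"]    # raises KeyError: 'fr'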