Initial commit
@@ -0,0 +1,32 @@
modules:
  - module_type: llama_index_chunk
    chunk_method: [ Token, Sentence ]
    chunk_size: [ 1024, 512 ]
    chunk_overlap: 24
    add_file_name: en
  - module_type: llama_index_chunk
    chunk_method: [ SentenceWindow ]
    window_size: 3
    add_file_name: en
  - module_type: llama_index_chunk
    chunk_method: [ Semantic_llama_index ]
    embed_model: openai
    buffer_size: 1
    breakpoint_percentile_threshold: 95
    add_file_name: en
  - module_type: llama_index_chunk
    chunk_method: [ SemanticDoubleMerging ]
    add_file_name: en
  - module_type: llama_index_chunk
    chunk_method: [ SimpleFile ]
    add_file_name: en
  - module_type: langchain_chunk
    chunk_method: sentencetransformerstoken
  - module_type: langchain_chunk
    chunk_method: recursivecharacter
    separators: [ " ", "\n" ]
  - module_type: langchain_chunk
    chunk_method: character
    separator: ". "
  - module_type: langchain_chunk
    chunk_method: Konlpy
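A note on usage, not part of the diff: a chunk config like the one above is consumed by AutoRAG's chunking entry point. A minimal sketch, assuming a recent AutoRAG release where Chunker exposes from_parquet() and start_chunking(), with hypothetical file paths:

from autorag.chunker import Chunker

# Load the parsed corpus produced by the parse step (hypothetical path),
# then run every chunk module listed in the YAML above.
chunker = Chunker.from_parquet(
    parsed_data_path="parsed_result.parquet",
    project_dir="./chunk_project_dir",
)
chunker.start_chunking("chunk_en.yaml")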
autorag-workspace/example/sample_config/chunk/chunk_ko.yaml  (Normal file, +19)
@@ -0,0 +1,19 @@
modules:
  - module_type: llama_index_chunk
    chunk_method: [ Token, Sentence ]
    chunk_size: [ 1024, 512 ]
    chunk_overlap: 24
    add_file_name: ko
  - module_type: llama_index_chunk
    chunk_method: [ SentenceWindow ]
    sentence_splitter: kiwi
    add_file_name: ko
  - module_type: llama_index_chunk
    chunk_method: [ Semantic_llama_index ]
    embed_model: openai
    add_file_name: ko
  - module_type: llama_index_chunk
    chunk_method: [ SimpleFile ]
    add_file_name: ko
  - module_type: langchain_chunk
    chunk_method: KonlpyTextSplitter
@@ -0,0 +1,3 @@
modules:
  - module_type: llama_index_chunk
    chunk_method: Token
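For orientation: the Token method in these configs slides a fixed-size window over the token stream, stepping by chunk_size minus chunk_overlap. A minimal sketch of that windowing, independent of any library:

def token_chunks(tokens, chunk_size=1024, chunk_overlap=24):
    # Each chunk shares its last `chunk_overlap` tokens with the next one.
    step = chunk_size - chunk_overlap
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), step)]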
@@ -0,0 +1,25 @@
# You can use only one of the following modules at a time.
modules:
  # Use Directory Parse
  - module_type: langchain_parse
    file_type: all_files
    parse_method: directory
  # Use Unstructured
  - module_type: langchain_parse
    file_type: all_files
    parse_method: unstructured
  # Use Upstage Document Parse
  - module_type: langchain_parse
    file_type: all_files
    parse_method: upstagedocumentparse
  # Use Naver Clova OCR
  - module_type: clova
    file_type: all_files
    table_detection: true
  # Use Llama Parse
  - module_type: llamaparse
    file_type: all_files
    result_type: markdown
    language: ko
    use_vendor_multimodal_model: true
    vendor_multimodal_model_name: openai-gpt-4o-mini
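A parse config like the one above is typically driven from Python. A minimal sketch, assuming AutoRAG's Parser API (start_parsing) and a hypothetical glob for the raw documents:

from autorag.parser import Parser

parser = Parser(data_path_glob="raw_docs/*", project_dir="./parse_project_dir")
parser.start_parsing("parse.yaml")  # runs the parse module listed above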
@@ -0,0 +1,26 @@
modules:
  # PDF
  - module_type: langchain_parse
    file_type: pdf
    parse_method: pdfminer
  # CSV
  - module_type: langchain_parse
    file_type: csv
    parse_method: csv
  # JSON
  - module_type: langchain_parse
    file_type: json
    parse_method: json
    jq_schema: .content
  # Markdown
  - module_type: langchain_parse
    file_type: md
    parse_method: unstructuredmarkdown
  # HTML
  - module_type: langchain_parse
    file_type: html
    parse_method: bshtml
  # XML
  - module_type: langchain_parse
    file_type: xml
    parse_method: unstructuredxml
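The jq_schema key above tells the JSON loader which field holds the text. Underneath, LangChain's JSONLoader applies the jq expression to every record; a sketch, assuming langchain_community (and the jq package it needs) is installed and a hypothetical docs/sample.json whose records carry a content field:

from langchain_community.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="docs/sample.json",
    jq_schema=".content",   # same expression as in the config above
    text_content=True,
)
docs = loader.load()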
@@ -0,0 +1,12 @@
modules:
  - module_type: table_hybrid_parse
    file_type: pdf
    text_parse_module: langchain_parse
    text_params:
      parse_method: pdfplumber
    table_parse_module: llamaparse
    table_params:
      result_type: markdown
      language: ko
      use_vendor_multimodal_model: true
      vendor_multimodal_model_name: openai-gpt-4o-mini
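table_hybrid_parse routes each page to one of two parsers: pages containing tables go to the table parser (llamaparse here), everything else to the text parser (pdfplumber). A sketch of that routing idea with hypothetical helpers; AutoRAG does the page splitting and merging internally:

def hybrid_parse(pages, has_table, parse_text, parse_table):
    # Route table pages to the table parser, the rest to the text parser,
    # keeping the original page order.
    return [parse_table(p) if has_table(p) else parse_text(p) for p in pages]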
autorag-workspace/example/sample_config/parse/parse_ko.yaml  (Normal file, +11)
@@ -0,0 +1,11 @@
modules:
  - module_type: llama_parse
    file_type: all_files
    result_type: markdown
    language: ko
  - module_type: clova
    file_type: all_files
    table_detection: true
  - module_type: langchain_parse
    file_type: all_files
    parse_method: upstagedocumentparse
@@ -0,0 +1,8 @@
modules:
  - module_type: llamaparse
    file_type: all_files
    result_type: markdown
    language: ko
    use_vendor_multimodal_model: true
    vendor_multimodal_model_name: openai-gpt-4o-mini
    use_own_key: true
autorag-workspace/example/sample_config/parse/parse_ocr.yaml  (Normal file, +10)
@@ -0,0 +1,10 @@
modules:
  - module_type: langchain_parse
    file_type: all_files
    parse_method: upstagedocumentparse
  - module_type: llama_parse
    file_type: all_files
    result_type: markdown
  - module_type: clova
    file_type: all_files
    table_detection: true
@@ -0,0 +1,4 @@
modules:
  - module_type: langchain_parse
    file_type: pdf
    parse_method: pdfminer
@@ -0,0 +1,50 @@
vectordb:
  - name: mpnet_base_chroma
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_all_mpnet_base_v2
    collection_name: huggingface_all_mpnet_base_v2
    path: ${PROJECT_DIR}/data/chroma
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 20
        modules:
          - module_type: bm25
          - module_type: vectordb
            vectordb: mpnet_base_chroma
          - module_type: hybrid_rrf
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
      - node_type: passage_reranker
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 3
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: upr
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [ bleu, meteor, rouge, sem_score ]
          generator_modules:
            - module_type: vllm
              llm: mistralai/Mistral-7B-Instruct-v0.2
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics: [ bleu, meteor, rouge, sem_score ]
        modules:
          - module_type: vllm
            llm: mistralai/Mistral-7B-Instruct-v0.2
            temperature: [ 0.1, 0.5, 1.1 ]
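How hybrid_cc fuses the two retrievers above, sketched: normalize each module's scores (mm is min-max, as below; tmm, z, and dbsf are alternative normalizers), then take a convex combination:

def min_max(scores):
    lo, hi = min(scores), max(scores)
    return [(s - lo) / (hi - lo) if hi > lo else 0.0 for s in scores]

def hybrid_cc(bm25_scores, vector_scores, weight=0.5):
    # weight is the share given to the dense (vectordb) scores.
    b, v = min_max(bm25_scores), min_max(vector_scores)
    return [weight * vs + (1 - weight) * bs for bs, vs in zip(b, v)]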
@@ -0,0 +1,101 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: rankgpt
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
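hybrid_rrf, by contrast, fuses rankings rather than scores: each document earns 1 / (k + rank) from every ranking it appears in, and weight_range: (4,80) above sweeps that k. A sketch over hypothetical document ids:

from collections import defaultdict

def rrf(rankings, k=60):
    scores = defaultdict(float)
    for ranking in rankings:                     # each: doc ids, best first
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

fused = rrf([["d3", "d1", "d2"], ["d1", "d3", "d4"]], k=4)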
@@ -0,0 +1,154 @@
vectordb:
  - name: chroma_bge_m3
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_bge_m3
    collection_name: openai
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: chroma_bge_m3
        modules:
          - module_type: pass_query_expansion
          - module_type: query_decompose
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
          - module_type: vectordb
            vectordb: chroma_bge_m3
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: rankgpt
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: recency_filter
            threshold_datetime: 2015-01-01 3:45:07
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}",
                     "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
          - module_type: long_context_reorder
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
          - module_type: window_replacement
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0, 1.5]
          - module_type: openai_llm
            llm: gpt-4o-mini
            temperature: 0.8
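Running a full pipeline config like the one above: AutoRAG's Evaluator takes the QA and corpus datasets and executes every node line, keeping the best module per node. A minimal sketch, assuming the Evaluator API (start_trial) and hypothetical parquet paths:

from autorag.evaluator import Evaluator

evaluator = Evaluator(
    qa_data_path="qa.parquet",
    corpus_data_path="corpus.parquet",
    project_dir="./project_dir",
)
evaluator.start_trial("full_config.yaml")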
@@ -0,0 +1,121 @@
vectordb:
  - name: chroma_bge_m3
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_bge_m3
    collection_name: openai
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: chroma_bge_m3
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: rankgpt
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
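The retrieval metrics these strategies optimize reduce to set overlap between retrieved and ground-truth passage ids. A sketch for a single query:

def retrieval_prf(retrieved_ids, relevant_ids):
    hit = len(set(retrieved_ids) & set(relevant_ids))
    precision = hit / len(retrieved_ids) if retrieved_ids else 0.0
    recall = hit / len(relevant_ids) if relevant_ids else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1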
@@ -0,0 +1,105 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: cohere_reranker
          - module_type: rankgpt
          - module_type: jina_reranker
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: voyageai_reranker
          - module_type: mixedbreadai_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
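The four cutoff filters above differ only in what they threshold. Sketched over (doc, score) pairs: threshold-style filters keep everything above a fixed score, percentile-style filters keep the top fraction of the retrieved list:

def threshold_cutoff(docs_scores, threshold=0.85):
    return [(d, s) for d, s in docs_scores if s >= threshold]

def percentile_cutoff(docs_scores, percentile=0.6):
    ranked = sorted(docs_scores, key=lambda ds: ds[1], reverse=True)
    keep = max(1, int(len(ranked) * percentile))
    return ranked[:keep]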
@@ -0,0 +1,151 @@
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: default
        modules:
          - module_type: pass_query_expansion
          - module_type: query_decompose
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: cohere_reranker
          - module_type: rankgpt
          - module_type: jina_reranker
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: voyageai_reranker
          - module_type: mixedbreadai_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: recency_filter
            threshold_datetime: 2015-01-01 3:45:07
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}",
                     "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
          - module_type: long_context_reorder
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
          - module_type: window_replacement
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0, 1.5]
          - module_type: openai_llm
            llm: gpt-4o-mini
            temperature: 0.8
@@ -0,0 +1,118 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: cohere_reranker
          - module_type: rankgpt
          - module_type: jina_reranker
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: voyageai_reranker
          - module_type: mixedbreadai_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
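The fstring prompt_maker does nothing more exotic than filling {query} and {retrieved_contents} into each template and letting the strategy pick the best-scoring variant. A sketch with hypothetical inputs:

template = ("Answer the given question using the following passage: "
            "{retrieved_contents} \n\n Question: {query} \n\n Answer:")
prompt = template.format(
    retrieved_contents="\n".join(["passage one ...", "passage two ..."]),
    query="What does the hybrid_cc module do?",
)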
@@ -0,0 +1,83 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
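prev_next_augmenter with mode: next, sketched: for each retrieved passage, also pull in the passage that follows it in the source document, so the generator sees the surrounding context. corpus_order is a hypothetical list of passage ids in document order, assumed to contain every retrieved id:

def augment_next(retrieved_ids, corpus_order):
    augmented = []
    for pid in retrieved_ids:
        augmented.append(pid)
        idx = corpus_order.index(pid)
        if idx + 1 < len(corpus_order):
            augmented.append(corpus_order[idx + 1])
    return list(dict.fromkeys(augmented))  # dedupe, keep order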
@@ -0,0 +1,129 @@
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: default
        modules:
          - module_type: pass_query_expansion
          - module_type: query_decompose
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
          - module_type: vectordb
            embedding_model: openai
            embedding_batch: 256
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: recency_filter
            threshold_datetime: 2015-01-01 3:45:07
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}",
                     "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
          - module_type: long_context_reorder
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
          - module_type: window_replacement
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0, 1.5]
          - module_type: openai_llm
            llm: gpt-4o-mini
            temperature: 0.8
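recency_filter, sketched: drop passages whose stored timestamp is older than threshold_datetime, assuming each passage carries a last-modified datetime in its metadata (the `last_modified` key below is hypothetical):

from datetime import datetime

def recency_filter(passages, threshold=datetime(2015, 1, 1, 3, 45, 7)):
    # Keep only passages at least as recent as the threshold.
    return [p for p in passages if p["last_modified"] >= threshold]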
@@ -0,0 +1,95 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
          - module_type: long_context_reorder
            prompt:
              - "Answer the given question using the following passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
              - "There are passages related to the user question. Please respond carefully to the following question. \n\n Passage: {retrieved_contents} \n\n Question: {query} \n\n Answer the question. Think step by step."  # Zero-shot CoT prompt
              - "{retrieved_contents} \n\n Read the passage carefully, and answer this question. \n\n Question: {query} \n\n Answer the question. Be concise."  # concise prompt
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - embedding_model: openai
              metric_name: sem_score
            - metric_name: bert_score
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,33 @@
vectordb:
  - name: mpnet_base_chroma
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_all_mpnet_base_v2
    collection_name: huggingface_all_mpnet_base_v2
    path: ${PROJECT_DIR}/data/chroma
node_lines:
  - node_line_name: retrieve_node_line
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 3
        modules:
          - module_type: vectordb
            vectordb: mpnet_base_chroma
  - node_line_name: post_retrieve_node_line
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [ meteor, rouge, bert_score ]
        modules:
          - module_type: fstring
            prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : "
      - node_type: generator
        strategy:
          metrics: [ bleu, rouge, bert_score ]
        modules:
          - module_type: llama_index_llm
            llm: bedrock
            model: amazon.titan-text-express-v1
            profile_name: your_profile_name  # Please replace this with your AWS profile name
@@ -0,0 +1,31 @@
vectordb:
  - name: baai_chroma
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_baai_bge_small
    collection_name: huggingface_baai_bge_small
    path: ${PROJECT_DIR}/data/chroma
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 3
        modules:
          - module_type: vectordb
            vectordb: baai_chroma
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [ meteor, rouge, bert_score ]
        modules:
          - module_type: fstring
            prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : "
      - node_type: generator
        strategy:
          metrics: [ bleu, rouge, bert_score ]
        modules:
          - module_type: vllm
            llm: mistralai/Mistral-7B-Instruct-v0.2
@@ -0,0 +1,34 @@
vectordb:
  - name: mpnet_base_chroma
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_all_mpnet_base_v2
    collection_name: huggingface_all_mpnet_base_v2
    path: ${PROJECT_DIR}/data/chroma
node_lines:
  - node_line_name: retrieve_node_line
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 3
        modules:
          - module_type: vectordb
            vectordb: mpnet_base_chroma
  - node_line_name: post_retrieve_node_line
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [ meteor, rouge, bert_score ]
        modules:
          - module_type: fstring
            prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : "
      - node_type: generator
        strategy:
          metrics: [ bleu, rouge, bert_score ]
        modules:
          - module_type: llama_index_llm
            llm: ollama
            model: llama3
            batch: 1
            request_timeout: 100  # Increase this value if your model is large (and therefore slow)
@@ -0,0 +1,25 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          top_k: 3
        modules:
          - module_type: vectordb
            vectordb: default
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [bleu, meteor, rouge]
        modules:
          - module_type: fstring
            prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : "
      - node_type: generator
        strategy:
          metrics: [bleu, rouge]
        modules:
          - module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
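Throughout these configs, a list value is a grid axis: AutoRAG evaluates one trial per combination of list entries. For a generator like the one above, a model list crossed with a (hypothetical) temperature list expands as sketched:

from itertools import product

models = ["gpt-4o-mini"]
temperatures = [0.5, 1.0]          # hypothetical sweep values
trials = list(product(models, temperatures))
# -> [('gpt-4o-mini', 0.5), ('gpt-4o-mini', 1.0)]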
@@ -0,0 +1,47 @@
vectordb:
  - name: default
    db_type: chroma
    client_type: persistent
    embedding_model: openai
    collection_name: openai
    path: ${PROJECT_DIR}/data/chroma
node_lines:
  - node_line_name: retrieve_node_line
    nodes:
      - node_type: retrieval
        modules:
          - module_type: vectordb
            vectordb: default
        top_k: 3
        strategy:
          metrics:
            - retrieval_f1
            - retrieval_recall
            - retrieval_precision
  - node_line_name: post_retrieve_node_line
    nodes:
      - node_type: prompt_maker
        modules:
          - module_type: fstring
            prompt: "Read the passages and answer the given question. \n Question: {query} \n Passage: {retrieved_contents} \n Answer : "
        strategy:
          generator_modules:
            - batch: 2
              llm: openai
              module_type: llama_index_llm
          metrics:
            - bleu
            - meteor
            - rouge
      - node_type: generator
        modules:
          - batch: 2
            llm: openai
            model: gpt-3.5-turbo-16k
            module_type: llama_index_llm
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - embedding_model: openai
              metric_name: sem_score
autorag-workspace/example/sample_config/rag/full.yaml  (Normal file, +159)
@@ -0,0 +1,159 @@
vectordb:
  - name: chroma_large
    db_type: chroma
    client_type: persistent
    embedding_model: openai_embed_3_large
    collection_name: openai_embed_3_large
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, ko_kiwi, space, gpt2, ko_okt, ko_kkma, sudachipy ]
            - module_type: vectordb
              vectordb: chroma_large
        modules:
          - module_type: pass_query_expansion
          - module_type: query_decompose
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-3.5-turbo-16k, gpt-3.5-turbo-1106 ]
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-3.5-turbo-16k ]
            max_token: 64
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
          - module_type: vectordb
            vectordb: chroma_large
            embedding_batch: 256
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 5
        modules:
          - module_type: pass_reranker
          - module_type: tart
          - module_type: monot5
          - module_type: upr
          - module_type: cohere_reranker
          - module_type: rankgpt
          - module_type: jina_reranker
          - module_type: colbert_reranker
          - module_type: sentence_transformer_reranker
          - module_type: flag_embedding_reranker
          - module_type: flag_embedding_llm_reranker
          - module_type: time_reranker
          - module_type: openvino_reranker
          - module_type: voyageai_reranker
          - module_type: mixedbreadai_reranker
          - module_type: flashrank_reranker
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: recency_filter
            threshold_datetime: 2015-01-01 3:45:07
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-3.5-turbo-16k
          - module_type: refine
            llm: openai
            model: gpt-3.5-turbo-16k
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-3.5-turbo-16k, gpt-3.5-turbo-1106]
        modules:
          - module_type: fstring
            prompt: ["Tell me something about the question: {query} \n\n {retrieved_contents}",
                     "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?"]
          - module_type: long_context_reorder
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
          - module_type: window_replacement
            prompt: [ "Tell me something about the question: {query} \n\n {retrieved_contents}",
                      "Question: {query} \n Something to read: {retrieved_contents} \n What's your answer?" ]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: g_eval
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-3.5-turbo-16k, gpt-3.5-turbo-1106]
            temperature: [0.5, 1.0, 1.5]
          - module_type: openai_llm
            llm: gpt-3.5-turbo
            temperature: 0.8
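For orientation: a config such as full.yaml is what you hand to AutoRAG when starting an optimization trial. A minimal sketch of the usual entry point, assuming `pip install AutoRAG` and qa/corpus parquet files produced by the sample_dataset scripts further below; all paths here are placeholders, not files from this commit:

```python
# Minimal sketch: run an AutoRAG trial against the full.yaml config.
from autorag.evaluator import Evaluator

evaluator = Evaluator(
    qa_data_path="./data/qa.parquet",          # placeholder path
    corpus_data_path="./data/corpus.parquet",  # placeholder path
    project_dir="./autorag_project",           # becomes ${PROJECT_DIR} in the YAML
)
evaluator.start_trial("./sample_config/rag/full.yaml")
```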
@@ -0,0 +1,93 @@
vectordb:
  - name: chroma_bge_m3
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_bge_m3
    collection_name: openai
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: chroma_bge_m3
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          # Korean prompt: "Answer the question using only the given passage."
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
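The `hybrid_cc` entries above fuse the bm25 and vectordb scores by convex combination; as I read the config, `normalize_method` picks the score-normalization scheme (`mm` is min-max) and the optimizer sweeps `test_weight_size` candidate weights across `weight_range`. An illustrative sketch of the idea, not AutoRAG's internal code:

```python
# Illustrative convex-combination (hybrid_cc) fusion with min-max ("mm")
# normalization. `lexical` and `semantic` map doc_id -> raw retriever score.
def hybrid_cc(lexical: dict, semantic: dict, weight: float) -> dict:
    def minmax(scores: dict) -> dict:
        lo, hi = min(scores.values()), max(scores.values())
        span = hi - lo
        return {d: (s - lo) / span if span else 0.0 for d, s in scores.items()}

    lex, sem = minmax(lexical), minmax(semantic)
    docs = set(lex) | set(sem)
    # a weight close to 1.0 favors the semantic (vectordb) ranking
    return {d: weight * sem.get(d, 0.0) + (1 - weight) * lex.get(d, 0.0) for d in docs}


# Example: one of the 101 weights the sweep over (0.0, 1.0) would try.
fused = hybrid_cc({"a": 12.0, "b": 7.5}, {"a": 0.81, "c": 0.77}, weight=0.5)
```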
@@ -0,0 +1,157 @@
vectordb:
  - name: chroma_bge_m3
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_bge_m3
    collection_name: openai
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: chroma_bge_m3
        modules:
          - module_type: pass_query_expansion
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
            # Korean prompt: "Please write a passage that answers the question."
            prompt: "질문에 답하기 위한 단락을 작성해 주세요."
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
            # Korean prompt: "You are an AI language model assistant. Your task is
            # to generate three new versions of the given user question to retrieve
            # relevant documents from a vector database. By generating multiple
            # perspectives on the question, you help the user overcome the limits
            # of distance-based similarity search. Provide the alternative
            # questions separated by newlines. Original question: {query}"
            prompt: |
              당신은 인공지능 언어 모델 어시스턴트입니다.
              주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다.
              주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다.
              다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오.
              원래 질문: {query}
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: chroma_bge_m3
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            # Korean prompt: "Several pieces of context information are given below.
            # Using this information rather than prior knowledge, answer the question."
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            # Korean prompt: "The original question and an existing answer are given;
            # refine the answer using the new context below only if it helps,
            # otherwise return the existing answer unchanged."
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
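`hybrid_rrf` is the other fusion option in these configs: reciprocal rank fusion scores each passage by the sum of `1 / (k + rank)` over the ranked lists it appears in, so only ranks matter, not raw scores. A sketch of the standard formulation follows; my assumption is that the `(4,80)` weight_range sweeps the rank constant `k`:

```python
# Illustrative reciprocal rank fusion (RRF). `rankings` holds doc-id lists
# ordered best-first, one list per retriever (e.g. bm25 and vectordb).
def hybrid_rrf(rankings: list[list[str]], k: int = 60) -> list[str]:
    scores: dict[str, float] = {}
    for ranked in rankings:
        for rank, doc_id in enumerate(ranked, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)


# Example: "b" wins because both retrievers rank it near the top.
print(hybrid_rrf([["a", "b", "c"], ["b", "c", "a"]], k=60))  # ['b', 'a', 'c']
```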
@@ -0,0 +1,126 @@
vectordb:
  - name: chroma_bge_m3
    db_type: chroma
    client_type: persistent
    embedding_model: huggingface_bge_m3
    collection_name: openai
    path: ${PROJECT_DIR}/resources/chroma
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: chroma_bge_m3
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
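The `tree_summarize` compressor above follows the llama_index tree-summarize pattern: chunks are summarized in groups against the query, then the summaries are summarized again until a single answer remains. A rough sketch of the idea, with `llm_complete` as a stand-in for whatever completion call you use (this is not AutoRAG's actual implementation):

```python
# Rough tree-summarize sketch: fold retrieved chunks into one answer by
# repeatedly summarizing groups of `fanout` chunks against the query.
def tree_summarize(chunks, query, llm_complete, fanout=4):
    while len(chunks) > 1:
        next_level = []
        for i in range(0, len(chunks), fanout):
            context = "\n".join(chunks[i:i + fanout])
            next_level.append(
                llm_complete(f"Context:\n{context}\nQuestion: {query}\nAnswer:")
            )
        chunks = next_level
    return chunks[0] if chunks else ""
```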
@@ -0,0 +1,87 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: cohere_reranker  # Set Environment Variable: COHERE_API_KEY
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
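The passage_filter choices above are simple score gates. `similarity_threshold_cutoff`, for instance, compares query and passage embeddings and keeps only passages whose similarity clears the threshold. An illustrative sketch assuming cosine similarity over the embeddings (numpy only, not AutoRAG's code):

```python
import numpy as np


# Keep passages whose cosine similarity to the query is >= threshold.
def similarity_threshold_cutoff(query_emb, passage_embs, passages, threshold=0.85):
    q = np.asarray(query_emb, dtype=float)
    q /= np.linalg.norm(q)
    kept = []
    for emb, passage in zip(passage_embs, passages):
        e = np.asarray(emb, dtype=float)
        if float(q @ (e / np.linalg.norm(e))) >= threshold:
            kept.append(passage)
    return kept
```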
@@ -0,0 +1,151 @@
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: default
        modules:
          - module_type: pass_query_expansion
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
            prompt: "질문에 답하기 위한 단락을 작성해 주세요."
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
            prompt: |
              당신은 인공지능 언어 모델 어시스턴트입니다.
              주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다.
              주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다.
              다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오.
              원래 질문: {query}
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: cohere_reranker  # Set Environment Variable: COHERE_API_KEY
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,120 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_reranker
        modules:
          - module_type: koreranker
          - module_type: flag_embedding_llm_reranker  # Requires enough GPU resources
          - module_type: cohere_reranker  # Set Environment Variable: COHERE_API_KEY
          - module_type: pass_reranker
        strategy:
          metrics: [ retrieval_recall, retrieval_precision, retrieval_map ]
          top_k: 3
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,78 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,142 @@
node_lines:
  - node_line_name: pre_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: query_expansion
        strategy:
          metrics: [retrieval_f1, retrieval_recall, retrieval_precision]
          speed_threshold: 10
          top_k: 10
          retrieval_modules:
            - module_type: bm25
              bm25_tokenizer: [ porter_stemmer, space, gpt2 ]
            - module_type: vectordb
              vectordb: default
        modules:
          - module_type: pass_query_expansion
          - module_type: hyde
            generator_module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
            max_token: 64
            prompt: "질문에 답하기 위한 단락을 작성해 주세요."
          - module_type: multi_query_expansion
            generator_module_type: llama_index_llm
            llm: openai
            temperature: [ 0.2, 1.0 ]
            prompt: |
              당신은 인공지능 언어 모델 어시스턴트입니다.
              주어진 사용자 질문을 이용해 세 가지 버전의 새 질문을 생성하여 벡터 데이터베이스에서 관련 문서를 검색하는 것이 과제입니다.
              주어진 질문에 대한 다양한 관점을 생성함으로써 사용자가 거리 기반 유사도 검색의 한계를 극복할 수 있도록 돕는 것이 목표입니다.
              다음과 같은 대체 질문을 줄 바꿈으로 구분하여 제공하십시오.
              원래 질문: {query}
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
            - metric_name: g_eval  # LLM Judge Metric. Default Model: gpt-4-turbo
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,111 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision,
                     retrieval_ndcg, retrieval_map, retrieval_mrr ]
          speed_threshold: 10
          top_k: 10
        modules:
          - module_type: bm25
            bm25_tokenizer: [ ko_kiwi, ko_okt, ko_kkma ]
          - module_type: vectordb
            vectordb: default
          - module_type: hybrid_rrf
            weight_range: (4,80)
          - module_type: hybrid_cc
            normalize_method: [ mm, tmm, z, dbsf ]
            weight_range: (0.0, 1.0)
            test_weight_size: 101
      - node_type: passage_augmenter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
          top_k: 5
          embedding_model: openai
        modules:
          - module_type: pass_passage_augmenter
          - module_type: prev_next_augmenter
            mode: next
      - node_type: passage_filter
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          speed_threshold: 5
        modules:
          - module_type: pass_passage_filter
          - module_type: similarity_threshold_cutoff
            threshold: 0.85
          - module_type: similarity_percentile_cutoff
            percentile: 0.6
          - module_type: threshold_cutoff
            threshold: 0.85
          - module_type: percentile_cutoff
            percentile: 0.6
      - node_type: passage_compressor
        strategy:
          metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
          speed_threshold: 10
        modules:
          - module_type: pass_compressor
          - module_type: tree_summarize
            llm: openai
            model: gpt-4o-mini
            prompt: |
              여러 문맥 정보는 다음과 같습니다.\n
              ---------------------\n
              {context_str}\n
              ---------------------\n
              사전 지식이 아닌 여러 정보가 주어졌습니다,
              질문에 대답하세요.\n
              질문: {query_str}\n
              답변:
          - module_type: refine
            llm: openai
            model: gpt-4o-mini
            prompt: |
              원래 질문은 다음과 같습니다: {query_str}
              기존 답변은 다음과 같습니다: {existing_answer}
              아래에서 기존 답변을 정제할 수 있는 기회가 있습니다.
              (필요한 경우에만) 아래에 몇 가지 맥락을 추가하여 기존 답변을 정제할 수 있습니다.
              ------------
              {context_msg}
              ------------
              새로운 문맥이 주어지면 기존 답변을 수정하여 질문에 대한 답변을 정제합니다.
              맥락이 쓸모 없다면, 기존 답변을 그대로 답변하세요.
              정제된 답변:
          - module_type: longllmlingua
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics:
            - metric_name: bleu
            - metric_name: meteor
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
          speed_threshold: 10
          generator_modules:
            - module_type: llama_index_llm
              llm: openai
              model: [gpt-4o-mini]
        modules:
          - module_type: fstring
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
          - module_type: long_context_reorder
            prompt: ["주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"]
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
          speed_threshold: 10
        modules:
          - module_type: llama_index_llm
            llm: [openai]
            model: [gpt-4o-mini]
            temperature: [0.5, 1.0]
@@ -0,0 +1,30 @@
node_lines:
  - node_line_name: retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: retrieval
        strategy:
          metrics: [ retrieval_f1, retrieval_recall, retrieval_precision ]
          top_k: 3
        modules:
          - module_type: vectordb
            vectordb: default
  - node_line_name: post_retrieve_node_line  # Arbitrary node line name
    nodes:
      - node_type: prompt_maker
        strategy:
          metrics: [ bleu, meteor, rouge ]
        modules:
          - module_type: fstring
            prompt: "주어진 passage만을 이용하여 question에 따라 답하시오 passage: {retrieved_contents} \n\n Question: {query} \n\n Answer:"
      - node_type: generator
        strategy:
          metrics:
            - metric_name: rouge
            - metric_name: sem_score
              embedding_model: openai
            - metric_name: bert_score
              lang: ko
        modules:
          - module_type: llama_index_llm
            llm: openai
            model: [ gpt-4o-mini ]
25
autorag-workspace/example/sample_dataset/README.md
Normal file
@@ -0,0 +1,25 @@
# sample_dataset handling

The sample_dataset folder does not include the `qa.parquet` and `corpus.parquet` files; they are too large to be committed to Git directly.

To prepare the datasets available in the sample_dataset folder (specifically `triviaqa`, `hotpotqa`, `msmarco`, and `eli5`), follow the methods outlined below.

## Usage

The example uses `triviaqa`, but the same approach applies to `msmarco`, `eli5`, and `hotpotqa`.

### 1. Run with a specified save path
To execute the Python script from the terminal and save the dataset to a specific path, run:

```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py --save_path /path/to/save/dataset
```
This runs the `load_triviaqa_dataset.py` script located in the `./sample_dataset/triviaqa/` directory, using the `--save_path` argument to choose where the dataset is saved.

### 2. Run without specifying a save path
If you run the script without the `--save_path` argument, the dataset is saved to the default location: the directory containing `load_triviaqa_dataset.py`, i.e. `./sample_dataset/triviaqa/`:
```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py
```
This allows straightforward execution without specifying a path, which is convenient for quick tests or when working directly inside the target directory.
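Once a loader has run, a quick sanity check of the saved files can look like the sketch below. It assumes pandas is installed, the default save path, and the `qa_train.parquet` naming used by the loader scripts that follow:

```python
# Sketch: verify the downloaded sample dataset; paths assume the default
# save location of load_triviaqa_dataset.py.
import pandas as pd

corpus = pd.read_parquet("./sample_dataset/triviaqa/corpus.parquet")
qa_train = pd.read_parquet("./sample_dataset/triviaqa/qa_train.parquet")
print(corpus.shape, qa_train.shape)
print(qa_train.columns.tolist())
```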
@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample eli5 dataset.",
)
def load_eli5_dataset(save_path):
    # Hugging Face repository holding the eli5 sample
    file_path = "MarkrAI/eli5_sample_autorag"

    # load corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite files saved by a previous run
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")) or os.path.exists(
        os.path.join(save_path, "qa_test.parquet")
    ):
        raise ValueError("qa parquet files already exist")
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_eli5_dataset()
@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample hotpotqa dataset.",
)
def load_hotpotqa_dataset(save_path):
    # Hugging Face repository holding the hotpotqa sample
    file_path = "gnekt/hotpotqa_small_sample_autorag"

    # load corpus and the QA validation split as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_validation_dataset = load_dataset(file_path, "qa")["validation"].to_pandas()

    # refuse to overwrite files saved by a previous run
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_validation.parquet")):
        raise ValueError("qa_validation.parquet already exists")
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_validation_dataset.to_parquet(
        os.path.join(save_path, "qa_validation.parquet"), index=False
    )


if __name__ == "__main__":
    load_hotpotqa_dataset()
@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample msmarco dataset.",
)
def load_msmarco_dataset(save_path):
    # Hugging Face repository holding the msmarco sample
    file_path = "MarkrAI/msmarco_sample_autorag"

    # load corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite files saved by a previous run
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")) or os.path.exists(
        os.path.join(save_path, "qa_test.parquet")
    ):
        raise ValueError("qa parquet files already exist")
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(
        os.path.join(save_path, "qa_train.parquet"), index=False
    )
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_msmarco_dataset()
@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample triviaqa dataset.",
)
def load_triviaqa_dataset(save_path):
    # Hugging Face repository holding the triviaqa sample
    file_path = "MarkrAI/triviaqa_sample_autorag"

    # load corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite files saved by a previous run
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")) or os.path.exists(
        os.path.join(save_path, "qa_test.parquet")
    ):
        raise ValueError("qa parquet files already exist")
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(
        os.path.join(save_path, "qa_train.parquet"), index=False
    )
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_triviaqa_dataset()