Files
autorag_evaluation/making.sh
2025-03-18 16:41:12 +09:00

67 lines
2.3 KiB
Bash
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/bash
# Build the AutoRAG artifacts for an example project in three stages:
# 1) parse PDFs, 2) chunk the parsed output, 3) generate a QA dataset.
# Requires: python3 with autorag installed; OPENAI_API_KEY in env or .env.
set -euo pipefail   # abort on first failed stage instead of running on bad inputs

readonly PROJECT_DIR="../projects/example_01"
readonly CONFIG_DIR="$PROJECT_DIR/config"
readonly RAW_DATA_DIR="$PROJECT_DIR/raw_data"
# TIMESTAMP=$(date +"%Y%m%d_%H%M")
readonly PARSE_DIR="$PROJECT_DIR/parse"
readonly CHUNK_DIR="$PROJECT_DIR/chunk"
mkdir -p "$PARSE_DIR" "$CHUNK_DIR"
# ----------------------------------------------------------------------- #
# Stage 1: parse every PDF under RAW_DATA_DIR into PARSE_DIR.
echo "1⃣ Parsing PDF 문서 시작..."
# Pass paths through the environment instead of interpolating shell variables
# into Python source: a path containing a quote would otherwise break (or
# inject) the generated code. The quoted 'PY' delimiter keeps the heredoc
# literal so Python, not the shell, reads the env vars.
RAW_DATA_DIR="$RAW_DATA_DIR" PARSE_DIR="$PARSE_DIR" CONFIG_DIR="$CONFIG_DIR" \
python3 - <<'PY'
import os
from autorag.parser import Parser

parser = Parser(
    data_path_glob=os.path.join(os.environ["RAW_DATA_DIR"], "*.pdf"),
    project_dir=os.environ["PARSE_DIR"],
)
parser.start_parsing(os.path.join(os.environ["CONFIG_DIR"], "parse.yaml"))
PY
echo "✅ Parse 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Stage 2: chunk the parsed parquet from stage 1 into CHUNK_DIR.
echo "2⃣ Chunking 데이터 생성 시작..."
# Paths go through the environment (not string interpolation into Python
# source) so paths containing quotes cannot break or inject code.
PARSE_DIR="$PARSE_DIR" CHUNK_DIR="$CHUNK_DIR" CONFIG_DIR="$CONFIG_DIR" \
python3 - <<'PY'
import os
from autorag.chunker import Chunker

chunker = Chunker.from_parquet(
    parsed_data_path=os.path.join(os.environ["PARSE_DIR"], "parsed_result.parquet"),
    project_dir=os.environ["CHUNK_DIR"],
)
chunker.start_chunking(os.path.join(os.environ["CONFIG_DIR"], "chunk.yaml"))
PY
echo "✅ Chunk 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Stage 3: sample QA_SIZE chunks and generate a Korean QA dataset with an
# OpenAI LLM, writing qa.parquet + corpus.parquet into PROJECT_DIR.
QA_SIZE=20
echo "3⃣ QA 데이터 생성 시작..."
# Paths and the sample size are passed via the environment; the quoted 'PY'
# heredoc keeps the Python source literal (no shell interpolation).
PROJECT_DIR="$PROJECT_DIR" PARSE_DIR="$PARSE_DIR" CHUNK_DIR="$CHUNK_DIR" QA_SIZE="$QA_SIZE" \
python3 - <<'PY'
import os

import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Corpus, Raw
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI

load_dotenv()
# Fail fast when the key is missing, and never print the secret itself:
# the previous version echoed OPENAI_API_KEY to stdout, leaking it into logs.
if not os.getenv("OPENAI_API_KEY"):
    raise SystemExit("OPENAI_API_KEY is not set (check your environment or .env)")

llm = OpenAI(model="gpt-4o-mini")

parse_dir = os.environ["PARSE_DIR"]
chunk_dir = os.environ["CHUNK_DIR"]
project_dir = os.environ["PROJECT_DIR"]

initial_raw = Raw(
    pd.read_parquet(os.path.join(parse_dir, "parsed_result.parquet"), engine="pyarrow")
)
initial_corpus = Corpus(
    pd.read_parquet(os.path.join(chunk_dir, "0.parquet"), engine="pyarrow"),
    initial_raw,
)

qa = (
    initial_corpus
    .sample(random_single_hop, n=int(os.environ["QA_SIZE"]))
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    .batch_apply(factoid_query_gen, llm=llm, lang='ko')
    .batch_apply(make_basic_gen_gt, llm=llm, lang='ko')
    .batch_apply(make_concise_gen_gt, llm=llm, lang='ko')
    .filter(dontknow_filter_rule_based, lang='ko')
)
qa.to_parquet(
    os.path.join(project_dir, "qa.parquet"),
    os.path.join(project_dir, "corpus.parquet"),
)
PY
echo "✅ QA 데이터 생성 완료"