#!/bin/bash
#
# Builds an AutoRAG dataset for one project in three stages:
#   1. parse the raw PDFs, 2. chunk the parsed result, 3. generate QA pairs.
#
# Usage: run from the directory that the relative default PROJECT_DIR is
# meant to be resolved against, or export PROJECT_DIR to point elsewhere.
#
# Environment:
#   PROJECT_DIR  project root (default: ../projects/example_01)

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so a broken stage never feeds the next one.
set -euo pipefail

PROJECT_DIR="${PROJECT_DIR:-../projects/example_01}"
CONFIG_DIR="$PROJECT_DIR/config"
RAW_DATA_DIR="$PROJECT_DIR/raw_data"

# TIMESTAMP=$(date +"%Y%m%d_%H%M")
PARSE_DIR="$PROJECT_DIR/parse"
CHUNK_DIR="$PROJECT_DIR/chunk"

# Output directories for the parse and chunk stages; -p makes reruns harmless.
mkdir -p -- "$PARSE_DIR" "$CHUNK_DIR"

# ----------------------------------------------------------------------- #
# Stage 1: parse every PDF matching $RAW_DATA_DIR/*.pdf with AutoRAG's
# Parser, configured by $CONFIG_DIR/parse.yaml, using $PARSE_DIR as the
# parser's project directory.
#
# The paths are handed to Python as command-line arguments rather than being
# expanded into the Python source text, so a quote or backslash in a path
# cannot break (or inject into) the embedded program.
echo "1️⃣ Parsing PDF 문서 시작..."
if ! python3 -c '
import sys

from autorag.parser import Parser

raw_data_dir, parse_dir, config_dir = sys.argv[1:4]

parser = Parser(data_path_glob=f"{raw_data_dir}/*.pdf", project_dir=parse_dir)
parser.start_parsing(f"{config_dir}/parse.yaml")
' "$RAW_DATA_DIR" "$PARSE_DIR" "$CONFIG_DIR"; then
  # Stop here: later stages read this stage's output.
  echo "❌ Parse 데이터 생성 실패" >&2
  exit 1
fi
echo "✅ Parse 데이터 생성 완료"

# ----------------------------------------------------------------------- #
# Stage 2: chunk the parsed documents. Reads
# $PARSE_DIR/parsed_result.parquet, applies $CONFIG_DIR/chunk.yaml, and uses
# $CHUNK_DIR as the chunker's project directory.
#
# The paths are handed to Python as command-line arguments rather than being
# expanded into the Python source text, so a quote or backslash in a path
# cannot break (or inject into) the embedded program.
echo "2️⃣ Chunking 데이터 생성 시작..."
if ! python3 -c '
import sys

from autorag.chunker import Chunker

parse_dir, chunk_dir, config_dir = sys.argv[1:4]

chunker = Chunker.from_parquet(
    parsed_data_path=f"{parse_dir}/parsed_result.parquet",
    project_dir=chunk_dir,
)
chunker.start_chunking(f"{config_dir}/chunk.yaml")
' "$PARSE_DIR" "$CHUNK_DIR" "$CONFIG_DIR"; then
  # Stop here: the QA stage reads this stage's output.
  echo "❌ Chunk 데이터 생성 실패" >&2
  exit 1
fi
echo "✅ Chunk 데이터 생성 완료"

# ----------------------------------------------------------------------- #
# Stage 3: generate a QA dataset from the chunked corpus.
#
# Reads  $PARSE_DIR/parsed_result.parquet and $CHUNK_DIR/0.parquet.
# Writes $PROJECT_DIR/qa.parquet and $PROJECT_DIR/corpus.parquet.
#
# Environment:
#   QA_SIZE         number of passages to sample (default: 20)
#   OPENAI_API_KEY  read by the OpenAI client; may also come from a .env
#                   file, which load_dotenv() loads before the client is built
QA_SIZE="${QA_SIZE:-20}"
if ! [[ "$QA_SIZE" =~ ^[1-9][0-9]*$ ]]; then
  echo "QA_SIZE must be a positive integer, got: $QA_SIZE" >&2
  exit 1
fi

echo "3️⃣ QA 데이터 생성 시작..."
# Paths and the sample size are passed as command-line arguments instead of
# being expanded into the Python source text, so unusual characters in a path
# cannot break (or inject into) the embedded program.
if ! python3 -c '
import os
import sys

import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw, Corpus
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI

project_dir, parse_dir, chunk_dir = sys.argv[1:4]
qa_size = int(sys.argv[4])

load_dotenv()
# Report only whether the key is present; the secret itself must never be
# written to the console or to captured logs.
print("API Key:", "set" if os.getenv("OPENAI_API_KEY") else "NOT SET")

llm = OpenAI(model="gpt-4o-mini")

initial_raw = Raw(
    pd.read_parquet(f"{parse_dir}/parsed_result.parquet", engine="pyarrow")
)
initial_corpus = Corpus(
    pd.read_parquet(f"{chunk_dir}/0.parquet", engine="pyarrow"),
    initial_raw,
)

# Sample passages, then generate a query plus basic and concise answers for
# each, and finally apply the rule-based dontknow filter.
qa = (
    initial_corpus
    .sample(random_single_hop, n=qa_size)
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    .batch_apply(factoid_query_gen, llm=llm, lang="ko")
    .batch_apply(make_basic_gen_gt, llm=llm, lang="ko")
    .batch_apply(make_concise_gen_gt, llm=llm, lang="ko")
    .filter(dontknow_filter_rule_based, lang="ko")
)

qa.to_parquet(f"{project_dir}/qa.parquet", f"{project_dir}/corpus.parquet")
' "$PROJECT_DIR" "$PARSE_DIR" "$CHUNK_DIR" "$QA_SIZE"; then
  echo "❌ QA 데이터 생성 실패" >&2
  exit 1
fi
echo "✅ QA 데이터 생성 완료"