#!/bin/bash
#
# Build an AutoRAG dataset for projects/example_01:
#   1. Parse raw PDFs            -> $PARSE_DIR
#   2. Chunk the parsed output   -> $CHUNK_DIR
#   3. Generate QA ground truth  -> $PROJECT_DIR/qa.parquet (next step below)
#
# Requires: python3 with autorag, pandas/pyarrow, llama-index and python-dotenv
# installed, and OPENAI_API_KEY available via the environment or a .env file.

# Abort on the first failed step, on unset variables, and on failed pipe
# stages — previously a failed parse step silently continued into chunking.
set -euo pipefail

readonly PROJECT_DIR="../projects/example_01"
readonly CONFIG_DIR="$PROJECT_DIR/config"
readonly RAW_DATA_DIR="$PROJECT_DIR/raw_data"
# TIMESTAMP=$(date +"%Y%m%d_%H%M")   # kept for reference; currently unused
readonly PARSE_DIR="$PROJECT_DIR/parse"
readonly CHUNK_DIR="$PROJECT_DIR/chunk"

mkdir -p -- "$PARSE_DIR" "$CHUNK_DIR"

# ----------------------------------------------------------------------- #
# Step 1: parse every PDF under RAW_DATA_DIR using the parse.yaml config.
echo "1️⃣ Parsing PDF 문서 시작..."
python3 -c "
from autorag.parser import Parser
parser = Parser(data_path_glob='$RAW_DATA_DIR/*.pdf', project_dir='$PARSE_DIR')
parser.start_parsing('$CONFIG_DIR/parse.yaml')
"
echo "✅ Parse 데이터 생성 완료"

# ----------------------------------------------------------------------- #
# Step 2: chunk the parsed parquet using the chunk.yaml config.
echo "2️⃣ Chunking 데이터 생성 시작..."
python3 -c "
from autorag.chunker import Chunker
chunker = Chunker.from_parquet(parsed_data_path='$PARSE_DIR/parsed_result.parquet', project_dir='$CHUNK_DIR')
chunker.start_chunking('$CONFIG_DIR/chunk.yaml')
"
echo "✅ Chunk 데이터 생성 완료"

# ----------------------------------------------------------------------- #
# Step 3: number of chunks to sample for QA generation.
readonly QA_SIZE=20
echo "3️⃣ QA 데이터 생성 시작..."
# Sample QA_SIZE chunks and generate Korean QA ground truth with gpt-4o-mini,
# writing qa.parquet + corpus.parquet into the project root. Shell variables
# (PARSE_DIR, CHUNK_DIR, QA_SIZE, PROJECT_DIR) are interpolated into the
# inline Python before it runs.
python3 -c "
import os

import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw, Corpus
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI

load_dotenv()
# Fail fast when the key is absent, but never echo the secret itself
# (the original printed OPENAI_API_KEY to stdout, leaking it into logs).
if not os.getenv('OPENAI_API_KEY'):
    raise SystemExit('OPENAI_API_KEY is not set; add it to the environment or .env')

llm = OpenAI(model='gpt-4o-mini')

# Raw = parsed documents; Corpus = chunked passages linked back to Raw.
initial_raw = Raw(pd.read_parquet('$PARSE_DIR/parsed_result.parquet', engine='pyarrow'))
initial_corpus = Corpus(pd.read_parquet('$CHUNK_DIR/0.parquet', engine='pyarrow'), initial_raw)

qa = (
    initial_corpus
    .sample(random_single_hop, n=$QA_SIZE)
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    .batch_apply(factoid_query_gen, llm=llm, lang='ko')    # generate queries
    .batch_apply(make_basic_gen_gt, llm=llm, lang='ko')    # basic answers
    .batch_apply(make_concise_gen_gt, llm=llm, lang='ko')  # concise answers
    .filter(dontknow_filter_rule_based, lang='ko')         # drop unanswerables
)
qa.to_parquet('$PROJECT_DIR/qa.parquet', '$PROJECT_DIR/corpus.parquet')
"
echo "✅ QA 데이터 생성 완료"