Fix Dockerfile build issue
This commit is contained in:
67
making.sh
Normal file
67
making.sh
Normal file
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
#
# Build the AutoRAG data artifacts for projects/example_01 in three stages:
#   1) parse raw PDFs, 2) chunk the parsed output, 3) generate QA pairs.
# Requires: python3 with autorag installed, and OPENAI_API_KEY in the
# environment or in a .env file readable by python-dotenv (stage 3).

# Fail fast: abort on the first failed command, unset variable, or failed
# pipeline stage so a broken parse step cannot feed stale data into chunking.
set -euo pipefail

# Project layout, relative to this script's working directory.
readonly PROJECT_DIR="../projects/example_01"
readonly CONFIG_DIR="$PROJECT_DIR/config"
readonly RAW_DATA_DIR="$PROJECT_DIR/raw_data"

# TIMESTAMP=$(date +"%Y%m%d_%H%M")

# Per-stage output directories.
readonly PARSE_DIR="$PROJECT_DIR/parse"
readonly CHUNK_DIR="$PROJECT_DIR/chunk"

mkdir -p "$PARSE_DIR" "$CHUNK_DIR"
||||
# ----------------------------------------------------------------------- #
# Stage 1: parse every PDF under $RAW_DATA_DIR into $PARSE_DIR using the
# rules in config/parse.yaml.  NOTE: shell variables are interpolated into
# the inline Python source, so the configured paths must not contain quotes.
echo "1️⃣ Parsing PDF 문서 시작..."
python3 -c "
from autorag.parser import Parser

parser = Parser(data_path_glob='$RAW_DATA_DIR/*.pdf', project_dir='$PARSE_DIR')
parser.start_parsing('$CONFIG_DIR/parse.yaml')
" || { echo "ERROR: PDF parsing stage failed" >&2; exit 1; }
echo "✅ Parse 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Stage 2: chunk the parsed parquet produced by stage 1 into $CHUNK_DIR
# using config/chunk.yaml.  Depends on $PARSE_DIR/parsed_result.parquet.
echo "2️⃣ Chunking 데이터 생성 시작..."
python3 -c "
from autorag.chunker import Chunker

chunker = Chunker.from_parquet(parsed_data_path='$PARSE_DIR/parsed_result.parquet', project_dir='$CHUNK_DIR')
chunker.start_chunking('$CONFIG_DIR/chunk.yaml')
" || { echo "ERROR: chunking stage failed" >&2; exit 1; }
echo "✅ Chunk 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Stage 3: sample QA_SIZE chunks from the stage-2 corpus and generate
# Korean factoid QA pairs with gpt-4o-mini, writing qa.parquet and
# corpus.parquet at the project root.  Requires OPENAI_API_KEY (env/.env).
readonly QA_SIZE=20

echo "3️⃣ QA 데이터 생성 시작..."
python3 -c "
import os

import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw, Corpus
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI

load_dotenv()
# Security: never echo the key itself (the original printed it to stdout,
# leaking it into logs).  Only verify that it is present before spending
# any LLM calls.
if not os.getenv('OPENAI_API_KEY'):
    raise SystemExit('OPENAI_API_KEY is not set')

llm = OpenAI(model='gpt-4o-mini')

initial_raw = Raw(pd.read_parquet('$PARSE_DIR/parsed_result.parquet', engine='pyarrow'))
initial_corpus = Corpus(pd.read_parquet('$CHUNK_DIR/0.parquet', engine='pyarrow'), initial_raw)

qa = (
    initial_corpus
    .sample(random_single_hop, n=$QA_SIZE)
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    .batch_apply(factoid_query_gen, llm=llm, lang='ko')
    .batch_apply(make_basic_gen_gt, llm=llm, lang='ko')
    .batch_apply(make_concise_gen_gt, llm=llm, lang='ko')
    .filter(dontknow_filter_rule_based, lang='ko')
)

qa.to_parquet('$PROJECT_DIR/qa.parquet', '$PROJECT_DIR/corpus.parquet')
" || { echo "ERROR: QA generation stage failed" >&2; exit 1; }
echo "✅ QA 데이터 생성 완료"
|
||||
Reference in New Issue
Block a user