Fix Dockerfile build issue

This commit is contained in:
kyy
2025-03-18 16:41:12 +09:00
parent 6814230bfb
commit 9323aa254a
228 changed files with 467 additions and 3488 deletions

67
making.sh Normal file
View File

@@ -0,0 +1,67 @@
#!/bin/bash
# making.sh - build the evaluation data set for one project in three steps:
#   1) parse the raw PDF files, 2) chunk the parsed result, 3) generate QA data.
#
# Usage: ./making.sh
#   PROJECT_DIR may be exported to target another project. The default is a
#   relative path, so it is resolved against the current working directory.

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so a later step never runs on the output of a step that did not succeed.
set -euo pipefail

PROJECT_DIR="${PROJECT_DIR:-../projects/example_01}"
CONFIG_DIR="$PROJECT_DIR/config"       # parse.yaml / chunk.yaml are read from here
RAW_DATA_DIR="$PROJECT_DIR/raw_data"   # input PDF files
# TIMESTAMP=$(date +"%Y%m%d_%H%M")     # disabled; not used anywhere in this script
PARSE_DIR="$PROJECT_DIR/parse"         # output directory of the parse step
CHUNK_DIR="$PROJECT_DIR/chunk"         # output directory of the chunk step

# Create both output directories up front (no error if they already exist).
mkdir -p -- "$PARSE_DIR" "$CHUNK_DIR"
# ----------------------------------------------------------------------- #
# Step 1: run the AutoRAG Parser over every *.pdf in $RAW_DATA_DIR, using
# $CONFIG_DIR/parse.yaml, with $PARSE_DIR as the parser project directory.
echo "1⃣ Parsing PDF 문서 시작..."
# The Python source is single-quoted so the shell never rewrites it; the paths
# travel as argv, which keeps quotes or backslashes in a path from corrupting
# the Python code.
if ! python3 -c '
import sys

from autorag.parser import Parser

raw_data_dir, parse_dir, config_dir = sys.argv[1:4]
parser = Parser(data_path_glob=raw_data_dir + "/*.pdf", project_dir=parse_dir)
parser.start_parsing(config_dir + "/parse.yaml")
' "$RAW_DATA_DIR" "$PARSE_DIR" "$CONFIG_DIR"; then
  # Do not fall through to the success message when parsing failed.
  echo "ERROR: parse step failed (python3 exited non-zero)" >&2
  exit 1
fi
echo "✅ Parse 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Step 2: build an AutoRAG Chunker from $PARSE_DIR/parsed_result.parquet and
# run it with $CONFIG_DIR/chunk.yaml, with $CHUNK_DIR as its project directory.
echo "2⃣ Chunking 데이터 생성 시작..."
# The Python source is single-quoted so the shell never rewrites it; the paths
# travel as argv, which keeps quotes or backslashes in a path from corrupting
# the Python code.
if ! python3 -c '
import sys

from autorag.chunker import Chunker

parse_dir, chunk_dir, config_dir = sys.argv[1:4]
chunker = Chunker.from_parquet(
    parsed_data_path=parse_dir + "/parsed_result.parquet",
    project_dir=chunk_dir,
)
chunker.start_chunking(config_dir + "/chunk.yaml")
' "$PARSE_DIR" "$CHUNK_DIR" "$CONFIG_DIR"; then
  # Do not fall through to the success message when chunking failed.
  echo "ERROR: chunk step failed (python3 exited non-zero)" >&2
  exit 1
fi
echo "✅ Chunk 데이터 생성 완료"
# ----------------------------------------------------------------------- #
# Step 3: generate QA data from the parse and chunk outputs and write
# $PROJECT_DIR/qa.parquet and $PROJECT_DIR/corpus.parquet.
# QA_SIZE is the sample count passed to random_single_hop; it can be
# overridden from the environment and defaults to 20.
QA_SIZE="${QA_SIZE:-20}"
echo "3⃣ QA 데이터 생성 시작..."
# The Python source is single-quoted so the shell never rewrites it; paths and
# the sample size travel as argv instead of being spliced into the code.
if ! python3 -c '
import os
import sys

import pandas as pd
from autorag.data.qa.filter.dontknow import dontknow_filter_rule_based
from autorag.data.qa.generation_gt.llama_index_gen_gt import (
    make_basic_gen_gt,
    make_concise_gen_gt,
)
from autorag.data.qa.query.llama_gen_query import factoid_query_gen
from autorag.data.qa.sample import random_single_hop
from autorag.data.qa.schema import Raw, Corpus
from dotenv import load_dotenv
from llama_index.llms.openai import OpenAI

parse_dir, chunk_dir, project_dir = sys.argv[1:4]
qa_size = int(sys.argv[4])

# Load variables from a .env file (if any) before the key is looked up.
load_dotenv()
# Report only whether the key is present; the secret itself must never be
# written to stdout, where it would end up in terminal history and CI logs.
print("API Key:", "set" if os.getenv("OPENAI_API_KEY") else "NOT SET")

llm = OpenAI(model="gpt-4o-mini")
initial_raw = Raw(
    pd.read_parquet(parse_dir + "/parsed_result.parquet", engine="pyarrow")
)
# NOTE(review): only 0.parquet from the chunk directory is read - presumably
# the output of the first chunk module; confirm against chunk.yaml.
initial_corpus = Corpus(
    pd.read_parquet(chunk_dir + "/0.parquet", engine="pyarrow"),
    initial_raw,
)

# Pipeline: sample qa_size rows, reset the index, attach retrieval ground-truth
# contents, generate queries plus basic and concise answers (lang="ko"), then
# apply the rule-based dontknow filter.
qa = (
    initial_corpus
    .sample(random_single_hop, n=qa_size)
    .map(lambda df: df.reset_index(drop=True))
    .make_retrieval_gt_contents()
    .batch_apply(factoid_query_gen, llm=llm, lang="ko")
    .batch_apply(make_basic_gen_gt, llm=llm, lang="ko")
    .batch_apply(make_concise_gen_gt, llm=llm, lang="ko")
    .filter(dontknow_filter_rule_based, lang="ko")
)
qa.to_parquet(project_dir + "/qa.parquet", project_dir + "/corpus.parquet")
' "$PARSE_DIR" "$CHUNK_DIR" "$PROJECT_DIR" "$QA_SIZE"; then
  # Do not fall through to the success message when QA generation failed.
  echo "ERROR: QA generation step failed (python3 exited non-zero)" >&2
  exit 1
fi
echo "✅ QA 데이터 생성 완료"