Initial commit
autorag-workspace/example/sample_dataset/README.md (new file, 25 lines)
@@ -0,0 +1,25 @@
# sample_dataset handling

The sample_dataset folder does not include the `qa.parquet` and `corpus.parquet` files, because they are too large to be uploaded directly to Git.

To prepare and use the datasets available in the sample_dataset folder, specifically `triviaqa`, `hotpotqa`, `msmarco`, and `eli5`, follow the methods outlined below.

## Usage

The example provided uses `triviaqa`, but the same approach applies to `msmarco`, `eli5`, and `hotpotqa`.

### 1. Run with a specified save path

To execute the Python script from the terminal and save the dataset to a specified path, use the command:

```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py --save_path /path/to/save/dataset
```

This runs the `load_triviaqa_dataset.py` script located in the `./sample_dataset/triviaqa/` directory, using the `--save_path` argument to specify the dataset's save location.
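
If you want to sanity-check what was downloaded, the saved files can be opened with pandas. The snippet below is a minimal sketch, not part of the loader scripts: it assumes pandas (with a parquet engine such as pyarrow) is installed and reuses the placeholder `/path/to/save/dataset` location from the command above. Note that the hotpotqa loader writes `qa_validation.parquet` instead of train/test splits.

```python
import os

import pandas as pd

# Hypothetical save location; replace with the path you passed to --save_path.
save_path = "/path/to/save/dataset"

# The triviaqa loader writes a corpus file plus train/test QA splits.
for name in ("corpus.parquet", "qa_train.parquet", "qa_test.parquet"):
    file_path = os.path.join(save_path, name)
    if os.path.exists(file_path):
        df = pd.read_parquet(file_path)
        print(f"{name}: {len(df)} rows, columns: {list(df.columns)}")
    else:
        print(f"{name} is missing from {save_path}")
```
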
### 2. Run without specifying a save path

If you run the script without the `--save_path` argument, the dataset is saved to a default location: the directory containing the `load_triviaqa_dataset.py` file, i.e. `./sample_dataset/triviaqa/`:

```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py
```

This behavior allows a straightforward execution without needing to specify a path, which is convenient for quick tests or when working directly within the target directory.
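
For reference, this default comes from the `--save_path` option defined in each loader script, which falls back to the directory containing the script itself. The sketch below illustrates the same pattern in isolation; `show_save_path` is a hypothetical command used only for this example, not one of the loaders (the full scripts follow in this commit).

```python
import pathlib

import click


@click.command()
@click.option(
    "--save_path",
    type=str,
    # Fall back to the directory that contains this script when no path is given.
    default=pathlib.PurePath(__file__).parent,
    help="Path to save the dataset.",
)
def show_save_path(save_path):
    # Hypothetical helper just to show where files would be written.
    click.echo(f"Dataset files will be written to: {save_path}")


if __name__ == "__main__":
    show_save_path()
```
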
@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    # Default to the directory containing this script.
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample eli5 dataset.",
)
def load_eli5_dataset(save_path):
    # Hugging Face dataset repository that hosts the eli5 sample.
    file_path = "MarkrAI/eli5_sample_autorag"

    # Load the corpus and QA splits as pandas DataFrames.
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # Refuse to overwrite files that already exist at the save location.
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # Save the data as parquet files.
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(
        os.path.join(save_path, "qa_train.parquet"), index=False
    )
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_eli5_dataset()

@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    # Default to the directory containing this script.
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample hotpotqa dataset.",
)
def load_hotpotqa_dataset(save_path):
    # Hugging Face dataset repository that hosts the hotpotqa sample.
    file_path = "gnekt/hotpotqa_small_sample_autorag"

    # Load the corpus and QA splits as pandas DataFrames.
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_validation_dataset = load_dataset(file_path, "qa")["validation"].to_pandas()

    # Refuse to overwrite files that already exist at the save location.
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_validation.parquet")):
        raise ValueError("qa_validation.parquet already exists")

    # Save the data as parquet files.
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_validation_dataset.to_parquet(
        os.path.join(save_path, "qa_validation.parquet"), index=False
    )


if __name__ == "__main__":
    load_hotpotqa_dataset()

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    # Default to the directory containing this script.
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample msmarco dataset.",
)
def load_msmarco_dataset(save_path):
    # Hugging Face dataset repository that hosts the msmarco sample.
    file_path = "MarkrAI/msmarco_sample_autorag"

    # Load the corpus and QA splits as pandas DataFrames.
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # Refuse to overwrite files that already exist at the save location.
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # Save the data as parquet files.
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(
        os.path.join(save_path, "qa_train.parquet"), index=False
    )
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_msmarco_dataset()

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    # Default to the directory containing this script.
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample triviaqa dataset.",
)
def load_triviaqa_dataset(save_path):
    # Hugging Face dataset repository that hosts the triviaqa sample.
    file_path = "MarkrAI/triviaqa_sample_autorag"

    # Load the corpus and QA splits as pandas DataFrames.
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # Refuse to overwrite files that already exist at the save location.
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # Save the data as parquet files.
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(
        os.path.join(save_path, "qa_train.parquet"), index=False
    )
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_triviaqa_dataset()