Initial commit

kyy
2025-03-14 17:28:01 +09:00
commit ba9c1a4a5f
225 changed files with 22467 additions and 0 deletions

sample_dataset/README.md

@@ -0,0 +1,25 @@
# sample_dataset handling
The sample_dataset folder does not include the `qa.parquet` and `corpus.parquet` files, because they are too large to upload directly to Git.
To prepare and use the datasets available in the sample_dataset folder, specifically `triviaqa`, `hotpotqa`, `msmarco`, and `eli5`, follow the methods outlined below.
## Usage
The example provided uses `triviaqa`, but the same approach applies to `msmarco`, `eli5` and `hotpotqa`.
### 1. Run with a specified save path
To execute the Python script from the terminal and save the dataset to a specified path, use the command:
```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py --save_path /path/to/save/dataset
```
This runs the `load_triviaqa_dataset.py` script located in the `./sample_dataset/triviaqa/` directory,
using the `--save_path` argument to specify the dataset's save location.
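The same invocation can also be driven from Python, for example as a setup step in a notebook or test. This is a minimal sketch, not part of the scripts in this commit; it assumes you run it from the repository root, and `/tmp/triviaqa_sample` is only an illustrative target directory:
```python
import os
import subprocess
import sys

# Illustrative target directory (hypothetical); any writable path works.
save_path = "/tmp/triviaqa_sample"
os.makedirs(save_path, exist_ok=True)

# Run the loader script exactly as you would from the terminal.
subprocess.run(
    [
        sys.executable,
        "./sample_dataset/triviaqa/load_triviaqa_dataset.py",
        "--save_path",
        save_path,
    ],
    check=True,  # raise if the script exits with a non-zero status
)
```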
### 2. Run without specifying a save path
If you run the script without the `--save_path` argument, the dataset is saved to a default location: the directory containing `load_triviaqa_dataset.py`, i.e. `./sample_dataset/triviaqa/`:
```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py
```
This allows the script to run without specifying a path, which is convenient for quick tests or when working directly in the target directory.
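Once the script has finished, the parquet files can be read back with pandas for a quick sanity check. This is a minimal sketch, assuming the default save location and the file names that `load_triviaqa_dataset.py` writes (`corpus.parquet`, `qa_train.parquet`, `qa_test.parquet`):
```python
import pandas as pd

# Default save location when --save_path is omitted.
base = "./sample_dataset/triviaqa"

# File names follow what load_triviaqa_dataset.py writes.
corpus = pd.read_parquet(f"{base}/corpus.parquet")
qa_train = pd.read_parquet(f"{base}/qa_train.parquet")
qa_test = pd.read_parquet(f"{base}/qa_test.parquet")

print(corpus.shape, qa_train.shape, qa_test.shape)
print(qa_train.head())
```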

sample_dataset/eli5/load_eli5_dataset.py

@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample eli5 dataset.",
)
def load_eli5_dataset(save_path):
    # Hugging Face dataset repository holding the eli5 sample
    file_path = "MarkrAI/eli5_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_eli5_dataset()

sample_dataset/hotpotqa/load_hotpotqa_dataset.py

@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample hotpotqa dataset.",
)
def load_hotpotqa_dataset(save_path):
    # Hugging Face dataset repository holding the hotpotqa sample
    file_path = "gnekt/hotpotqa_small_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_validation_dataset = load_dataset(file_path, "qa")["validation"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_validation.parquet")):
        raise ValueError("qa_validation.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_validation_dataset.to_parquet(
        os.path.join(save_path, "qa_validation.parquet"), index=False
    )


if __name__ == "__main__":
    load_hotpotqa_dataset()

sample_dataset/msmarco/load_msmarco_dataset.py

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample msmarco dataset.",
)
def load_msmarco_dataset(save_path):
    # Hugging Face dataset repository holding the msmarco sample
    file_path = "MarkrAI/msmarco_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_msmarco_dataset()

sample_dataset/triviaqa/load_triviaqa_dataset.py

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample triviaqa dataset.",
)
def load_triviaqa_dataset(save_path):
    # Hugging Face dataset repository holding the triviaqa sample
    file_path = "MarkrAI/triviaqa_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_triviaqa_dataset()