Initial commit

kyy
2025-03-14 17:28:01 +09:00
commit ba9c1a4a5f
225 changed files with 22467 additions and 0 deletions

sample_dataset/README.md

@@ -0,0 +1,25 @@
# sample_dataset handling
The sample_dataset folder does not include the `qa.parquet` and `corpus.parquet` files, because they are too large to upload directly to Git.
To prepare and use the datasets available in the sample_dataset folder, specifically `triviaqa`, `hotpotqa`, `msmarco`, and `eli5`, follow the methods outlined below.
## Usage
The example provided uses `triviaqa`, but the same approach applies to `msmarco`, `eli5` and `hotpotqa`.
### 1. Run with a specified save path
To execute the Python script from the terminal and save the dataset to a specified path, use the command:
```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py --save_path /path/to/save/dataset
```
This runs the `load_triviaqa_dataset.py` script located in the `./sample_dataset/triviaqa/` directory,
using the `--save_path` argument to specify the dataset's save location.
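The same invocation can also be driven from Python, for example as a setup step in a notebook or test. This is a minimal sketch, not part of the scripts in this commit; it assumes you run it from the repository root, and `/tmp/triviaqa_sample` is only an illustrative target directory:
```python
import os
import subprocess
import sys

# Illustrative target directory (hypothetical); any writable path works.
save_path = "/tmp/triviaqa_sample"
os.makedirs(save_path, exist_ok=True)

# Run the loader script exactly as you would from the terminal.
subprocess.run(
    [
        sys.executable,
        "./sample_dataset/triviaqa/load_triviaqa_dataset.py",
        "--save_path",
        save_path,
    ],
    check=True,  # raise if the script exits with a non-zero status
)
```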
### 2. Run without specifying a save path
If you run the script without the `--save_path` argument, the dataset is saved to a default location: the directory containing `load_triviaqa_dataset.py`, i.e. `./sample_dataset/triviaqa/`:
```bash
python ./sample_dataset/triviaqa/load_triviaqa_dataset.py
```
This allows the script to run without specifying a path, which is convenient for quick tests or when working directly in the target directory.
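Once the script has finished, the parquet files can be read back with pandas for a quick sanity check. This is a minimal sketch, assuming the default save location and the file names that `load_triviaqa_dataset.py` writes (`corpus.parquet`, `qa_train.parquet`, `qa_test.parquet`):
```python
import pandas as pd

# Default save location when --save_path is omitted.
base = "./sample_dataset/triviaqa"

# File names follow what load_triviaqa_dataset.py writes.
corpus = pd.read_parquet(f"{base}/corpus.parquet")
qa_train = pd.read_parquet(f"{base}/qa_train.parquet")
qa_test = pd.read_parquet(f"{base}/qa_test.parquet")

print(corpus.shape, qa_train.shape, qa_test.shape)
print(qa_train.head())
```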

sample_dataset/eli5/load_eli5_dataset.py

@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample eli5 dataset.",
)
def load_eli5_dataset(save_path):
    # Hugging Face dataset repository holding the eli5 sample
    file_path = "MarkrAI/eli5_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_eli5_dataset()

sample_dataset/hotpotqa/load_hotpotqa_dataset.py

@@ -0,0 +1,35 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample hotpotqa dataset.",
)
def load_hotpotqa_dataset(save_path):
    # Hugging Face dataset repository holding the hotpotqa sample
    file_path = "gnekt/hotpotqa_small_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_validation_dataset = load_dataset(file_path, "qa")["validation"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_validation.parquet")):
        raise ValueError("qa_validation.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_validation_dataset.to_parquet(
        os.path.join(save_path, "qa_validation.parquet"), index=False
    )


if __name__ == "__main__":
    load_hotpotqa_dataset()

sample_dataset/msmarco/load_msmarco_dataset.py

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample msmarco dataset.",
)
def load_msmarco_dataset(save_path):
    # Hugging Face dataset repository holding the msmarco sample
    file_path = "MarkrAI/msmarco_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_msmarco_dataset()

sample_dataset/triviaqa/load_triviaqa_dataset.py

@@ -0,0 +1,37 @@
import os
import pathlib

import click
from datasets import load_dataset


@click.command()
@click.option(
    "--save_path",
    type=str,
    default=pathlib.PurePath(__file__).parent,
    help="Path to save sample triviaqa dataset.",
)
def load_triviaqa_dataset(save_path):
    # Hugging Face dataset repository holding the triviaqa sample
    file_path = "MarkrAI/triviaqa_sample_autorag"

    # load the corpus and QA splits as pandas DataFrames
    corpus_dataset = load_dataset(file_path, "corpus")["train"].to_pandas()
    qa_train_dataset = load_dataset(file_path, "qa")["train"].to_pandas()
    qa_test_dataset = load_dataset(file_path, "qa")["test"].to_pandas()

    # refuse to overwrite existing output files
    if os.path.exists(os.path.join(save_path, "corpus.parquet")):
        raise ValueError("corpus.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_train.parquet")):
        raise ValueError("qa_train.parquet already exists")
    if os.path.exists(os.path.join(save_path, "qa_test.parquet")):
        raise ValueError("qa_test.parquet already exists")

    # save as parquet
    corpus_dataset.to_parquet(os.path.join(save_path, "corpus.parquet"), index=False)
    qa_train_dataset.to_parquet(os.path.join(save_path, "qa_train.parquet"), index=False)
    qa_test_dataset.to_parquet(os.path.join(save_path, "qa_test.parquet"), index=False)


if __name__ == "__main__":
    load_triviaqa_dataset()