Initial commit
This commit is contained in:
51
autorag-workspace/autorag/chunker.py
Normal file
51
autorag-workspace/autorag/chunker.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from autorag.data.chunk.run import run_chunker
|
||||
from autorag.data.utils.util import load_yaml, get_param_combinations
|
||||
|
||||
logger = logging.getLogger("AutoRAG")
|
||||
|
||||
|
||||
class Chunker:
|
||||
def __init__(self, raw_df: pd.DataFrame, project_dir: Optional[str] = None):
|
||||
self.parsed_raw = raw_df
|
||||
self.project_dir = project_dir if project_dir is not None else os.getcwd()
|
||||
|
||||
@classmethod
|
||||
def from_parquet(
|
||||
cls, parsed_data_path: str, project_dir: Optional[str] = None
|
||||
) -> "Chunker":
|
||||
if not os.path.exists(parsed_data_path):
|
||||
raise ValueError(f"parsed_data_path {parsed_data_path} does not exist.")
|
||||
if not parsed_data_path.endswith("parquet"):
|
||||
raise ValueError(
|
||||
f"parsed_data_path {parsed_data_path} is not a parquet file."
|
||||
)
|
||||
parsed_result = pd.read_parquet(parsed_data_path, engine="pyarrow")
|
||||
return cls(parsed_result, project_dir)
|
||||
|
||||
def start_chunking(self, yaml_path: str):
|
||||
if not os.path.exists(self.project_dir):
|
||||
os.makedirs(self.project_dir)
|
||||
|
||||
# Copy YAML file to the trial directory
|
||||
shutil.copy(yaml_path, os.path.join(self.project_dir, "chunk_config.yaml"))
|
||||
|
||||
# load yaml file
|
||||
modules = load_yaml(yaml_path)
|
||||
|
||||
input_modules, input_params = get_param_combinations(modules)
|
||||
|
||||
logger.info("Chunking Start...")
|
||||
run_chunker(
|
||||
modules=input_modules,
|
||||
module_params=input_params,
|
||||
parsed_result=self.parsed_raw,
|
||||
project_dir=self.project_dir,
|
||||
)
|
||||
logger.info("Chunking Done!")
|
||||
Reference in New Issue
Block a user