Fix Dockerfile build issue
autorag/data/parse/run.py (new file, 141 lines)
@@ -0,0 +1,141 @@
import os
from typing import List, Callable, Dict
import pandas as pd
from glob import glob

from autorag.strategy import measure_speed
from autorag.data.utils.util import get_param_combinations

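# Default parsing modules, used for file types that appear in the input
# paths but are not configured in the YAML file.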
default_map = {
    "pdf": {
        "file_type": "pdf",
        "module_type": "langchain_parse",
        "parse_method": "pdfminer",
    },
    "csv": {
        "file_type": "csv",
        "module_type": "langchain_parse",
        "parse_method": "csv",
    },
    "md": {
        "file_type": "md",
        "module_type": "langchain_parse",
        "parse_method": "unstructuredmarkdown",
    },
    "html": {
        "file_type": "html",
        "module_type": "langchain_parse",
        "parse_method": "bshtml",
    },
    "xml": {
        "file_type": "xml",
        "module_type": "langchain_parse",
        "parse_method": "unstructuredxml",
    },
}


def run_parser(
    modules: List[Callable],
    module_params: List[Dict],
    data_path_glob: str,
    project_dir: str,
    all_files: bool,
):
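    """Run the given parsing modules over the files matched by data_path_glob,
    save the parsed results as parquet files under project_dir, and return a
    summary DataFrame (also written to summary.csv). When all_files is False,
    file types present in the paths but absent from the YAML fall back to
    default_map."""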
    if not all_files:
        # Fall back to the default parsing module for file types that appear
        # in the paths but are not set in the YAML.
        data_path_list = glob(data_path_glob)
        if not data_path_list:
            raise FileNotFoundError(f"data does not exist in {data_path_glob}")

        file_types = set(
            [os.path.basename(data_path).split(".")[-1] for data_path in data_path_list]
        )
        set_file_types = set([module["file_type"] for module in module_params])

        # Calculate the set difference once
        file_types_to_remove = set_file_types - file_types

        # Filter modules and params together so the two lists stay aligned;
        # filtering module_params first and then zipping it against the
        # original modules would pair the surviving params with the wrong modules.
        filtered_pairs = [
            (module, param)
            for module, param in zip(modules, module_params)
            if param["file_type"] not in file_types_to_remove
        ]
        modules = [module for module, _ in filtered_pairs]
        module_params = [param for _, param in filtered_pairs]
        # Collect the file types that appear in the paths but are not set in the YAML.
        missing_file_types = list(file_types - set_file_types)

        if missing_file_types:
            add_modules_list = []
            for missing_file_type in missing_file_types:
                if missing_file_type == "json":
                    raise ValueError(
                        "JSON file type must have a jq_schema, so you must set it in the YAML file."
                    )

                add_modules_list.append(default_map[missing_file_type])

            add_modules, add_params = get_param_combinations(add_modules_list)
            modules.extend(add_modules)
            module_params.extend(add_params)

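    # measure_speed runs each module and returns a (result, execution_time)
    # pair; zip(*...) separates these into two tuples.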
    results, execution_times = zip(
        *map(
            lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    # save results to parquet files
    if all_files:
        if len(module_params) > 1:
            raise ValueError(
                "When all_files is set to True, you can only use one parsing module."
            )
        filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
    else:
        filepaths = list(
            map(
                lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"),
                module_params,
            )
        )

    _files = {}
    for result, filepath in zip(results, filepaths):
        _files.setdefault(filepath, []).append(result)

    # Save files with a specific file type as Parquet files.
    for filepath, value in _files.items():
        pd.concat(value).to_parquet(filepath, index=False)

    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
        }
    )
    summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)

    # Concat all parquet files here if not all_files.
    _filepaths = list(_files.keys())
    if not all_files:
        dataframes = [pd.read_parquet(file) for file in _filepaths]
        combined_df = pd.concat(dataframes, ignore_index=True)
        combined_df.to_parquet(
            os.path.join(project_dir, "parsed_result.parquet"), index=False
        )

    return summary_df
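
For context, a minimal usage sketch of the new runner. This is an illustration, not part of the commit: toy_parser and the paths below are hypothetical stand-ins for real AutoRAG parse modules and project files, and it assumes the glob matches at least one file and project_dir already exists.

import pandas as pd
from glob import glob
from autorag.data.parse.run import run_parser

# Hypothetical parser callable; a real AutoRAG module returns the parsed
# contents. It must accept data_path_glob plus its params and return
# something that supports len() and pd.concat(), e.g. a DataFrame.
def toy_parser(data_path_glob, file_type="pdf", **kwargs):
    paths = glob(data_path_glob)
    return pd.DataFrame({"path": paths, "texts": ["parsed text"] * len(paths)})

summary = run_parser(
    modules=[toy_parser],
    module_params=[{"file_type": "pdf"}],
    data_path_glob="./raw_docs/*.pdf",  # hypothetical input location
    project_dir="./parse_project",      # must already exist
    all_files=True,                     # run a single module over all files
)
print(summary)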