# autorag_evaluation/autorag/data/parse/table_hybrid_parse.py

import os
import tempfile
from glob import glob
from typing import List, Tuple, Dict

from PyPDF2 import PdfFileReader, PdfFileWriter
import pdfplumber

from autorag.support import get_support_modules
from autorag.data.parse.base import parser_node


@parser_node
def table_hybrid_parse(
	data_path_list: List[str],
	text_parse_module: str,
	text_params: Dict,
	table_parse_module: str,
	table_params: Dict,
) -> Tuple[List[str], List[str], List[int]]:
	"""
	Parse documents to use table_hybrid_parse method.
	The table_hybrid_parse method is a hybrid method that combines the parsing results of PDFs with and without tables.
	It splits the PDF file into pages, separates pages with and without tables, and then parses and merges the results.

	The merged results are ordered by document name and then by numeric page number,
	so pages of one document stay in reading order regardless of which module parsed them.

	:param data_path_list: The list of data paths to parse.
	:param text_parse_module: The text parsing module to use. The type should be a string.
	:param text_params: The extra parameters for the text parsing module. The type should be a dictionary.
	:param table_parse_module: The table parsing module to use. The type should be a string.
	:param table_params: The extra parameters for the table parsing module. The type should be a dictionary.
	:return: tuple of lists containing the parsed texts, path and pages.
		All three lists are empty when no page was parsed.
	"""
	# All single-page PDFs live in a temporary directory that is removed on exit.
	with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as save_dir:
		text_dir = os.path.join(save_dir, "text")
		table_dir = os.path.join(save_dir, "table")

		os.makedirs(text_dir, exist_ok=True)
		os.makedirs(table_dir, exist_ok=True)

		# Split every PDF into pages; remember which original file each page came from.
		path_map_dict = {}
		for data_path in data_path_list:
			path_map_dict.update(save_page_by_table(data_path, text_dir, table_dir))

		# Parse the pages that contain tables
		table_results, table_file_path = get_each_module_result(
			table_parse_module, table_params, os.path.join(table_dir, "*")
		)

		# Parse the pages that contain no tables
		text_results, text_file_path = get_each_module_result(
			text_parse_module, text_params, os.path.join(text_dir, "*")
		)

		# Merge parsing results of PDFs with and without tables
		texts = table_results + text_results
		temp_path_lst = table_file_path + text_file_path

		records = []
		for temp_path, text in zip(temp_path_lst, texts):
			# Temp files are named "<document>_page_<n>.pdf".
			document_name = os.path.basename(temp_path).rsplit("_page_", 1)[0]
			records.append(
				(document_name, get_page_from_path(temp_path), temp_path, text)
			)

		# Nothing was parsed (e.g. empty input): unpacking an empty zip would fail.
		if not records:
			return [], [], []

		# Sort by document name, then by numeric page. Sorting the raw temp paths
		# would put every "table/" page before every "text/" page and would order
		# page 10 before page 2.
		records.sort(key=lambda record: record[:2])

		sorted_texts = [text for _, _, _, text in records]
		# Map each temporary page file back to its original file path.
		path = [path_map_dict[temp_path] for _, _, temp_path, _ in records]
		pages = [page for _, page, _, _ in records]

		return sorted_texts, path, pages


def save_page_by_table(data_path: str, text_dir: str, table_dir: str) -> Dict[str, str]:
	"""
	Split a PDF into single-page PDFs, saving pages with and without tables separately.

	A page goes to ``table_dir`` when pdfplumber extracts at least one table from it,
	otherwise to ``text_dir``. Each page is written as ``<file_name>_page_<n>.pdf``
	where ``n`` is the 1-based page number.

	:param data_path: Path of the PDF file to split.
	:param text_dir: Directory that receives the pages without tables.
	:param table_dir: Directory that receives the pages with tables.
	:return: Dictionary mapping each written single-page PDF path to ``data_path``.
	"""
	file_name = os.path.basename(data_path).split(".pdf")[0]

	path_map_dict = {}
	# Open the document once with each library for the whole loop, instead of
	# re-opening it with pdfplumber for every single page.
	with open(data_path, "rb") as input_data, pdfplumber.open(data_path) as pdf:
		pdf_reader = PdfFileReader(input_data)

		for page_num in range(pdf_reader.getNumPages()):
			has_tables = bool(pdf.pages[page_num].extract_tables())
			directory = table_dir if has_tables else text_dir
			output_pdf_path = os.path.join(
				directory, f"{file_name}_page_{page_num + 1}.pdf"
			)
			_save_single_page(pdf_reader, page_num, output_pdf_path)
			path_map_dict[output_pdf_path] = data_path

	return path_map_dict


def _get_output_path(
	data_path: str, page_num: int, file_name: str, text_dir: str, table_dir: str
) -> str:
	"""
	Build the output path for one page, choosing the directory by table presence.

	:param data_path: Path of the PDF file that contains the page.
	:param page_num: Zero-based index of the page inside the PDF.
	:param file_name: Name used as the prefix of the output file.
	:param text_dir: Directory for pages from which no table is extracted.
	:param table_dir: Directory for pages from which at least one table is extracted.
	:return: ``<directory>/<file_name>_page_<page_num + 1>.pdf``
	"""
	with pdfplumber.open(data_path) as document:
		found_tables = document.pages[page_num].extract_tables()

	if found_tables:
		target_dir = table_dir
	else:
		target_dir = text_dir

	# File names carry the 1-based page number.
	page_file_name = f"{file_name}_page_{page_num + 1}.pdf"
	return os.path.join(target_dir, page_file_name)


def _save_single_page(pdf_reader: PdfFileReader, page_num: int, output_pdf_path: str):
	"""
	Write one page of an already opened PDF to its own PDF file.

	:param pdf_reader: Reader of the source PDF.
	:param page_num: Zero-based index of the page to export.
	:param output_pdf_path: Path of the single-page PDF to create.
	"""
	single_page = pdf_reader.getPage(page_num)

	writer = PdfFileWriter()
	writer.addPage(single_page)

	with open(output_pdf_path, "wb") as sink:
		writer.write(sink)


def get_each_module_result(
	module: str, module_params: Dict, data_path_glob: str
) -> Tuple[List[str], List[str]]:
	"""
	Run one parsing module over every file matched by a glob pattern.

	The caller's ``module_params`` dictionary is never modified.

	:param module: Name of the parsing module, resolved through ``get_support_modules``.
	:param module_params: Extra keyword arguments for the parsing module.
		A ``"module_type"`` key, if present, is ignored in favour of ``module``.
	:param data_path_glob: Glob pattern selecting the files to parse.
	:return: Tuple of the parsed texts and the paths returned by the module.
		Both lists are empty when the pattern matches no file.
	"""
	data_path_list = glob(data_path_glob)
	if not data_path_list:
		return [], []

	# Work on a copy so the caller's dictionary is left untouched;
	# "module_type" only names the module and is not a module argument.
	call_params = {
		key: value for key, value in module_params.items() if key != "module_type"
	}

	module_callable = get_support_modules(module)
	# Call the undecorated function directly, bypassing the parser node wrapper.
	module_original = module_callable.__wrapped__
	texts, path, _ = module_original(data_path_list, **call_params)

	return texts, path


def get_page_from_path(file_path: str) -> int:
	"""
	Extract the page number from a path of the form ``<name>_page_<n>.<ext>``.

	Only the last ``_page_`` marker is used, so a document name that itself
	contains ``_page_`` is handled correctly.

	:param file_path: Path of a single-page file.
	:return: The page number encoded in the file name.
	:raises ValueError: If the file name has no ``_page_`` marker or the part
		after it is not an integer.
	"""
	file_name = os.path.basename(file_path)
	stem, _ = os.path.splitext(file_name)

	# rpartition splits on the LAST marker only.
	_, marker, page_number = stem.rpartition("_page_")
	if not marker:
		raise ValueError(f"No '_page_' marker found in file name: {file_name}")

	return int(page_number)