Initial commit

This commit is contained in:
kyy
2025-03-14 17:33:18 +09:00
parent ba9c1a4a5f
commit 6814230bfb
61 changed files with 2087124 additions and 4 deletions

View File

@@ -0,0 +1,4 @@
modules:
- module_type: langchain_parse
file_type: pdf
parse_method: [ pdfminer, pdfplumber, pypdfium2, pypdf, pymupdf ]

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,6 @@
filename,module_name,module_params,execution_time
pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pymupdf'}",0.015248891783923638
pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pypdf'}",0.15360368810048916
pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pdfplumber'}",0.42682165052832627
pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pdfminer'}",0.44084878549343204
pdf.parquet,langchain_parse,"{'file_type': 'pdf', 'parse_method': 'pypdfium2'}",0.008509700472761944
1 filename module_name module_params execution_time
2 pdf.parquet langchain_parse {'file_type': 'pdf', 'parse_method': 'pymupdf'} 0.015248891783923638
3 pdf.parquet langchain_parse {'file_type': 'pdf', 'parse_method': 'pypdf'} 0.15360368810048916
4 pdf.parquet langchain_parse {'file_type': 'pdf', 'parse_method': 'pdfplumber'} 0.42682165052832627
5 pdf.parquet langchain_parse {'file_type': 'pdf', 'parse_method': 'pdfminer'} 0.44084878549343204
6 pdf.parquet langchain_parse {'file_type': 'pdf', 'parse_method': 'pypdfium2'} 0.008509700472761944