Fix Dockerfile build issue
autorag/data/chunk/__init__.py (Normal file, 2 lines added)
@@ -0,0 +1,2 @@
from .llama_index_chunk import llama_index_chunk
from .langchain_chunk import langchain_chunk
autorag/data/chunk/base.py (Normal file, 128 lines added)
@@ -0,0 +1,128 @@
import functools
import logging
from typing import Tuple, List, Dict, Any

import pandas as pd

from autorag.embedding.base import EmbeddingModel
from autorag.data import chunk_modules, sentence_splitter_modules
from autorag.utils import result_to_dataframe

logger = logging.getLogger("AutoRAG")


def chunker_node(func):
    @functools.wraps(func)
    @result_to_dataframe(["doc_id", "contents", "path", "start_end_idx", "metadata"])
    def wrapper(
        parsed_result: pd.DataFrame, chunk_method: str, **kwargs
    ) -> Tuple[
        List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]]
    ]:
        logger.info(f"Running chunker - {func.__name__} module...")

        # get texts from parsed_result
        texts = parsed_result["texts"].tolist()

        # get the language for the 'add_file_name' feature, if it is set
        file_name_language = kwargs.pop("add_file_name", None)
        metadata_list = make_metadata_list(parsed_result)

        # run chunk module
        if func.__name__ in ["llama_index_chunk", "langchain_chunk"]:
            chunk_instance = __get_chunk_instance(
                func.__name__, chunk_method.lower(), **kwargs
            )
            result = func(
                texts=texts,
                chunker=chunk_instance,
                file_name_language=file_name_language,
                metadata_list=metadata_list,
            )
            del chunk_instance
            return result
        else:
            raise ValueError(f"Unsupported module_type: {func.__name__}")

    return wrapper


def make_metadata_list(parsed_result: pd.DataFrame) -> List[Dict[str, str]]:
    metadata_list = [{} for _ in range(len(parsed_result["texts"]))]

    def _make_metadata_pure(
        lst: List[str], key: str, metadata_lst: List[Dict[str, str]]
    ):
        for value, metadata in zip(lst, metadata_lst):
            metadata[key] = value

    for column in ["page", "last_modified_datetime", "path"]:
        if column in parsed_result.columns:
            _make_metadata_pure(parsed_result[column].tolist(), column, metadata_list)
    return metadata_list


def __get_chunk_instance(module_type: str, chunk_method: str, **kwargs):
    # Add sentence_splitter to kwargs
    sentence_available_methods = [
        "semantic_llama_index",
        "semanticdoublemerging",
        "sentencewindow",
    ]
    if chunk_method in sentence_available_methods:
        # the llama index default sentence_splitter is nltk's PunktSentenceTokenizer
        if "sentence_splitter" in kwargs.keys():
            sentence_splitter_str = kwargs.pop("sentence_splitter")
            sentence_splitter_func = sentence_splitter_modules[sentence_splitter_str]()
            kwargs.update({"sentence_splitter": sentence_splitter_func})

    def get_embedding_model(_embed_model_str: str, _module_type: str):
        if _embed_model_str == "openai":
            if _module_type == "langchain_chunk":
                _embed_model_str = "openai_langchain"
        return EmbeddingModel.load(_embed_model_str)()

    # Add embed_model to kwargs
    embedding_available_methods = ["semantic_llama_index", "semantic_langchain"]
    if chunk_method in embedding_available_methods:
        # there is no default embed_model, so it must be passed as a parameter and added here
        if "embed_model" not in kwargs.keys():
            raise ValueError(f"embed_model is required for {chunk_method} method.")
        embed_model_str = kwargs.pop("embed_model")
        embed_model = get_embedding_model(embed_model_str, module_type)
        if chunk_method == "semantic_llama_index":
            kwargs.update({"embed_model": embed_model})
        elif chunk_method == "semantic_langchain":
            kwargs.update({"embeddings": embed_model})

    return chunk_modules[chunk_method](**kwargs)


def add_file_name(
    file_name_language: str, file_names: List[str], chunk_texts: List[str]
) -> List[str]:
    if file_name_language == "en":
        return list(
            map(
                lambda x: f"file_name: {x[1]}\n contents: {x[0]}",
                zip(chunk_texts, file_names),
            )
        )
    elif file_name_language == "ko":
        return list(
            map(
                lambda x: f"파일 제목: {x[1]}\n 내용: {x[0]}",
                zip(chunk_texts, file_names),
            )
        )
    elif file_name_language == "ja":
        return list(
            map(
                lambda x: f"ファイル名: {x[1]}\n 内容: {x[0]}",
                zip(chunk_texts, file_names),
            )
        )
    else:
        raise ValueError(
            f"Unsupported file_name_language: {file_name_language}. Choose from 'en', 'ko', or 'ja'."
        )
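A short usage sketch of the two pure helpers above (make_metadata_list and add_file_name). It is illustrative only and not part of this commit; the sample DataFrame values are invented.

import pandas as pd
from autorag.data.chunk.base import add_file_name, make_metadata_list

parsed_result = pd.DataFrame(
    {
        "texts": ["alpha body", "beta body"],
        "path": ["docs/alpha.md", "docs/beta.md"],
        "page": [1, 2],
    }
)

# make_metadata_list copies the optional columns (page, last_modified_datetime, path)
# into one metadata dict per row.
metadata_list = make_metadata_list(parsed_result)
# -> [{"page": 1, "path": "docs/alpha.md"}, {"page": 2, "path": "docs/beta.md"}]

# add_file_name prefixes each chunk with its file name in the requested language.
contents = add_file_name("en", ["alpha.md", "beta.md"], ["chunk one", "chunk two"])
# -> ["file_name: alpha.md\n contents: chunk one", "file_name: beta.md\n contents: chunk two"]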
autorag/data/chunk/langchain_chunk.py (Normal file, 76 lines added)
@@ -0,0 +1,76 @@
import os
from itertools import chain
import uuid
from typing import Tuple, List, Dict, Any, Optional

from langchain_text_splitters import TextSplitter

from autorag.data.chunk.base import chunker_node, add_file_name
from autorag.data.utils.util import add_essential_metadata, get_start_end_idx


@chunker_node
def langchain_chunk(
    texts: List[str],
    chunker: TextSplitter,
    file_name_language: Optional[str] = None,
    metadata_list: Optional[List[Dict[str, str]]] = None,
) -> Tuple[
    List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]]
]:
    """
    Chunk texts from the parsed result with a langchain chunk method.

    :param texts: The list of texts to chunk from the parsed result.
    :param chunker: A langchain TextSplitter (chunker) instance.
    :param file_name_language: The language to use for the 'add_file_name' feature.
        Set one of 'en' (English), 'ko' (Korean), or 'ja' (Japanese).
        The 'add_file_name' feature adds the file name to each chunked content.
        It helps prevent hallucination caused by retrieving contents from the wrong document.
        The default 'en' form is "file_name: {file_name}\n contents: {content}".
    :param metadata_list: The list of metadata dicts from the parsed result.
    :return: A tuple of lists containing the chunked doc_id, contents, path, start_end_idx, and metadata.
    """
    results = [
        langchain_chunk_pure(text, chunker, file_name_language, meta)
        for text, meta in zip(texts, metadata_list)
    ]

    doc_id, contents, path, start_end_idx, metadata = (
        list(chain.from_iterable(item)) for item in zip(*results)
    )

    return doc_id, contents, path, start_end_idx, metadata


def langchain_chunk_pure(
    text: str,
    chunker: TextSplitter,
    file_name_language: Optional[str] = None,
    _metadata: Optional[Dict[str, str]] = None,
):
    # chunk
    chunk_results = chunker.create_documents([text], metadatas=[_metadata])

    # make doc_id
    doc_id = list(str(uuid.uuid4()) for _ in range(len(chunk_results)))

    # make path
    path_lst = list(map(lambda x: x.metadata.get("path", ""), chunk_results))

    # make contents and start_end_idx
    if file_name_language:
        chunked_file_names = list(map(lambda x: os.path.basename(x), path_lst))
        chunked_texts = list(map(lambda x: x.page_content, chunk_results))
        start_end_idx = list(map(lambda x: get_start_end_idx(text, x), chunked_texts))
        contents = add_file_name(file_name_language, chunked_file_names, chunked_texts)
    else:
        contents = list(map(lambda node: node.page_content, chunk_results))
        start_end_idx = list(map(lambda x: get_start_end_idx(text, x), contents))

    # make metadata
    metadata = list(
        map(lambda node: add_essential_metadata(node.metadata), chunk_results)
    )

    return doc_id, contents, path_lst, start_end_idx, metadata
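A minimal sketch of calling the undecorated helper directly, assuming langchain-text-splitters is installed; RecursiveCharacterTextSplitter stands in here for whatever splitter chunk_modules would normally construct, and the sample text and metadata are invented.

from langchain_text_splitters import RecursiveCharacterTextSplitter
from autorag.data.chunk.langchain_chunk import langchain_chunk_pure

splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
text = "First paragraph of the parsed document.\n\nSecond paragraph of the parsed document."

# Each chunk gets a fresh uuid4 doc_id, its path copied from the metadata,
# character start/end offsets inside the original text, and the essential metadata.
doc_id, contents, path_lst, start_end_idx, metadata = langchain_chunk_pure(
    text,
    splitter,
    file_name_language=None,
    _metadata={"path": "docs/report.md"},
)
print(contents[0], start_end_idx[0])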
autorag/data/chunk/llama_index_chunk.py (Normal file, 96 lines added)
@@ -0,0 +1,96 @@
import os.path
from itertools import chain
from typing import Tuple, List, Dict, Any, Optional

from llama_index.core import Document
from llama_index.core.node_parser.interface import NodeParser

from autorag.utils.util import process_batch, get_event_loop
from autorag.data.chunk.base import chunker_node, add_file_name
from autorag.data.utils.util import (
    add_essential_metadata_llama_text_node,
    get_start_end_idx,
)


@chunker_node
def llama_index_chunk(
    texts: List[str],
    chunker: NodeParser,
    file_name_language: Optional[str] = None,
    metadata_list: Optional[List[Dict[str, str]]] = None,
    batch: int = 8,
) -> Tuple[
    List[str], List[str], List[str], List[Tuple[int, int]], List[Dict[str, Any]]
]:
    """
    Chunk texts from the parsed result with a llama index chunk method.

    :param texts: The list of texts to chunk from the parsed result.
    :param chunker: A llama index NodeParser (chunker) instance.
    :param file_name_language: The language to use for the 'add_file_name' feature.
        Set one of 'en' (English), 'ko' (Korean), or 'ja' (Japanese).
        The 'add_file_name' feature adds the file name to each chunked content.
        It helps prevent hallucination caused by retrieving contents from the wrong document.
        The default 'en' form is "file_name: {file_name}\n contents: {content}".
    :param metadata_list: The list of metadata dicts from the parsed result.
    :param batch: The batch size for chunking texts. Default is 8.
    :return: A tuple of lists containing the chunked doc_id, contents, path, start_end_idx, and metadata.
    """
    tasks = [
        llama_index_chunk_pure(text, chunker, file_name_language, meta)
        for text, meta in zip(texts, metadata_list)
    ]
    loop = get_event_loop()
    results = loop.run_until_complete(process_batch(tasks, batch))

    doc_id, contents, path, start_end_idx, metadata = (
        list(chain.from_iterable(item)) for item in zip(*results)
    )

    return list(doc_id), list(contents), list(path), list(start_end_idx), list(metadata)


async def llama_index_chunk_pure(
    text: str,
    chunker: NodeParser,
    file_name_language: Optional[str] = None,
    _metadata: Optional[Dict[str, str]] = None,
):
    # set document
    document = [Document(text=text, metadata=_metadata)]

    # chunk document
    chunk_results = await chunker.aget_nodes_from_documents(documents=document)

    # make doc_id
    doc_id = list(map(lambda node: node.node_id, chunk_results))

    # make path
    path_lst = list(map(lambda x: x.metadata.get("path", ""), chunk_results))

    # make contents and start_end_idx
    if file_name_language:
        chunked_file_names = list(map(lambda x: os.path.basename(x), path_lst))
        chunked_texts = list(map(lambda x: x.text, chunk_results))
        start_end_idx = list(
            map(
                lambda x: get_start_end_idx(text, x),
                chunked_texts,
            )
        )
        contents = add_file_name(file_name_language, chunked_file_names, chunked_texts)
    else:
        contents = list(map(lambda x: x.text, chunk_results))
        start_end_idx = list(map(lambda x: get_start_end_idx(text, x), contents))

    metadata = list(
        map(
            lambda node: add_essential_metadata_llama_text_node(
                node.metadata, node.relationships
            ),
            chunk_results,
        )
    )

    return doc_id, contents, path_lst, start_end_idx, metadata
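A minimal sketch of calling the async helper directly, assuming llama-index-core is installed; SentenceSplitter stands in here for whatever parser chunk_modules would normally construct, and the sample text and metadata are invented.

import asyncio
from llama_index.core.node_parser import SentenceSplitter
from autorag.data.chunk.llama_index_chunk import llama_index_chunk_pure

splitter = SentenceSplitter(chunk_size=256, chunk_overlap=32)
text = "First sentence of the parsed document. Second sentence. Third sentence."

# The helper wraps the text in a llama index Document, chunks it asynchronously,
# reuses the node ids as doc_id values, and (with file_name_language set) prefixes
# each chunk with the base name taken from the 'path' metadata.
doc_id, contents, path_lst, start_end_idx, metadata = asyncio.run(
    llama_index_chunk_pure(
        text,
        splitter,
        file_name_language="en",
        _metadata={"path": "docs/report.md"},
    )
)
print(contents[0], start_end_idx[0])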
autorag/data/chunk/run.py (Normal file, 38 lines added)
@@ -0,0 +1,38 @@
import os
from typing import Callable, List, Dict
import pandas as pd

from autorag.strategy import measure_speed


def run_chunker(
    modules: List[Callable],
    module_params: List[Dict],
    parsed_result: pd.DataFrame,
    project_dir: str,
):
    results, execution_times = zip(
        *map(
            lambda x: measure_speed(x[0], parsed_result=parsed_result, **x[1]),
            zip(modules, module_params),
        )
    )
    average_times = list(map(lambda x: x / len(results[0]), execution_times))

    # save results to parquet files
    filepaths = list(
        map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)))
    )
    list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
    filenames = list(map(lambda x: os.path.basename(x), filepaths))

    summary_df = pd.DataFrame(
        {
            "filename": filenames,
            "module_name": list(map(lambda module: module.__name__, modules)),
            "module_params": module_params,
            "execution_time": average_times,
        }
    )
    summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)
    return summary_df
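A minimal driver sketch for run_chunker, assuming "token" is a key registered in autorag.data.chunk_modules and that chunk_size is accepted by the underlying splitter (the registry itself is outside this commit, so both are illustrative); the sample data and output directory are invented.

import os
import pandas as pd
from autorag.data.chunk import llama_index_chunk
from autorag.data.chunk.run import run_chunker

parsed_result = pd.DataFrame(
    {
        "texts": ["First parsed document body.", "Second parsed document body."],
        "path": ["raw/a.pdf", "raw/b.pdf"],
    }
)

project_dir = "./chunk_project"  # hypothetical output directory
os.makedirs(project_dir, exist_ok=True)

# Each module runs once with its params; the resulting DataFrames are written to
# 0.parquet, 1.parquet, ... in project_dir and summarized in summary.csv.
summary_df = run_chunker(
    modules=[llama_index_chunk],
    module_params=[{"chunk_method": "token", "chunk_size": 512}],  # assumed params
    parsed_result=parsed_result,
    project_dir=project_dir,
)
print(summary_df)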