142 lines
4.4 KiB
Python
142 lines
4.4 KiB
Python
import logging
|
|
import sys
|
|
|
|
from random import random
|
|
from typing import List, Union, Dict
|
|
|
|
from llama_index.core.embeddings.mock_embed_model import MockEmbedding
|
|
from llama_index.embeddings.openai import OpenAIEmbedding
|
|
from llama_index.embeddings.openai import OpenAIEmbeddingModelType
|
|
from llama_index.embeddings.ollama import OllamaEmbedding
|
|
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
|
|
from autorag import LazyInit
|
|
|
|
logger = logging.getLogger("AutoRAG")
|
|
|
|
|
|
class MockEmbeddingRandom(MockEmbedding):
|
|
"""Mock embedding with random vectors."""
|
|
|
|
def _get_vector(self) -> List[float]:
|
|
return [random() for _ in range(self.embed_dim)]
|
|
|
|
|
|
embedding_models = {
|
|
# llama index
|
|
"openai": LazyInit(
|
|
OpenAIEmbedding
|
|
), # default model is OpenAIEmbeddingModelType.TEXT_EMBED_ADA_002
|
|
"openai_embed_3_large": LazyInit(
|
|
OpenAIEmbedding, model_name=OpenAIEmbeddingModelType.TEXT_EMBED_3_LARGE
|
|
),
|
|
"openai_embed_3_small": LazyInit(
|
|
OpenAIEmbedding, model_name=OpenAIEmbeddingModelType.TEXT_EMBED_3_SMALL
|
|
),
|
|
"mock": LazyInit(MockEmbeddingRandom, embed_dim=768),
|
|
# langchain
|
|
"openai_langchain": LazyInit(OpenAIEmbeddings),
|
|
"ollama": LazyInit(OllamaEmbedding),
|
|
}
|
|
|
|
try:
|
|
# you can use your own model in this way.
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
|
|
embedding_models["huggingface_baai_bge_small"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="BAAI/bge-small-en-v1.5"
|
|
)
|
|
embedding_models["huggingface_cointegrated_rubert_tiny2"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="cointegrated/rubert-tiny2"
|
|
)
|
|
embedding_models["huggingface_all_mpnet_base_v2"] = LazyInit(
|
|
HuggingFaceEmbedding,
|
|
model_name="sentence-transformers/all-mpnet-base-v2",
|
|
max_length=512,
|
|
)
|
|
embedding_models["huggingface_bge_m3"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="BAAI/bge-m3"
|
|
)
|
|
embedding_models["huggingface_multilingual_e5_large"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="intfloat/multilingual-e5-large-instruct"
|
|
)
|
|
embedding_models["huggingface_all_mpnet_base_v2"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="sentence-transformers/all-mpnet-base-v2"
|
|
) # 230313 추가 - 김용연
|
|
embedding_models["huggingface_KURE-v1"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="nlpai-lab/KURE-v1"
|
|
) # 230313 추가 - 김용연
|
|
embedding_models["huggingface_drangonku-v2-ko"] = LazyInit(
|
|
HuggingFaceEmbedding, model_name="dragonkue/snowflake-arctic-embed-l-v2.0-ko"
|
|
) # 230313 추가 - 김용연
|
|
|
|
except ImportError:
|
|
logger.info(
|
|
"You are using API version of AutoRAG."
|
|
"To use local version, run pip install 'AutoRAG[gpu]'"
|
|
)
|
|
|
|
|
|
class EmbeddingModel:
|
|
@staticmethod
|
|
def load(config: Union[str, Dict, List[Dict]]):
|
|
if isinstance(config, str):
|
|
return EmbeddingModel.load_from_str(config)
|
|
elif isinstance(config, dict):
|
|
return EmbeddingModel.load_from_dict(config)
|
|
elif isinstance(config, list):
|
|
return EmbeddingModel.load_from_list(config)
|
|
else:
|
|
raise ValueError("Invalid type of config")
|
|
|
|
@staticmethod
|
|
def load_from_str(name: str):
|
|
try:
|
|
return embedding_models[name]
|
|
except KeyError:
|
|
raise ValueError(f"Embedding model '{name}' is not supported")
|
|
|
|
@staticmethod
|
|
def load_from_list(option: List[dict]):
|
|
if len(option) != 1:
|
|
raise ValueError("Only one embedding model is supported")
|
|
return EmbeddingModel.load_from_dict(option[0])
|
|
|
|
@staticmethod
|
|
def load_from_dict(option: dict):
|
|
def _check_keys(target: dict):
|
|
if "type" not in target or "model_name" not in target:
|
|
raise ValueError("Both 'type' and 'model_name' must be provided")
|
|
if target["type"] not in ["openai", "huggingface", "mock", "ollama"]:
|
|
raise ValueError(
|
|
f"Embedding model type '{target['type']}' is not supported"
|
|
)
|
|
|
|
def _get_huggingface_class():
|
|
module = sys.modules.get("llama_index.embeddings.huggingface")
|
|
if not module:
|
|
logger.info(
|
|
"You are using API version of AutoRAG. "
|
|
"To use local version, run `pip install 'AutoRAG[gpu]'`."
|
|
)
|
|
return None
|
|
return getattr(module, "HuggingFaceEmbedding", None)
|
|
|
|
_check_keys(option)
|
|
|
|
model_options = option
|
|
model_type = model_options.pop("type")
|
|
|
|
embedding_map = {
|
|
"openai": OpenAIEmbedding,
|
|
"mock": MockEmbeddingRandom,
|
|
"huggingface": _get_huggingface_class(),
|
|
"ollama": OllamaEmbedding,
|
|
}
|
|
|
|
embedding_class = embedding_map.get(model_type)
|
|
if not embedding_class:
|
|
raise ValueError(f"Embedding model type '{model_type}' is not supported")
|
|
|
|
return LazyInit(embedding_class, **model_options)
|