refactor: classes and configs (#528)
This commit is contained in:
@@ -1,14 +1,5 @@
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
from chromadb.utils import embedding_functions
|
||||
except RuntimeError:
|
||||
from embedchain.utils import use_pysqlite3
|
||||
|
||||
use_pysqlite3()
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
from embedchain.helper_classes.json_serializable import register_deserializable
|
||||
|
||||
from .BaseAppConfig import BaseAppConfig
|
||||
@@ -23,44 +14,14 @@ class AppConfig(BaseAppConfig):
|
||||
def __init__(
|
||||
self,
|
||||
log_level=None,
|
||||
host=None,
|
||||
port=None,
|
||||
id=None,
|
||||
collection_name=None,
|
||||
collect_metrics: Optional[bool] = None,
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
:param log_level: Optional. (String) Debug level
|
||||
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
|
||||
:param host: Optional. Hostname for the database server.
|
||||
:param port: Optional. Port for the database server.
|
||||
:param id: Optional. ID of the app. Document metadata will have this id.
|
||||
:param collection_name: Optional. Collection name for the database.
|
||||
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
|
||||
"""
|
||||
super().__init__(
|
||||
log_level=log_level,
|
||||
embedding_fn=AppConfig.default_embedding_function(),
|
||||
host=host,
|
||||
port=port,
|
||||
id=id,
|
||||
collection_name=collection_name,
|
||||
collect_metrics=collect_metrics,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def default_embedding_function():
|
||||
"""
|
||||
Sets embedding function to default (`text-embedding-ada-002`).
|
||||
|
||||
:raises ValueError: If the template is not valid as template should contain
|
||||
$context and $query
|
||||
:returns: The default embedding function for the app class.
|
||||
"""
|
||||
if os.getenv("OPENAI_API_KEY") is None and os.getenv("OPENAI_ORGANIZATION") is None:
|
||||
raise ValueError("OPENAI_API_KEY or OPENAI_ORGANIZATION environment variables not provided") # noqa:E501
|
||||
return embedding_functions.OpenAIEmbeddingFunction(
|
||||
api_key=os.getenv("OPENAI_API_KEY"),
|
||||
organization_id=os.getenv("OPENAI_ORGANIZATION"),
|
||||
model_name="text-embedding-ada-002",
|
||||
)
|
||||
super().__init__(log_level=log_level, id=id, collect_metrics=collect_metrics, collection_name=collection_name)
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from embedchain.config.BaseConfig import BaseConfig
|
||||
from embedchain.config.vectordbs import ElasticsearchDBConfig
|
||||
from embedchain.helper_classes.json_serializable import JSONSerializable
|
||||
from embedchain.models import VectorDatabases, VectorDimensions
|
||||
from embedchain.vectordb.base_vector_db import BaseVectorDB
|
||||
|
||||
|
||||
class BaseAppConfig(BaseConfig, JSONSerializable):
|
||||
@@ -14,81 +14,38 @@ class BaseAppConfig(BaseConfig, JSONSerializable):
|
||||
def __init__(
|
||||
self,
|
||||
log_level=None,
|
||||
embedding_fn=None,
|
||||
db=None,
|
||||
host=None,
|
||||
port=None,
|
||||
db: Optional[BaseVectorDB] = None,
|
||||
id=None,
|
||||
collection_name=None,
|
||||
collect_metrics: bool = True,
|
||||
db_type: VectorDatabases = None,
|
||||
vector_dim: VectorDimensions = None,
|
||||
es_config: ElasticsearchDBConfig = None,
|
||||
chroma_settings: dict = {},
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
:param log_level: Optional. (String) Debug level
|
||||
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
|
||||
:param embedding_fn: Embedding function to use.
|
||||
:param db: Optional. (Vector) database instance to use for embeddings.
|
||||
:param host: Optional. Hostname for the database server.
|
||||
:param port: Optional. Port for the database server.
|
||||
:param db: Optional. (Vector) database instance to use for embeddings. Deprecated in favor of app(..., db).
|
||||
:param id: Optional. ID of the app. Document metadata will have this id.
|
||||
:param collection_name: Optional. Collection name for the database.
|
||||
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
|
||||
:param db_type: Optional. type of Vector database to use
|
||||
:param vector_dim: Vector dimension generated by embedding fn
|
||||
:param db_type: Optional. Initializes a default vector database of the given type.
|
||||
Using the `db` argument is preferred.
|
||||
:param es_config: Optional. elasticsearch database config to be used for connection
|
||||
:param chroma_settings: Optional. Chroma settings for connection.
|
||||
:param collection_name: Optional. Default collection name.
|
||||
It's recommended to use app.set_collection_name() instead.
|
||||
"""
|
||||
self._setup_logging(log_level)
|
||||
self.collection_name = collection_name if collection_name else "embedchain_store"
|
||||
self.db = BaseAppConfig.get_db(
|
||||
db=db,
|
||||
embedding_fn=embedding_fn,
|
||||
host=host,
|
||||
port=port,
|
||||
db_type=db_type,
|
||||
vector_dim=vector_dim,
|
||||
collection_name=self.collection_name,
|
||||
es_config=es_config,
|
||||
chroma_settings=chroma_settings,
|
||||
)
|
||||
self.id = id
|
||||
self.collect_metrics = True if (collect_metrics is True or collect_metrics is None) else False
|
||||
return
|
||||
self.collection_name = collection_name
|
||||
|
||||
@staticmethod
|
||||
def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name, es_config, chroma_settings):
|
||||
"""
|
||||
Get db based on db_type, db with default database (`ChromaDb`)
|
||||
:param Optional. (Vector) database to use for embeddings.
|
||||
:param embedding_fn: Embedding function to use in database.
|
||||
:param host: Optional. Hostname for the database server.
|
||||
:param port: Optional. Port for the database server.
|
||||
:param db_type: Optional. db type to use. Supported values (`es`, `chroma`)
|
||||
:param vector_dim: Vector dimension generated by embedding fn
|
||||
:param collection_name: Optional. Collection name for the database.
|
||||
:param es_config: Optional. elasticsearch database config to be used for connection
|
||||
:raises ValueError: BaseAppConfig knows no default embedding function.
|
||||
:returns: database instance
|
||||
"""
|
||||
if db:
|
||||
return db
|
||||
|
||||
if embedding_fn is None:
|
||||
raise ValueError("ChromaDb cannot be instantiated without an embedding function")
|
||||
|
||||
if db_type == VectorDatabases.ELASTICSEARCH:
|
||||
from embedchain.vectordb.elasticsearch_db import ElasticsearchDB
|
||||
|
||||
return ElasticsearchDB(
|
||||
embedding_fn=embedding_fn, vector_dim=vector_dim, collection_name=collection_name, es_config=es_config
|
||||
self._db = db
|
||||
logging.warning(
|
||||
"DEPRECATION WARNING: Please supply the database as the second parameter during app init. "
|
||||
"Such as `app(config=config, db=db)`."
|
||||
)
|
||||
|
||||
from embedchain.vectordb.chroma_db import ChromaDB
|
||||
|
||||
return ChromaDB(embedding_fn=embedding_fn, host=host, port=port, chroma_settings=chroma_settings)
|
||||
if collection_name:
|
||||
logging.warning("DEPRECATION WARNING: Please supply the collection name to the database config.")
|
||||
return
|
||||
|
||||
def _setup_logging(self, debug_level):
|
||||
level = logging.WARNING # Default level
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
from typing import Any, Optional
|
||||
from typing import Optional
|
||||
|
||||
from chromadb.api.types import Documents, Embeddings
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from embedchain.config.vectordbs import ElasticsearchDBConfig
|
||||
from embedchain.helper_classes.json_serializable import register_deserializable
|
||||
from embedchain.models import (EmbeddingFunctions, Providers, VectorDatabases,
|
||||
VectorDimensions)
|
||||
|
||||
from .BaseAppConfig import BaseAppConfig
|
||||
|
||||
@@ -22,123 +18,23 @@ class CustomAppConfig(BaseAppConfig):
|
||||
def __init__(
|
||||
self,
|
||||
log_level=None,
|
||||
embedding_fn: EmbeddingFunctions = None,
|
||||
embedding_fn_model=None,
|
||||
db=None,
|
||||
host=None,
|
||||
port=None,
|
||||
id=None,
|
||||
collection_name=None,
|
||||
provider: Providers = None,
|
||||
open_source_app_config=None,
|
||||
deployment_name=None,
|
||||
collect_metrics: Optional[bool] = None,
|
||||
db_type: VectorDatabases = None,
|
||||
es_config: ElasticsearchDBConfig = None,
|
||||
chroma_settings: dict = {},
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
:param log_level: Optional. (String) Debug level
|
||||
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
|
||||
:param embedding_fn: Optional. Embedding function to use.
|
||||
:param embedding_fn_model: Optional. Model name to use for embedding function.
|
||||
:param db: Optional. (Vector) database to use for embeddings.
|
||||
:param host: Optional. Hostname for the database server.
|
||||
:param port: Optional. Port for the database server.
|
||||
:param id: Optional. ID of the app. Document metadata will have this id.
|
||||
:param collection_name: Optional. Collection name for the database.
|
||||
:param provider: Optional. (Providers): LLM Provider to use.
|
||||
:param open_source_app_config: Optional. Config instance needed for open source apps.
|
||||
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
|
||||
:param db_type: Optional. type of Vector database to use.
|
||||
:param es_config: Optional. elasticsearch database config to be used for connection
|
||||
:param chroma_settings: Optional. Chroma settings for connection.
|
||||
:param collection_name: Optional. Default collection name.
|
||||
It's recommended to use app.set_collection_name() instead.
|
||||
"""
|
||||
if provider:
|
||||
self.provider = provider
|
||||
else:
|
||||
raise ValueError("CustomApp must have a provider assigned.")
|
||||
|
||||
self.open_source_app_config = open_source_app_config
|
||||
|
||||
super().__init__(
|
||||
log_level=log_level,
|
||||
embedding_fn=CustomAppConfig.embedding_function(
|
||||
embedding_function=embedding_fn, model=embedding_fn_model, deployment_name=deployment_name
|
||||
),
|
||||
db=db,
|
||||
host=host,
|
||||
port=port,
|
||||
id=id,
|
||||
collection_name=collection_name,
|
||||
collect_metrics=collect_metrics,
|
||||
db_type=db_type,
|
||||
vector_dim=CustomAppConfig.get_vector_dimension(embedding_function=embedding_fn),
|
||||
es_config=es_config,
|
||||
chroma_settings=chroma_settings,
|
||||
log_level=log_level, db=db, id=id, collect_metrics=collect_metrics, collection_name=collection_name
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def langchain_default_concept(embeddings: Any):
|
||||
"""
|
||||
Langchains default function layout for embeddings.
|
||||
"""
|
||||
|
||||
def embed_function(texts: Documents) -> Embeddings:
|
||||
return embeddings.embed_documents(texts)
|
||||
|
||||
return embed_function
|
||||
|
||||
@staticmethod
|
||||
def embedding_function(embedding_function: EmbeddingFunctions, model: str = None, deployment_name: str = None):
|
||||
if not isinstance(embedding_function, EmbeddingFunctions):
|
||||
raise ValueError(
|
||||
f"Invalid option: '{embedding_function}'. Expecting one of the following options: {list(map(lambda x: x.value, EmbeddingFunctions))}" # noqa: E501
|
||||
)
|
||||
|
||||
if embedding_function == EmbeddingFunctions.OPENAI:
|
||||
from langchain.embeddings import OpenAIEmbeddings
|
||||
|
||||
if model:
|
||||
embeddings = OpenAIEmbeddings(model=model)
|
||||
else:
|
||||
if deployment_name:
|
||||
embeddings = OpenAIEmbeddings(deployment=deployment_name)
|
||||
else:
|
||||
embeddings = OpenAIEmbeddings()
|
||||
return CustomAppConfig.langchain_default_concept(embeddings)
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.HUGGING_FACE:
|
||||
from langchain.embeddings import HuggingFaceEmbeddings
|
||||
|
||||
embeddings = HuggingFaceEmbeddings(model_name=model)
|
||||
return CustomAppConfig.langchain_default_concept(embeddings)
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.VERTEX_AI:
|
||||
from langchain.embeddings import VertexAIEmbeddings
|
||||
|
||||
embeddings = VertexAIEmbeddings(model_name=model)
|
||||
return CustomAppConfig.langchain_default_concept(embeddings)
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.GPT4ALL:
|
||||
# Note: We could use langchains GPT4ALL embedding, but it's not available in all versions.
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
return embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model)
|
||||
|
||||
@staticmethod
|
||||
def get_vector_dimension(embedding_function: EmbeddingFunctions):
|
||||
if not isinstance(embedding_function, EmbeddingFunctions):
|
||||
raise ValueError(f"Invalid option: '{embedding_function}'.")
|
||||
|
||||
if embedding_function == EmbeddingFunctions.OPENAI:
|
||||
return VectorDimensions.OPENAI.value
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.HUGGING_FACE:
|
||||
return VectorDimensions.HUGGING_FACE.value
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.VERTEX_AI:
|
||||
return VectorDimensions.VERTEX_AI.value
|
||||
|
||||
elif embedding_function == EmbeddingFunctions.GPT4ALL:
|
||||
return VectorDimensions.GPT4ALL.value
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from typing import Optional
|
||||
|
||||
from chromadb.utils import embedding_functions
|
||||
|
||||
from embedchain.helper_classes.json_serializable import register_deserializable
|
||||
|
||||
from .BaseAppConfig import BaseAppConfig
|
||||
@@ -16,47 +14,21 @@ class OpenSourceAppConfig(BaseAppConfig):
|
||||
def __init__(
|
||||
self,
|
||||
log_level=None,
|
||||
host=None,
|
||||
port=None,
|
||||
id=None,
|
||||
collection_name=None,
|
||||
collect_metrics: Optional[bool] = None,
|
||||
model=None,
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
:param log_level: Optional. (String) Debug level
|
||||
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
|
||||
:param id: Optional. ID of the app. Document metadata will have this id.
|
||||
:param collection_name: Optional. Collection name for the database.
|
||||
:param host: Optional. Hostname for the database server.
|
||||
:param port: Optional. Port for the database server.
|
||||
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
|
||||
:param model: Optional. GPT4ALL uses the model to instantiate the class.
|
||||
So unlike `App`, it has to be provided before querying.
|
||||
:param collection_name: Optional. Default collection name.
|
||||
It's recommended to use app.db.set_collection_name() instead.
|
||||
"""
|
||||
self.model = model or "orca-mini-3b.ggmlv3.q4_0.bin"
|
||||
|
||||
super().__init__(
|
||||
log_level=log_level,
|
||||
embedding_fn=OpenSourceAppConfig.default_embedding_function(),
|
||||
host=host,
|
||||
port=port,
|
||||
id=id,
|
||||
collection_name=collection_name,
|
||||
collect_metrics=collect_metrics,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def default_embedding_function():
|
||||
"""
|
||||
Sets embedding function to default (`all-MiniLM-L6-v2`).
|
||||
|
||||
:returns: The default embedding function
|
||||
"""
|
||||
try:
|
||||
return embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
raise ModuleNotFoundError(
|
||||
"The open source app requires extra dependencies. Install with `pip install embedchain[opensource]`"
|
||||
) from None
|
||||
super().__init__(log_level=log_level, id=id, collect_metrics=collect_metrics, collection_name=collection_name)
|
||||
|
||||
Reference in New Issue
Block a user