refactor: classes and configs (#528)

2023-09-05 10:12:58 +02:00
parent 387b042a49
commit 344e7470f6
50 changed files with 1221 additions and 997 deletions
--- a/embedchain/config/apps/AppConfig.py
+++ b/embedchain/config/apps/AppConfig.py
@@ -1,14 +1,5 @@
-import os
 from typing import Optional

-try:
-    from chromadb.utils import embedding_functions
-except RuntimeError:
-    from embedchain.utils import use_pysqlite3
-
-    use_pysqlite3()
-    from chromadb.utils import embedding_functions
-
 from embedchain.helper_classes.json_serializable import register_deserializable

 from .BaseAppConfig import BaseAppConfig
@@ -23,44 +14,14 @@ class AppConfig(BaseAppConfig):
    def __init__(
        self,
        log_level=None,
-        host=None,
-        port=None,
        id=None,
-        collection_name=None,
        collect_metrics: Optional[bool] = None,
+        collection_name: Optional[str] = None,
    ):
        """
        :param log_level: Optional. (String) Debug level
        ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
-        :param host: Optional. Hostname for the database server.
-        :param port: Optional. Port for the database server.
        :param id: Optional. ID of the app. Document metadata will have this id.
-        :param collection_name: Optional. Collection name for the database.
        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
        """
-        super().__init__(
-            log_level=log_level,
-            embedding_fn=AppConfig.default_embedding_function(),
-            host=host,
-            port=port,
-            id=id,
-            collection_name=collection_name,
-            collect_metrics=collect_metrics,
-        )
-
-    @staticmethod
-    def default_embedding_function():
-        """
-        Sets embedding function to default (`text-embedding-ada-002`).
-
-        :raises ValueError: If the template is not valid as template should contain
-        $context and $query
-        :returns: The default embedding function for the app class.
-        """
-        if os.getenv("OPENAI_API_KEY") is None and os.getenv("OPENAI_ORGANIZATION") is None:
-            raise ValueError("OPENAI_API_KEY or OPENAI_ORGANIZATION environment variables not provided")  # noqa:E501
-        return embedding_functions.OpenAIEmbeddingFunction(
-            api_key=os.getenv("OPENAI_API_KEY"),
-            organization_id=os.getenv("OPENAI_ORGANIZATION"),
-            model_name="text-embedding-ada-002",
-        )
+        super().__init__(log_level=log_level, id=id, collect_metrics=collect_metrics, collection_name=collection_name)
--- a/embedchain/config/apps/BaseAppConfig.py
+++ b/embedchain/config/apps/BaseAppConfig.py
@@ -1,9 +1,9 @@
 import logging
+from typing import Optional

 from embedchain.config.BaseConfig import BaseConfig
-from embedchain.config.vectordbs import ElasticsearchDBConfig
 from embedchain.helper_classes.json_serializable import JSONSerializable
-from embedchain.models import VectorDatabases, VectorDimensions
+from embedchain.vectordb.base_vector_db import BaseVectorDB


 class BaseAppConfig(BaseConfig, JSONSerializable):
@@ -14,81 +14,38 @@ class BaseAppConfig(BaseConfig, JSONSerializable):
    def __init__(
        self,
        log_level=None,
-        embedding_fn=None,
-        db=None,
-        host=None,
-        port=None,
+        db: Optional[BaseVectorDB] = None,
        id=None,
-        collection_name=None,
        collect_metrics: bool = True,
-        db_type: VectorDatabases = None,
-        vector_dim: VectorDimensions = None,
-        es_config: ElasticsearchDBConfig = None,
-        chroma_settings: dict = {},
+        collection_name: Optional[str] = None,
    ):
        """
        :param log_level: Optional. (String) Debug level
        ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
-        :param embedding_fn: Embedding function to use.
-        :param db: Optional. (Vector) database instance to use for embeddings.
-        :param host: Optional. Hostname for the database server.
-        :param port: Optional. Port for the database server.
+        :param db: Optional. (Vector) database instance to use for embeddings. Deprecated in favor of app(..., db).
        :param id: Optional. ID of the app. Document metadata will have this id.
-        :param collection_name: Optional. Collection name for the database.
        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
-        :param db_type: Optional. type of Vector database to use
-        :param vector_dim: Vector dimension generated by embedding fn
+        :param db_type: No longer accepted by this constructor (removed from the signature).
+        Supply a database instance as the second parameter during app init instead.
        :param es_config: Optional. elasticsearch database config to be used for connection
-        :param chroma_settings: Optional. Chroma settings for connection.
+        :param collection_name: Optional. Default collection name.
+        It's recommended to use app.db.set_collection_name() instead.
        """
        self._setup_logging(log_level)
-        self.collection_name = collection_name if collection_name else "embedchain_store"
-        self.db = BaseAppConfig.get_db(
-            db=db,
-            embedding_fn=embedding_fn,
-            host=host,
-            port=port,
-            db_type=db_type,
-            vector_dim=vector_dim,
-            collection_name=self.collection_name,
-            es_config=es_config,
-            chroma_settings=chroma_settings,
-        )
        self.id = id
        self.collect_metrics = True if (collect_metrics is True or collect_metrics is None) else False
-        return
+        self.collection_name = collection_name

-    @staticmethod
-    def get_db(db, embedding_fn, host, port, db_type, vector_dim, collection_name, es_config, chroma_settings):
-        """
-        Get db based on db_type, db with default database (`ChromaDb`)
-        :param Optional. (Vector) database to use for embeddings.
-        :param embedding_fn: Embedding function to use in database.
-        :param host: Optional. Hostname for the database server.
-        :param port: Optional. Port for the database server.
-        :param db_type: Optional. db type to use. Supported values (`es`, `chroma`)
-        :param vector_dim: Vector dimension generated by embedding fn
-        :param collection_name: Optional. Collection name for the database.
-        :param es_config: Optional. elasticsearch database config to be used for connection
-        :raises ValueError: BaseAppConfig knows no default embedding function.
-        :returns: database instance
-        """
        if db:
-            return db
-
-        if embedding_fn is None:
-            raise ValueError("ChromaDb cannot be instantiated without an embedding function")
-
-        if db_type == VectorDatabases.ELASTICSEARCH:
-            from embedchain.vectordb.elasticsearch_db import ElasticsearchDB
-
-            return ElasticsearchDB(
-                embedding_fn=embedding_fn, vector_dim=vector_dim, collection_name=collection_name, es_config=es_config
+            self._db = db
+            logging.warning(
+                "DEPRECATION WARNING: Please supply the database as the second parameter during app init. "
+                "Such as `app(config=config, db=db)`."
            )

-        from embedchain.vectordb.chroma_db import ChromaDB
-
-        return ChromaDB(embedding_fn=embedding_fn, host=host, port=port, chroma_settings=chroma_settings)
+        if collection_name:
+            logging.warning("DEPRECATION WARNING: Please supply the collection name to the database config.")
+        return

    def _setup_logging(self, debug_level):
        level = logging.WARNING  # Default level
--- a/embedchain/config/apps/CustomAppConfig.py
+++ b/embedchain/config/apps/CustomAppConfig.py
@@ -1,12 +1,8 @@
-from typing import Any, Optional
+from typing import Optional

-from chromadb.api.types import Documents, Embeddings
 from dotenv import load_dotenv

-from embedchain.config.vectordbs import ElasticsearchDBConfig
 from embedchain.helper_classes.json_serializable import register_deserializable
-from embedchain.models import (EmbeddingFunctions, Providers, VectorDatabases,
-                               VectorDimensions)

 from .BaseAppConfig import BaseAppConfig

@@ -22,123 +18,23 @@ class CustomAppConfig(BaseAppConfig):
    def __init__(
        self,
        log_level=None,
-        embedding_fn: EmbeddingFunctions = None,
-        embedding_fn_model=None,
        db=None,
-        host=None,
-        port=None,
        id=None,
-        collection_name=None,
-        provider: Providers = None,
-        open_source_app_config=None,
-        deployment_name=None,
        collect_metrics: Optional[bool] = None,
-        db_type: VectorDatabases = None,
-        es_config: ElasticsearchDBConfig = None,
-        chroma_settings: dict = {},
+        collection_name: Optional[str] = None,
    ):
        """
        :param log_level: Optional. (String) Debug level
        ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
-        :param embedding_fn: Optional. Embedding function to use.
-        :param embedding_fn_model: Optional. Model name to use for embedding function.
        :param db: Optional. (Vector) database to use for embeddings.
-        :param host: Optional. Hostname for the database server.
-        :param port: Optional. Port for the database server.
        :param id: Optional. ID of the app. Document metadata will have this id.
-        :param collection_name: Optional. Collection name for the database.
        :param provider: Optional. (Providers): LLM Provider to use.
        :param open_source_app_config: Optional. Config instance needed for open source apps.
        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
-        :param db_type: Optional. type of Vector database to use.
-        :param es_config: Optional. elasticsearch database config to be used for connection
-        :param chroma_settings: Optional. Chroma settings for connection.
+        :param collection_name: Optional. Default collection name.
+        It's recommended to use app.db.set_collection_name() instead.
        """
-        if provider:
-            self.provider = provider
-        else:
-            raise ValueError("CustomApp must have a provider assigned.")
-
-        self.open_source_app_config = open_source_app_config

        super().__init__(
-            log_level=log_level,
-            embedding_fn=CustomAppConfig.embedding_function(
-                embedding_function=embedding_fn, model=embedding_fn_model, deployment_name=deployment_name
-            ),
-            db=db,
-            host=host,
-            port=port,
-            id=id,
-            collection_name=collection_name,
-            collect_metrics=collect_metrics,
-            db_type=db_type,
-            vector_dim=CustomAppConfig.get_vector_dimension(embedding_function=embedding_fn),
-            es_config=es_config,
-            chroma_settings=chroma_settings,
+            log_level=log_level, db=db, id=id, collect_metrics=collect_metrics, collection_name=collection_name
        )
-
-    @staticmethod
-    def langchain_default_concept(embeddings: Any):
-        """
-        Langchains default function layout for embeddings.
-        """
-
-        def embed_function(texts: Documents) -> Embeddings:
-            return embeddings.embed_documents(texts)
-
-        return embed_function
-
-    @staticmethod
-    def embedding_function(embedding_function: EmbeddingFunctions, model: str = None, deployment_name: str = None):
-        if not isinstance(embedding_function, EmbeddingFunctions):
-            raise ValueError(
-                f"Invalid option: '{embedding_function}'. Expecting one of the following options: {list(map(lambda x: x.value, EmbeddingFunctions))}"  # noqa: E501
-            )
-
-        if embedding_function == EmbeddingFunctions.OPENAI:
-            from langchain.embeddings import OpenAIEmbeddings
-
-            if model:
-                embeddings = OpenAIEmbeddings(model=model)
-            else:
-                if deployment_name:
-                    embeddings = OpenAIEmbeddings(deployment=deployment_name)
-                else:
-                    embeddings = OpenAIEmbeddings()
-            return CustomAppConfig.langchain_default_concept(embeddings)
-
-        elif embedding_function == EmbeddingFunctions.HUGGING_FACE:
-            from langchain.embeddings import HuggingFaceEmbeddings
-
-            embeddings = HuggingFaceEmbeddings(model_name=model)
-            return CustomAppConfig.langchain_default_concept(embeddings)
-
-        elif embedding_function == EmbeddingFunctions.VERTEX_AI:
-            from langchain.embeddings import VertexAIEmbeddings
-
-            embeddings = VertexAIEmbeddings(model_name=model)
-            return CustomAppConfig.langchain_default_concept(embeddings)
-
-        elif embedding_function == EmbeddingFunctions.GPT4ALL:
-            # Note: We could use langchains GPT4ALL embedding, but it's not available in all versions.
-            from chromadb.utils import embedding_functions
-
-            return embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model)
-
-    @staticmethod
-    def get_vector_dimension(embedding_function: EmbeddingFunctions):
-        if not isinstance(embedding_function, EmbeddingFunctions):
-            raise ValueError(f"Invalid option: '{embedding_function}'.")
-
-        if embedding_function == EmbeddingFunctions.OPENAI:
-            return VectorDimensions.OPENAI.value
-
-        elif embedding_function == EmbeddingFunctions.HUGGING_FACE:
-            return VectorDimensions.HUGGING_FACE.value
-
-        elif embedding_function == EmbeddingFunctions.VERTEX_AI:
-            return VectorDimensions.VERTEX_AI.value
-
-        elif embedding_function == EmbeddingFunctions.GPT4ALL:
-            return VectorDimensions.GPT4ALL.value
--- a/embedchain/config/apps/OpenSourceAppConfig.py
+++ b/embedchain/config/apps/OpenSourceAppConfig.py
@@ -1,7 +1,5 @@
 from typing import Optional

-from chromadb.utils import embedding_functions
-
 from embedchain.helper_classes.json_serializable import register_deserializable

 from .BaseAppConfig import BaseAppConfig
@@ -16,47 +14,21 @@ class OpenSourceAppConfig(BaseAppConfig):
    def __init__(
        self,
        log_level=None,
-        host=None,
-        port=None,
        id=None,
-        collection_name=None,
        collect_metrics: Optional[bool] = None,
        model=None,
+        collection_name: Optional[str] = None,
    ):
        """
        :param log_level: Optional. (String) Debug level
        ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
        :param id: Optional. ID of the app. Document metadata will have this id.
-        :param collection_name: Optional. Collection name for the database.
-        :param host: Optional. Hostname for the database server.
-        :param port: Optional. Port for the database server.
        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
        :param model: Optional. GPT4ALL uses the model to instantiate the class.
        So unlike `App`, it has to be provided before querying.
+        :param collection_name: Optional. Default collection name.
+        It's recommended to use app.db.set_collection_name() instead.
        """
        self.model = model or "orca-mini-3b.ggmlv3.q4_0.bin"

-        super().__init__(
-            log_level=log_level,
-            embedding_fn=OpenSourceAppConfig.default_embedding_function(),
-            host=host,
-            port=port,
-            id=id,
-            collection_name=collection_name,
-            collect_metrics=collect_metrics,
-        )
-
-    @staticmethod
-    def default_embedding_function():
-        """
-        Sets embedding function to default (`all-MiniLM-L6-v2`).
-
-        :returns: The default embedding function
-        """
-        try:
-            return embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
-        except ValueError as e:
-            print(e)
-            raise ModuleNotFoundError(
-                "The open source app requires extra dependencies. Install with `pip install embedchain[opensource]`"
-            ) from None
+        super().__init__(log_level=log_level, id=id, collect_metrics=collect_metrics, collection_name=collection_name)