refactor: classes and configs (#528)

2023-09-05 10:12:58 +02:00
parent 387b042a49
commit 344e7470f6
50 changed files with 1221 additions and 997 deletions
--- a/embedchain/vectordb/base_vector_db.py
+++ b/embedchain/vectordb/base_vector_db.py
@@ -1,11 +1,22 @@
+from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig
+from embedchain.embedder.base_embedder import BaseEmbedder
 from embedchain.helper_classes.json_serializable import JSONSerializable


 class BaseVectorDB(JSONSerializable):
    """Base class for vector database."""

-    def __init__(self):
+    def __init__(self, config: BaseVectorDbConfig):
        self.client = self._get_or_create_db()
+        self.config: BaseVectorDbConfig = config
+
+    def _initialize(self):
+        """
+        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
+
+        So it can't be done in __init__ in one step.
+        """
+        raise NotImplementedError

    def _get_or_create_db(self):
        """Get or create the database."""
@@ -14,6 +25,9 @@ class BaseVectorDB(JSONSerializable):
    def _get_or_create_collection(self):
        raise NotImplementedError

+    def _set_embedder(self, embedder: BaseEmbedder):
+        self.embedder = embedder
+
    def get(self):
        raise NotImplementedError

@@ -28,3 +42,6 @@ class BaseVectorDB(JSONSerializable):

    def reset(self):
        raise NotImplementedError
+
+    def set_collection_name(self, name: str):
+        raise NotImplementedError
--- a/embedchain/vectordb/chroma_db.py
+++ b/embedchain/vectordb/chroma_db.py
@@ -1,53 +1,63 @@
 import logging
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

-from chromadb.errors import InvalidDimensionException
 from langchain.docstore.document import Document

+from embedchain.config import ChromaDbConfig
+from embedchain.helper_classes.json_serializable import register_deserializable
+from embedchain.vectordb.base_vector_db import BaseVectorDB
+
 try:
    import chromadb
+    from chromadb.config import Settings
+    from chromadb.errors import InvalidDimensionException
 except RuntimeError:
    from embedchain.utils import use_pysqlite3

    use_pysqlite3()
    import chromadb
-
-from chromadb.config import Settings
-
-from embedchain.helper_classes.json_serializable import register_deserializable
-from embedchain.vectordb.base_vector_db import BaseVectorDB
+    from chromadb.config import Settings
+    from chromadb.errors import InvalidDimensionException


@register_deserializable
 class ChromaDB(BaseVectorDB):
    """Vector database using ChromaDB."""

-    def __init__(self, db_dir=None, embedding_fn=None, host=None, port=None, chroma_settings={}):
-        self.embedding_fn = embedding_fn
-
-        if not hasattr(embedding_fn, "__call__"):
-            raise ValueError("Embedding function is not a function")
+    def __init__(self, config: Optional[ChromaDbConfig] = None):
+        if config:
+            self.config = config
+        else:
+            self.config = ChromaDbConfig()

        self.settings = Settings()
-        for key, value in chroma_settings.items():
-            if hasattr(self.settings, key):
-                setattr(self.settings, key, value)
+        if self.config.chroma_settings:
+            for key, value in self.config.chroma_settings.items():
+                if hasattr(self.settings, key):
+                    setattr(self.settings, key, value)

-        if host and port:
-            logging.info(f"Connecting to ChromaDB server: {host}:{port}")
-            self.settings.chroma_server_host = host
-            self.settings.chroma_server_http_port = port
+        if self.config.host and self.config.port:
+            logging.info(f"Connecting to ChromaDB server: {self.config.host}:{self.config.port}")
+            self.settings.chroma_server_host = self.config.host
+            self.settings.chroma_server_http_port = self.config.port
            self.settings.chroma_api_impl = "chromadb.api.fastapi.FastAPI"
-
        else:
-            if db_dir is None:
-                db_dir = "db"
+            if self.config.dir is None:
+                self.config.dir = "db"

-            self.settings.persist_directory = db_dir
+            self.settings.persist_directory = self.config.dir
            self.settings.is_persistent = True

        self.client = chromadb.Client(self.settings)
-        super().__init__()
+        super().__init__(config=self.config)
+
+    def _initialize(self):
+        """
+        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
+        """
+        if not self.embedder:
+            raise ValueError("Embedder not set. Please set an embedder with `set_embedder` before initialization.")
+        self._get_or_create_collection(self.config.collection_name)

    def _get_or_create_db(self):
        """Get or create the database."""
@@ -55,9 +65,11 @@ class ChromaDB(BaseVectorDB):

    def _get_or_create_collection(self, name):
        """Get or create the collection."""
+        if not hasattr(self, "embedder") or not self.embedder:
+            raise ValueError("Cannot create a Chroma database collection without an embedder.")
        self.collection = self.client.get_or_create_collection(
            name=name,
-            embedding_function=self.embedding_fn,
+            embedding_function=self.embedder.embedding_fn,
        )
        return self.collection

@@ -119,9 +131,37 @@ class ChromaDB(BaseVectorDB):
        contents = [result[0].page_content for result in results_formatted]
        return contents

+    def set_collection_name(self, name: str):
+        self.config.collection_name = name
+        self._get_or_create_collection(self.config.collection_name)
+
    def count(self) -> int:
+        """
+        Count the number of embeddings.
+
+        :return: The number of embeddings.
+        """
        return self.collection.count()

    def reset(self):
+        """
+        Resets the database. Deletes all embeddings irreversibly.
+        `App` does not have to be reinitialized after using this method.
+        """
        # Delete all data from the database
-        self.client.reset()
+        try:
+            self.client.reset()
+        except ValueError:
+            raise ValueError(
+                "For safety reasons, resetting is disabled."
+                'Please enable it by including `chromadb_settings={"allow_reset": True}` in your ChromaDbConfig'
+            ) from None
+        # Recreate
+        self._get_or_create_collection(self.config.collection_name)
+
+        # Todo: Automatically recreating a collection with the same name cannot be the best way to handle a reset.
+        # A downside of this implementation is, if you have two instances,
+        # the other instance will not get the updated `self.collection` attribute.
+        # A better way would be to create the collection if it is called again after being reset.
+        # That means, checking if collection exists in the db-consuming methods, and creating it if it doesn't.
+        # That's an extra step for all uses, just to satisfy a niche use case in a niche method. For now, this will do.
--- a/embedchain/vectordb/elasticsearch_db.py
+++ b/embedchain/vectordb/elasticsearch_db.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, List
+from typing import Any, Dict, List

 try:
    from elasticsearch import Elasticsearch
@@ -10,7 +10,6 @@ except ImportError:

 from embedchain.config import ElasticsearchDBConfig
 from embedchain.helper_classes.json_serializable import register_deserializable
-from embedchain.models.VectorDimensions import VectorDimensions
 from embedchain.vectordb.base_vector_db import BaseVectorDB


@@ -18,43 +17,40 @@ from embedchain.vectordb.base_vector_db import BaseVectorDB
 class ElasticsearchDB(BaseVectorDB):
    def __init__(
        self,
-        es_config: ElasticsearchDBConfig = None,
-        embedding_fn: Callable[[list[str]], list[str]] = None,
-        vector_dim: VectorDimensions = None,
-        collection_name: str = None,
+        config: ElasticsearchDBConfig = None,
+        es_config: ElasticsearchDBConfig = None,  # Backwards compatibility
    ):
        """
        Elasticsearch as vector database
        :param es_config. elasticsearch database config to be used for connection
        :param embedding_fn: Function to generate embedding vectors.
        :param vector_dim: Vector dimension generated by embedding fn
-        :param collection_name: Optional. Collection name for the database.
        """
-        if not hasattr(embedding_fn, "__call__"):
-            raise ValueError("Embedding function is not a function")
-        if es_config is None:
+        if config is None and es_config is None:
            raise ValueError("ElasticsearchDBConfig is required")
-        if vector_dim is None:
-            raise ValueError("Vector Dimension is required to refer correct index and mapping")
-        if collection_name is None:
-            raise ValueError("collection name is required. It cannot be empty")
-        self.embedding_fn = embedding_fn
+        self.config = config or es_config
        self.client = Elasticsearch(es_config.ES_URL, **es_config.ES_EXTRA_PARAMS)
-        self.vector_dim = vector_dim
-        self.es_index = f"{collection_name}_{self.vector_dim}"
+
+        # Call parent init here because embedder is needed
+        super().__init__(config=self.config)
+
+    def _initialize(self):
+        """
+        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
+        """
        index_settings = {
            "mappings": {
                "properties": {
                    "text": {"type": "text"},
-                    "embeddings": {"type": "dense_vector", "index": False, "dims": self.vector_dim},
+                    "embeddings": {"type": "dense_vector", "index": False, "dims": self.embedder.vector_dimension},
                }
            }
        }
-        if not self.client.indices.exists(index=self.es_index):
+        es_index = self._get_index()
+        if not self.client.indices.exists(index=es_index):
            # create index if not exist
-            print("Creating index", self.es_index, index_settings)
-            self.client.indices.create(index=self.es_index, body=index_settings)
-        super().__init__()
+            print("Creating index", es_index, index_settings)
+            self.client.indices.create(index=es_index, body=index_settings)

    def _get_or_create_db(self):
        return self.client
@@ -85,17 +81,17 @@ class ElasticsearchDB(BaseVectorDB):
        :param ids: ids of docs
        """
        docs = []
-        embeddings = self.embedding_fn(documents)
+        embeddings = self.config.embedding_fn(documents)
        for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
            docs.append(
                {
-                    "_index": self.es_index,
+                    "_index": self._get_index(),
                    "_id": id,
                    "_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
                }
            )
        bulk(self.client, docs)
-        self.client.indices.refresh(index=self.es_index)
+        self.client.indices.refresh(index=self._get_index())
        return

    def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
@@ -105,7 +101,7 @@ class ElasticsearchDB(BaseVectorDB):
        :param n_results: no of similar documents to fetch from database
        :param where: Optional. to filter data
        """
-        input_query_vector = self.embedding_fn(input_query)
+        input_query_vector = self.config.embedding_fn(input_query)
        query_vector = input_query_vector[0]
        query = {
            "script_score": {
@@ -120,11 +116,14 @@ class ElasticsearchDB(BaseVectorDB):
            app_id = where["app_id"]
            query["script_score"]["query"]["bool"]["must"] = [{"term": {"metadata.app_id": app_id}}]
        _source = ["text"]
-        response = self.client.search(index=self.es_index, query=query, _source=_source, size=n_results)
+        response = self.client.search(index=self._get_index(), query=query, _source=_source, size=n_results)
        docs = response["hits"]["hits"]
        contents = [doc["_source"]["text"] for doc in docs]
        return contents

+    def set_collection_name(self, name: str):
+        self.config.collection_name = name
+
    def count(self) -> int:
        query = {"match_all": {}}
        response = self.client.count(index=self.es_index, query=query)
@@ -136,3 +135,8 @@ class ElasticsearchDB(BaseVectorDB):
        if self.client.indices.exists(index=self.es_index):
            # delete index in Es
            self.client.indices.delete(index=self.es_index)
+
+    def _get_index(self):
+        # NOTE: A method is preferred to an attribute because the index name is recomputed on every call,
+        # so it stays up-to-date if the collection name changes.
+        return f"{self.config.collection_name}_{self.config.vector_dim}"