Add support for OpenSearch as vector database (#725)

2023-09-28 14:54:42 -07:00
parent 9951b58005
commit 414c69fd62
22 changed files with 326 additions and 82 deletions
--- a/15
+++ b/15
@@ -7,8 +7,19 @@ PROJECT_NAME := embedchain
 .PHONY: install format lint clean test ci_lint ci_test

 install:
-	$(PIP) install --upgrade pip
-	$(PIP) install -e .[dev]
+	poetry install
+
+install_es:
+	poetry install --extras elasticsearch
+
+install_opensearch:
+	poetry install --extras opensearch
+
+shell:
+	poetry shell
+
+py_shell:
+	poetry run python

 format:
 	$(PYTHON) -m black .
--- a/docs/advanced/interface_types.mdx
+++ b/docs/advanced/interface_types.mdx
@@ -70,6 +70,6 @@ app.reset()
 Counts the number of embeddings (chunks) in the database.

 ```python
-print(app.count())
+print(app.db.count())
 # returns: 481
 ```
--- a/docs/advanced/vector_database.mdx
+++ b/docs/advanced/vector_database.mdx
@@ -2,7 +2,7 @@
 title: '💾 Vector Database'
 ---

-We support `Chroma` and `Elasticsearch` as two vector database. 
+We support `Chroma`, `Elasticsearch` and `OpenSearch` as vector databases.
 `Chroma` is used as a default database.

 ## Elasticsearch
@@ -22,13 +22,13 @@ Please note that the key needs certain privileges. For testing you can just togg
 2. Load the app
 ```python
 from embedchain import CustomApp
-from embedchain.embedder.openai import OpenAiEmbedder
+from embedchain.embedder.openai import OpenAIEmbedder
 from embedchain.llm.openai import OpenAILlm
 from embedchain.vectordb.elasticsearch import ElasticsearchDB

 es_app = CustomApp(
    llm=OpenAILlm(),
-    embedder=OpenAiEmbedder(),
+    embedder=OpenAIEmbedder(),
    db=ElasticsearchDB(),
 )
 ```
@@ -45,7 +45,7 @@ import os

 from embedchain import CustomApp
 from embedchain.config import CustomAppConfig, ElasticsearchDBConfig
-from embedchain.embedder.openai import OpenAiEmbedder
+from embedchain.embedder.openai import OpenAIEmbedder
 from embedchain.llm.openai import OpenAILlm
 from embedchain.vectordb.elasticsearch import ElasticsearchDB

@@ -61,10 +61,58 @@ es_config = ElasticsearchDBConfig(
 es_app = CustomApp(
    config=CustomAppConfig(log_level="INFO"),
    llm=OpenAILlm(),
-    embedder=OpenAiEmbedder(),
+    embedder=OpenAIEmbedder(),
    db=ElasticsearchDB(config=es_config),
 )
 ```
 3. This should log your connection details to the console.
 4. Alternatively to a URL, you `ElasticsearchDBConfig` accepts `es_url` as a list of nodes url with different hosts and ports.
 5. Additionally we can pass named parameters supported by Python Elasticsearch client.
+
+
+## OpenSearch 🔍
+
+To use OpenSearch as a vector database with a CustomApp, follow these simple steps:
+
+1. Set the `OPENAI_API_KEY` environment variable:
+
+```
+OPENAI_API_KEY=sk-xxxx
+```
+
+2. Define the OpenSearch configuration in your Python code:
+
+```python
+from embedchain import CustomApp
+from embedchain.config import OpenSearchDBConfig
+from embedchain.embedder.openai import OpenAIEmbedder
+from embedchain.llm.openai import OpenAILlm
+from embedchain.vectordb.opensearch import OpenSearchDB
+
+opensearch_url = "https://localhost:9200"
+http_auth = ("username", "password")
+
+db_config = OpenSearchDBConfig(
+    opensearch_url=opensearch_url,
+    http_auth=http_auth,
+    collection_name="embedchain-app",
+    use_ssl=True,
+    timeout=30,
+)
+db = OpenSearchDB(config=db_config)
+```
+
+2. Instantiate the app and add data:
+
+```python
+app = CustomApp(llm=OpenAILlm(), embedder=OpenAIEmbedder(), db=db)
+app.add("https://en.wikipedia.org/wiki/Elon_Musk")
+app.add("https://www.forbes.com/profile/elon-musk")
+app.add("https://www.britannica.com/biography/Elon-Musk")
+```
+
+3. You're all set! Start querying using the following command:
+
+```python
+app.query("What is the net worth of Elon Musk?")
+```
--- a/embedchain/apps/Llama2App.py
+++ b/embedchain/apps/Llama2App.py
@@ -2,7 +2,7 @@ from typing import Optional

 from embedchain.apps.custom_app import CustomApp
 from embedchain.config import CustomAppConfig
-from embedchain.embedder.openai import OpenAiEmbedder
+from embedchain.embedder.openai import OpenAIEmbedder
 from embedchain.helper.json_serializable import register_deserializable
 from embedchain.llm.llama2 import Llama2Llm
 from embedchain.vectordb.chroma import ChromaDB
@@ -29,5 +29,5 @@ class Llama2App(CustomApp):
            config = CustomAppConfig()

        super().__init__(
-            config=config, llm=Llama2Llm(), db=ChromaDB(), embedder=OpenAiEmbedder(), system_prompt=system_prompt
+            config=config, llm=Llama2Llm(), db=ChromaDB(), embedder=OpenAIEmbedder(), system_prompt=system_prompt
        )
--- a/embedchain/apps/app.py
+++ b/embedchain/apps/app.py
@@ -3,7 +3,7 @@ from typing import Optional
 from embedchain.config import (AppConfig, BaseEmbedderConfig, BaseLlmConfig,
                               ChromaDbConfig)
 from embedchain.embedchain import EmbedChain
-from embedchain.embedder.openai import OpenAiEmbedder
+from embedchain.embedder.openai import OpenAIEmbedder
 from embedchain.helper.json_serializable import register_deserializable
 from embedchain.llm.openai import OpenAILlm
 from embedchain.vectordb.chroma import ChromaDB
@@ -48,7 +48,7 @@ class App(EmbedChain):
            config = AppConfig()

        llm = OpenAILlm(config=llm_config)
-        embedder = OpenAiEmbedder(config=BaseEmbedderConfig(model="text-embedding-ada-002"))
+        embedder = OpenAIEmbedder(config=BaseEmbedderConfig(model="text-embedding-ada-002"))
        database = ChromaDB(config=chromadb_config)

        super().__init__(config, llm, db=database, embedder=embedder, system_prompt=system_prompt)
--- a/embedchain/bots/base.py
+++ b/embedchain/bots/base.py
@@ -2,7 +2,7 @@ from typing import Any

 from embedchain import CustomApp
 from embedchain.config import AddConfig, CustomAppConfig, LlmConfig
-from embedchain.embedder.openai import OpenAiEmbedder
+from embedchain.embedder.openai import OpenAIEmbedder
 from embedchain.helper.json_serializable import (JSONSerializable,
                                                 register_deserializable)
 from embedchain.llm.openai import OpenAILlm
@@ -12,7 +12,7 @@ from embedchain.vectordb.chroma import ChromaDB
@register_deserializable
 class BaseBot(JSONSerializable):
    def __init__(self):
-        self.app = CustomApp(config=CustomAppConfig(), llm=OpenAILlm(), db=ChromaDB(), embedder=OpenAiEmbedder())
+        self.app = CustomApp(config=CustomAppConfig(), llm=OpenAILlm(), db=ChromaDB(), embedder=OpenAIEmbedder())

    def add(self, data: Any, config: AddConfig = None):
        """
--- a/embedchain/config/init.py
+++ b/embedchain/config/init.py
@@ -5,9 +5,10 @@ from .apps.app_config import AppConfig
 from .apps.custom_app_config import CustomAppConfig
 from .apps.open_source_app_config import OpenSourceAppConfig
 from .base_config import BaseConfig
-from .embedder.BaseEmbedderConfig import BaseEmbedderConfig
-from .embedder.BaseEmbedderConfig import BaseEmbedderConfig as EmbedderConfig
+from .embedder.base import BaseEmbedderConfig
+from .embedder.base import BaseEmbedderConfig as EmbedderConfig
 from .llm.base_llm_config import BaseLlmConfig
 from .llm.base_llm_config import BaseLlmConfig as LlmConfig
-from .vectordbs.ChromaDbConfig import ChromaDbConfig
-from .vectordbs.ElasticsearchDBConfig import ElasticsearchDBConfig
+from .vectordb.chroma import ChromaDbConfig
+from .vectordb.elasticsearch import ElasticsearchDBConfig
+from .vectordb.opensearch import OpenSearchDBConfig
--- a/embedchain/config/embedder/BaseEmbedderConfig.py
+++ b/embedchain/config/embedder/BaseEmbedderConfig.py
--- a/embedchain/config/vectordbs/init.py
+++ b/embedchain/config/vectordbs/init.py
--- a/embedchain/config/vectordbs/BaseVectorDbConfig.py
+++ b/embedchain/config/vectordbs/BaseVectorDbConfig.py
--- a/embedchain/config/vectordbs/ChromaDbConfig.py
+++ b/embedchain/config/vectordbs/ChromaDbConfig.py
@@ -1,6 +1,6 @@
 from typing import Optional

-from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig
+from embedchain.config.vectordb.base import BaseVectorDbConfig
 from embedchain.helper.json_serializable import register_deserializable


--- a/embedchain/config/vectordbs/ElasticsearchDBConfig.py
+++ b/embedchain/config/vectordbs/ElasticsearchDBConfig.py
@@ -1,7 +1,7 @@
 import os
 from typing import Dict, List, Optional, Union

-from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig
+from embedchain.config.vectordb.base import BaseVectorDbConfig
 from embedchain.helper.json_serializable import register_deserializable


--- a/embedchain/config/vectordb/opensearch.py
+++ b/embedchain/config/vectordb/opensearch.py
@@ -0,0 +1,37 @@
+from typing import Dict, Optional, Tuple
+
+from embedchain.config.vectordb.base import BaseVectorDbConfig
+from embedchain.helper.json_serializable import register_deserializable
+
+
+@register_deserializable
+class OpenSearchDBConfig(BaseVectorDbConfig):
+    def __init__(
+        self,
+        opensearch_url: str,
+        http_auth: Tuple[str, str],
+        vector_dimension: int = 1536,
+        collection_name: Optional[str] = None,
+        dir: Optional[str] = None,
+        **extra_params: Dict[str, any],
+    ):
+        """
+        Initializes a configuration class instance for an OpenSearch client.
+
+        :param collection_name: Default name for the collection, defaults to None
+        :type collection_name: Optional[str], optional
+        :param opensearch_url: URL of the OpenSearch domain
+        :type opensearch_url: str, Eg, "http://localhost:9200"
+        :param http_auth: Tuple of username and password
+        :type http_auth: Tuple[str, str], Eg, ("username", "password")
+        :param vector_dimension: Dimension of  the vector, defaults to 1536 (openai embedding model)
+        :type vector_dimension: int, optional
+        :param dir: Path to the database directory, where the database is stored, defaults to None
+        :type dir: Optional[str], optional
+        """
+        self.opensearch_url = opensearch_url
+        self.http_auth = http_auth
+        self.vector_dimension = vector_dimension
+        self.extra_params = extra_params
+
+        super().__init__(collection_name=collection_name, dir=dir)
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -61,16 +61,13 @@ class EmbedChain(JSONSerializable):
        """

        self.config = config
-
-        # Add subclasses
-        ## Llm
+        # Llm
        self.llm = llm
-        ## Database
        # Database has support for config assignment for backwards compatibility
        if db is None and (not hasattr(self.config, "db") or self.config.db is None):
            raise ValueError("App requires Database.")
        self.db = db or self.config.db
-        ## Embedder
+        # Embedder
        if embedder is None:
            raise ValueError("App requires Embedder.")
        self.embedder = embedder
@@ -256,7 +253,6 @@ class EmbedChain(JSONSerializable):
        )
        return self.add(source=source, data_type=data_type, metadata=metadata, config=config)

-
    def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
        """
        Get id of existing document for a given source, based on the data type
@@ -395,10 +391,10 @@ class EmbedChain(JSONSerializable):
            return list(documents), metadatas, ids, 0

        # Count before, to calculate a delta in the end.
-        chunks_before_addition = self.count()
+        chunks_before_addition = self.db.count()

        self.db.add(documents=documents, metadatas=metadatas, ids=ids)
-        count_new_chunks = self.count() - chunks_before_addition
+        count_new_chunks = self.db.count() - chunks_before_addition
        print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
        return list(documents), metadatas, ids, count_new_chunks

--- a/embedchain/embedder/base.py
+++ b/embedchain/embedder/base.py
@@ -1,6 +1,6 @@
 from typing import Any, Callable, Optional

-from embedchain.config.embedder.BaseEmbedderConfig import BaseEmbedderConfig
+from embedchain.config.embedder.base import BaseEmbedderConfig

 try:
    from chromadb.api.types import Documents, Embeddings
--- a/embedchain/embedder/openai.py
+++ b/embedchain/embedder/openai.py
@@ -16,7 +16,7 @@ except RuntimeError:
    from chromadb.utils import embedding_functions


-class OpenAiEmbedder(BaseEmbedder):
+class OpenAIEmbedder(BaseEmbedder):
    def __init__(self, config: Optional[BaseEmbedderConfig] = None):
        super().__init__(config=config)
        if self.config.model is None:
--- a/embedchain/models/vector_databases.py
+++ b/embedchain/models/vector_databases.py
@@ -4,3 +4,4 @@ from enum import Enum
 class VectorDatabases(Enum):
    CHROMADB = "CHROMADB"
    ELASTICSEARCH = "ELASTICSEARCH"
+    OPENSEARCH = "OPENSEARCH"
--- a/embedchain/vectordb/base.py
+++ b/embedchain/vectordb/base.py
@@ -1,4 +1,4 @@
-from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig
+from embedchain.config.vectordb.base import BaseVectorDbConfig
 from embedchain.embedder.base import BaseEmbedder
 from embedchain.helper.json_serializable import JSONSerializable

--- a/embedchain/vectordb/base_vector_db.py
+++ b/embedchain/vectordb/base_vector_db.py
@@ -1,50 +0,0 @@
-from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig
-from embedchain.embedder.base_embedder import BaseEmbedder
-from embedchain.helper_classes.json_serializable import JSONSerializable
-
-
-class BaseVectorDB(JSONSerializable):
-    """Base class for vector database."""
-
-    def __init__(self, config: BaseVectorDbConfig):
-        self.client = self._get_or_create_db()
-        self.config: BaseVectorDbConfig = config
-
-    def _initialize(self):
-        """
-        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
-
-        So it's can't be done in __init__ in one step.
-        """
-        raise NotImplementedError
-
-    def _get_or_create_db(self):
-        """Get or create the database."""
-        raise NotImplementedError
-
-    def _get_or_create_collection(self):
-        raise NotImplementedError
-
-    def _set_embedder(self, embedder: BaseEmbedder):
-        self.embedder = embedder
-
-    def get(self):
-        raise NotImplementedError
-
-    def add(self):
-        raise NotImplementedError
-
-    def query(self):
-        raise NotImplementedError
-
-    def count(self):
-        raise NotImplementedError
-
-    def delete(self):
-        raise NotImplementedError
-
-    def reset(self):
-        raise NotImplementedError
-
-    def set_collection_name(self, name: str):
-        raise NotImplementedError
--- a/embedchain/vectordb/chroma.py
+++ b/embedchain/vectordb/chroma.py
@@ -63,7 +63,9 @@ class ChromaDB(BaseVectorDB):
        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
        """
        if not self.embedder:
-            raise ValueError("Embedder not set. Please set an embedder with `set_embedder` before initialization.")
+            raise ValueError(
+                "Embedder not set. Please set an embedder with `_set_embedder()` function before initialization."
+            )
        self._get_or_create_collection(self.config.collection_name)

    def _get_or_create_db(self):
--- a/embedchain/vectordb/opensearch.py
+++ b/embedchain/vectordb/opensearch.py
@@ -0,0 +1,196 @@
+import logging
+from typing import Dict, List, Optional, Set
+
+try:
+    from opensearchpy import OpenSearch
+    from opensearchpy.helpers import bulk
+except ImportError:
+    raise ImportError(
+        "OpenSearch requires extra dependencies. Install with `pip install --upgrade embedchain[opensearch]`"
+    ) from None
+
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import OpenSearchVectorSearch
+
+from embedchain.config import OpenSearchDBConfig
+from embedchain.helper.json_serializable import register_deserializable
+from embedchain.vectordb.base import BaseVectorDB
+
+
+@register_deserializable
+class OpenSearchDB(BaseVectorDB):
+    """
+    OpenSearch as vector database
+    """
+
+    def __init__(self, config: OpenSearchDBConfig):
+        """OpenSearch as vector database.
+
+        :param config: OpenSearch domain config
+        :type config: OpenSearchDBConfig
+        """
+        if config is None:
+            raise ValueError("OpenSearchDBConfig is required")
+        self.config = config
+        self.client = OpenSearch(
+            hosts=[self.config.opensearch_url],
+            http_auth=self.config.http_auth,
+            **self.config.extra_params,
+        )
+        info = self.client.info()
+        logging.info(f"Connected to {info['version']['distribution']}. Version: {info['version']['number']}")
+        # Remove auth credentials from config after successful connection
+        super().__init__(config=self.config)
+
+    def _initialize(self):
+        logging.info(self.client.info())
+        index_name = self._get_index()
+        if self.client.indices.exists(index=index_name):
+            print(f"Index '{index_name}' already exists.")
+            return
+
+        index_body = {
+            "settings": {"knn": True},
+            "mappings": {
+                "properties": {
+                    "text": {"type": "text"},
+                    "embeddings": {
+                        "type": "knn_vector",
+                        "index": False,
+                        "dimension": self.config.vector_dimension,
+                    },
+                }
+            },
+        }
+        self.client.indices.create(index_name, body=index_body)
+        print(self.client.indices.get(index_name))
+
+    def _get_or_create_db(self):
+        """Called during initialization"""
+        return self.client
+
+    def _get_or_create_collection(self, name):
+        """Note: nothing to return here. Discuss later"""
+
+    def get(
+        self, ids: Optional[List[str]] = None, where: Optional[Dict[str, any]] = None, limit: Optional[int] = None
+    ) -> Set[str]:
+        """
+        Get existing doc ids present in vector database
+
+        :param ids: _list of doc ids to check for existence
+        :type ids: List[str]
+        :param where: to filter data
+        :type where: Dict[str, any]
+        :return: ids
+        :type: Set[str]
+        """
+        if ids:
+            query = {"query": {"bool": {"must": [{"ids": {"values": ids}}]}}}
+        else:
+            query = {"query": {"bool": {"must": []}}}
+        if "app_id" in where:
+            app_id = where["app_id"]
+            query["query"]["bool"]["must"].append({"term": {"metadata.app_id": app_id}})
+
+        # OpenSearch syntax is different from Elasticsearch
+        response = self.client.search(index=self._get_index(), body=query, _source=False, size=limit)
+        docs = response["hits"]["hits"]
+        ids = [doc["_id"] for doc in docs]
+        return {"ids": set(ids)}
+
+    def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
+        """add data in vector database
+
+        :param documents: list of texts to add
+        :type documents: List[str]
+        :param metadatas: list of metadata associated with docs
+        :type metadatas: List[object]
+        :param ids: ids of docs
+        :type ids: List[str]
+        """
+
+        docs = []
+        embeddings = self.embedder.embedding_fn(documents)
+        for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
+            docs.append(
+                {
+                    "_index": self._get_index(),
+                    "_id": id,
+                    "_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
+                }
+            )
+        bulk(self.client, docs)
+        self.client.indices.refresh(index=self._get_index())
+
+    def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
+        """
+        query contents from vector data base based on vector similarity
+
+        :param input_query: list of query string
+        :type input_query: List[str]
+        :param n_results: no of similar documents to fetch from database
+        :type n_results: int
+        :param where: Optional. to filter data
+        :type where: Dict[str, any]
+        :return: Database contents that are the result of the query
+        :rtype: List[str]
+        """
+        embeddings = OpenAIEmbeddings()
+        docsearch = OpenSearchVectorSearch(
+            index_name=self._get_index(),
+            embedding_function=embeddings,
+            opensearch_url=f"{self.config.opensearch_url}",
+            http_auth=self.config.http_auth,
+            use_ssl=True,
+        )
+        docs = docsearch.similarity_search(
+            input_query,
+            search_type="script_scoring",
+            space_type="cosinesimil",
+            vector_field="embeddings",
+            text_field="text",
+            metadata_field="metadata",
+        )
+        contents = [doc.page_content for doc in docs]
+        return contents
+
+    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
+        if not isinstance(name, str):
+            raise TypeError("Collection name must be a string")
+        self.config.collection_name = name
+
+    def count(self) -> int:
+        """
+        Count number of documents/chunks embedded in the database.
+
+        :return: number of documents
+        :rtype: int
+        """
+        query = {"query": {"match_all": {}}}
+        response = self.client.count(index=self._get_index(), body=query)
+        doc_count = response["count"]
+        return doc_count
+
+    def reset(self):
+        """
+        Resets the database. Deletes all embeddings irreversibly.
+        """
+        # Delete all data from the database
+        if self.client.indices.exists(index=self._get_index()):
+            # delete index in Es
+            self.client.indices.delete(index=self._get_index())
+
+    def _get_index(self) -> str:
+        """Get the OpenSearch index for a collection
+
+        :return: OpenSearch index
+        :rtype: str
+        """
+        return self.config.collection_name
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,6 +98,7 @@ torch = { version = ">=2.0.0, !=2.0.1", optional = true }
 # Torch 2.0.1 is not compatible with poetry (https://github.com/pytorch/pytorch/issues/100974)
 gpt4all = { version = "1.0.8", optional = true }
 # 1.0.9 is not working for some users (https://github.com/nomic-ai/gpt4all/issues/1394)
+opensearch-py = { version = "2.3.1", optional = true }
 elasticsearch = { version = "^8.9.0", optional = true }
 flask = { version = "^2.3.3", optional = true }
 twilio = { version = "^8.5.0", optional = true }
@@ -123,6 +124,7 @@ streamlit = ["streamlit"]
 community = ["llama-hub"]
 opensource = ["sentence-transformers", "torch", "gpt4all"]
 elasticsearch = ["elasticsearch"]
+opensearch = ["opensearch-py"]
 poe = ["fastapi-poe"]
 discord = ["discord"]
 slack = ["slack-sdk", "flask"]