diff --git a/Makefile b/Makefile index b4b5bef9..932cfbc7 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,19 @@ PROJECT_NAME := embedchain .PHONY: install format lint clean test ci_lint ci_test install: - $(PIP) install --upgrade pip - $(PIP) install -e .[dev] + poetry install + +install_es: + poetry install --extras elasticsearch + +install_opensearch: + poetry install --extras opensearch + +shell: + poetry shell + +py_shell: + poetry run python format: $(PYTHON) -m black . diff --git a/docs/advanced/interface_types.mdx b/docs/advanced/interface_types.mdx index 329d8f6d..8096c6c1 100644 --- a/docs/advanced/interface_types.mdx +++ b/docs/advanced/interface_types.mdx @@ -70,6 +70,6 @@ app.reset() Counts the number of embeddings (chunks) in the database. ```python -print(app.count()) +print(app.db.count()) # returns: 481 ``` diff --git a/docs/advanced/vector_database.mdx b/docs/advanced/vector_database.mdx index 8b87af80..964a3512 100644 --- a/docs/advanced/vector_database.mdx +++ b/docs/advanced/vector_database.mdx @@ -2,7 +2,7 @@ title: '💾 Vector Database' --- -We support `Chroma` and `Elasticsearch` as two vector database. +We support `Chroma`, `Elasticsearch` and `OpenSearch` as vector databases. `Chroma` is used as a default database. ## Elasticsearch @@ -22,13 +22,13 @@ Please note that the key needs certain privileges. For testing you can just togg 2. 
Load the app ```python from embedchain import CustomApp -from embedchain.embedder.openai import OpenAiEmbedder +from embedchain.embedder.openai import OpenAIEmbedder from embedchain.llm.openai import OpenAILlm from embedchain.vectordb.elasticsearch import ElasticsearchDB es_app = CustomApp( llm=OpenAILlm(), - embedder=OpenAiEmbedder(), + embedder=OpenAIEmbedder(), db=ElasticsearchDB(), ) ``` @@ -45,7 +45,7 @@ import os from embedchain import CustomApp from embedchain.config import CustomAppConfig, ElasticsearchDBConfig -from embedchain.embedder.openai import OpenAiEmbedder +from embedchain.embedder.openai import OpenAIEmbedder from embedchain.llm.openai import OpenAILlm from embedchain.vectordb.elasticsearch import ElasticsearchDB @@ -61,10 +61,58 @@ es_config = ElasticsearchDBConfig( es_app = CustomApp( config=CustomAppConfig(log_level="INFO"), llm=OpenAILlm(), - embedder=OpenAiEmbedder(), + embedder=OpenAIEmbedder(), db=ElasticsearchDB(config=es_config), ) ``` 3. This should log your connection details to the console. 4. Alternatively to a URL, you `ElasticsearchDBConfig` accepts `es_url` as a list of nodes url with different hosts and ports. 5. Additionally we can pass named parameters supported by Python Elasticsearch client. + + +## OpenSearch 🔍 + +To use OpenSearch as a vector database with a CustomApp, follow these simple steps: + +1. Set the `OPENAI_API_KEY` environment variable: + +``` +OPENAI_API_KEY=sk-xxxx +``` + +2. 
Define the OpenSearch configuration in your Python code: + +```python +from embedchain import CustomApp +from embedchain.config import OpenSearchDBConfig +from embedchain.embedder.openai import OpenAIEmbedder +from embedchain.llm.openai import OpenAILlm +from embedchain.vectordb.opensearch import OpenSearchDB + +opensearch_url = "https://localhost:9200" +http_auth = ("username", "password") + +db_config = OpenSearchDBConfig( + opensearch_url=opensearch_url, + http_auth=http_auth, + collection_name="embedchain-app", + use_ssl=True, + timeout=30, +) +db = OpenSearchDB(config=db_config) +``` + +3. Instantiate the app and add data: + +```python +app = CustomApp(llm=OpenAILlm(), embedder=OpenAIEmbedder(), db=db) +app.add("https://en.wikipedia.org/wiki/Elon_Musk") +app.add("https://www.forbes.com/profile/elon-musk") +app.add("https://www.britannica.com/biography/Elon-Musk") +``` + +4. You're all set! Start querying using the following command: + +```python +app.query("What is the net worth of Elon Musk?") +``` diff --git a/embedchain/apps/Llama2App.py b/embedchain/apps/Llama2App.py index 2ae5a613..8b4bf3f2 100644 --- a/embedchain/apps/Llama2App.py +++ b/embedchain/apps/Llama2App.py @@ -2,7 +2,7 @@ from typing import Optional from embedchain.apps.custom_app import CustomApp from embedchain.config import CustomAppConfig -from embedchain.embedder.openai import OpenAiEmbedder +from embedchain.embedder.openai import OpenAIEmbedder from embedchain.helper.json_serializable import register_deserializable from embedchain.llm.llama2 import Llama2Llm from embedchain.vectordb.chroma import ChromaDB @@ -29,5 +29,5 @@ class Llama2App(CustomApp): config = CustomAppConfig() super().__init__( - config=config, llm=Llama2Llm(), db=ChromaDB(), embedder=OpenAiEmbedder(), system_prompt=system_prompt + config=config, llm=Llama2Llm(), db=ChromaDB(), embedder=OpenAIEmbedder(), system_prompt=system_prompt ) diff --git a/embedchain/apps/app.py b/embedchain/apps/app.py index d8dc6559..9c9c7bbf
100644 --- a/embedchain/apps/app.py +++ b/embedchain/apps/app.py @@ -3,7 +3,7 @@ from typing import Optional from embedchain.config import (AppConfig, BaseEmbedderConfig, BaseLlmConfig, ChromaDbConfig) from embedchain.embedchain import EmbedChain -from embedchain.embedder.openai import OpenAiEmbedder +from embedchain.embedder.openai import OpenAIEmbedder from embedchain.helper.json_serializable import register_deserializable from embedchain.llm.openai import OpenAILlm from embedchain.vectordb.chroma import ChromaDB @@ -48,7 +48,7 @@ class App(EmbedChain): config = AppConfig() llm = OpenAILlm(config=llm_config) - embedder = OpenAiEmbedder(config=BaseEmbedderConfig(model="text-embedding-ada-002")) + embedder = OpenAIEmbedder(config=BaseEmbedderConfig(model="text-embedding-ada-002")) database = ChromaDB(config=chromadb_config) super().__init__(config, llm, db=database, embedder=embedder, system_prompt=system_prompt) diff --git a/embedchain/bots/base.py b/embedchain/bots/base.py index 60b69b01..d25df78e 100644 --- a/embedchain/bots/base.py +++ b/embedchain/bots/base.py @@ -2,7 +2,7 @@ from typing import Any from embedchain import CustomApp from embedchain.config import AddConfig, CustomAppConfig, LlmConfig -from embedchain.embedder.openai import OpenAiEmbedder +from embedchain.embedder.openai import OpenAIEmbedder from embedchain.helper.json_serializable import (JSONSerializable, register_deserializable) from embedchain.llm.openai import OpenAILlm @@ -12,7 +12,7 @@ from embedchain.vectordb.chroma import ChromaDB @register_deserializable class BaseBot(JSONSerializable): def __init__(self): - self.app = CustomApp(config=CustomAppConfig(), llm=OpenAILlm(), db=ChromaDB(), embedder=OpenAiEmbedder()) + self.app = CustomApp(config=CustomAppConfig(), llm=OpenAILlm(), db=ChromaDB(), embedder=OpenAIEmbedder()) def add(self, data: Any, config: AddConfig = None): """ diff --git a/embedchain/config/__init__.py b/embedchain/config/__init__.py index 82d59311..46b69b34 100644 --- 
a/embedchain/config/__init__.py +++ b/embedchain/config/__init__.py @@ -5,9 +5,10 @@ from .apps.app_config import AppConfig from .apps.custom_app_config import CustomAppConfig from .apps.open_source_app_config import OpenSourceAppConfig from .base_config import BaseConfig -from .embedder.BaseEmbedderConfig import BaseEmbedderConfig -from .embedder.BaseEmbedderConfig import BaseEmbedderConfig as EmbedderConfig +from .embedder.base import BaseEmbedderConfig +from .embedder.base import BaseEmbedderConfig as EmbedderConfig from .llm.base_llm_config import BaseLlmConfig from .llm.base_llm_config import BaseLlmConfig as LlmConfig -from .vectordbs.ChromaDbConfig import ChromaDbConfig -from .vectordbs.ElasticsearchDBConfig import ElasticsearchDBConfig +from .vectordb.chroma import ChromaDbConfig +from .vectordb.elasticsearch import ElasticsearchDBConfig +from .vectordb.opensearch import OpenSearchDBConfig diff --git a/embedchain/config/embedder/BaseEmbedderConfig.py b/embedchain/config/embedder/base.py similarity index 100% rename from embedchain/config/embedder/BaseEmbedderConfig.py rename to embedchain/config/embedder/base.py diff --git a/embedchain/config/vectordbs/__init__.py b/embedchain/config/vectordb/__init__.py similarity index 100% rename from embedchain/config/vectordbs/__init__.py rename to embedchain/config/vectordb/__init__.py diff --git a/embedchain/config/vectordbs/BaseVectorDbConfig.py b/embedchain/config/vectordb/base.py similarity index 100% rename from embedchain/config/vectordbs/BaseVectorDbConfig.py rename to embedchain/config/vectordb/base.py diff --git a/embedchain/config/vectordbs/ChromaDbConfig.py b/embedchain/config/vectordb/chroma.py similarity index 95% rename from embedchain/config/vectordbs/ChromaDbConfig.py rename to embedchain/config/vectordb/chroma.py index 2dddadbd..38bf0921 100644 --- a/embedchain/config/vectordbs/ChromaDbConfig.py +++ b/embedchain/config/vectordb/chroma.py @@ -1,6 +1,6 @@ from typing import Optional -from 
from typing import Any, Dict, Optional, Tuple

from embedchain.config.vectordb.base import BaseVectorDbConfig
from embedchain.helper.json_serializable import register_deserializable


@register_deserializable
class OpenSearchDBConfig(BaseVectorDbConfig):
    """Configuration for the OpenSearch vector database."""

    def __init__(
        self,
        opensearch_url: str,
        http_auth: Tuple[str, str],
        vector_dimension: int = 1536,
        collection_name: Optional[str] = None,
        dir: Optional[str] = None,
        **extra_params: Any,
    ):
        """
        Initializes a configuration class instance for an OpenSearch client.

        :param opensearch_url: URL of the OpenSearch domain
        :type opensearch_url: str, Eg, "http://localhost:9200"
        :param http_auth: Tuple of username and password
        :type http_auth: Tuple[str, str], Eg, ("username", "password")
        :param vector_dimension: Dimension of the vector, defaults to 1536 (openai embedding model)
        :type vector_dimension: int, optional
        :param collection_name: Default name for the collection, defaults to None
        :type collection_name: Optional[str], optional
        :param dir: Path to the database directory, where the database is stored, defaults to None
        :type dir: Optional[str], optional
        :param extra_params: Any additional keyword arguments; they are stored as-is in
            ``self.extra_params`` (e.g. ``use_ssl=True``, ``timeout=30``)
        :type extra_params: Any
        """
        self.opensearch_url = opensearch_url
        self.http_auth = http_auth
        self.vector_dimension = vector_dimension
        # All unrecognised keyword arguments are kept together in one dict.
        self.extra_params: Dict[str, Any] = extra_params

        # `collection_name` and `dir` are handled by the base config.
        super().__init__(collection_name=collection_name, dir=dir)
- chunks_before_addition = self.count() + chunks_before_addition = self.db.count() self.db.add(documents=documents, metadatas=metadatas, ids=ids) - count_new_chunks = self.count() - chunks_before_addition + count_new_chunks = self.db.count() - chunks_before_addition print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}")) return list(documents), metadatas, ids, count_new_chunks diff --git a/embedchain/embedder/base.py b/embedchain/embedder/base.py index 28cef207..dc024a3f 100644 --- a/embedchain/embedder/base.py +++ b/embedchain/embedder/base.py @@ -1,6 +1,6 @@ from typing import Any, Callable, Optional -from embedchain.config.embedder.BaseEmbedderConfig import BaseEmbedderConfig +from embedchain.config.embedder.base import BaseEmbedderConfig try: from chromadb.api.types import Documents, Embeddings diff --git a/embedchain/embedder/openai.py b/embedchain/embedder/openai.py index 032da3c5..ad8e4a06 100644 --- a/embedchain/embedder/openai.py +++ b/embedchain/embedder/openai.py @@ -16,7 +16,7 @@ except RuntimeError: from chromadb.utils import embedding_functions -class OpenAiEmbedder(BaseEmbedder): +class OpenAIEmbedder(BaseEmbedder): def __init__(self, config: Optional[BaseEmbedderConfig] = None): super().__init__(config=config) if self.config.model is None: diff --git a/embedchain/models/vector_databases.py b/embedchain/models/vector_databases.py index 5abf3844..e8ff5947 100644 --- a/embedchain/models/vector_databases.py +++ b/embedchain/models/vector_databases.py @@ -4,3 +4,4 @@ from enum import Enum class VectorDatabases(Enum): CHROMADB = "CHROMADB" ELASTICSEARCH = "ELASTICSEARCH" + OPENSEARCH = "OPENSEARCH" diff --git a/embedchain/vectordb/base.py b/embedchain/vectordb/base.py index b877ab0b..77d6efc7 100644 --- a/embedchain/vectordb/base.py +++ b/embedchain/vectordb/base.py @@ -1,4 +1,4 @@ -from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig +from embedchain.config.vectordb.base import 
BaseVectorDbConfig from embedchain.embedder.base import BaseEmbedder from embedchain.helper.json_serializable import JSONSerializable diff --git a/embedchain/vectordb/base_vector_db.py b/embedchain/vectordb/base_vector_db.py deleted file mode 100644 index aee18f1c..00000000 --- a/embedchain/vectordb/base_vector_db.py +++ /dev/null @@ -1,50 +0,0 @@ -from embedchain.config.vectordbs.BaseVectorDbConfig import BaseVectorDbConfig -from embedchain.embedder.base_embedder import BaseEmbedder -from embedchain.helper_classes.json_serializable import JSONSerializable - - -class BaseVectorDB(JSONSerializable): - """Base class for vector database.""" - - def __init__(self, config: BaseVectorDbConfig): - self.client = self._get_or_create_db() - self.config: BaseVectorDbConfig = config - - def _initialize(self): - """ - This method is needed because `embedder` attribute needs to be set externally before it can be initialized. - - So it's can't be done in __init__ in one step. - """ - raise NotImplementedError - - def _get_or_create_db(self): - """Get or create the database.""" - raise NotImplementedError - - def _get_or_create_collection(self): - raise NotImplementedError - - def _set_embedder(self, embedder: BaseEmbedder): - self.embedder = embedder - - def get(self): - raise NotImplementedError - - def add(self): - raise NotImplementedError - - def query(self): - raise NotImplementedError - - def count(self): - raise NotImplementedError - - def delete(self): - raise NotImplementedError - - def reset(self): - raise NotImplementedError - - def set_collection_name(self, name: str): - raise NotImplementedError diff --git a/embedchain/vectordb/chroma.py b/embedchain/vectordb/chroma.py index 28145b25..9319d909 100644 --- a/embedchain/vectordb/chroma.py +++ b/embedchain/vectordb/chroma.py @@ -63,7 +63,9 @@ class ChromaDB(BaseVectorDB): This method is needed because `embedder` attribute needs to be set externally before it can be initialized. 
import logging
from typing import Any, Dict, List, Optional, Set

try:
    from opensearchpy import OpenSearch
    from opensearchpy.helpers import bulk
except ImportError:
    raise ImportError(
        "OpenSearch requires extra dependencies. Install with `pip install --upgrade embedchain[opensearch]`"
    ) from None

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import OpenSearchVectorSearch

from embedchain.config import OpenSearchDBConfig
from embedchain.helper.json_serializable import register_deserializable
from embedchain.vectordb.base import BaseVectorDB


@register_deserializable
class OpenSearchDB(BaseVectorDB):
    """
    OpenSearch as vector database
    """

    def __init__(self, config: OpenSearchDBConfig):
        """Connect to the OpenSearch domain described by ``config``.

        :param config: OpenSearch domain config
        :type config: OpenSearchDBConfig
        :raises ValueError: if ``config`` is None
        """
        if config is None:
            raise ValueError("OpenSearchDBConfig is required")
        # Config and client are assigned before the base initializer runs because
        # `_get_or_create_db()` only hands back `self.client`
        # (presumably the base `__init__` calls it -- confirm against BaseVectorDB).
        self.config = config
        self.client = OpenSearch(
            hosts=[self.config.opensearch_url],
            http_auth=self.config.http_auth,
            **self.config.extra_params,
        )
        info = self.client.info()
        logging.info(f"Connected to {info['version']['distribution']}. Version: {info['version']['number']}")
        # NOTE: the credentials intentionally stay on the config; `query()` reads
        # `self.config.http_auth` again to open its own connection.
        super().__init__(config=self.config)

    def _initialize(self):
        """Create the index for the current collection unless it already exists."""
        logging.info(self.client.info())
        index_name = self._get_index()
        if self.client.indices.exists(index=index_name):
            print(f"Index '{index_name}' already exists.")
            return

        index_body = {
            "settings": {"knn": True},
            "mappings": {
                "properties": {
                    "text": {"type": "text"},
                    "embeddings": {
                        "type": "knn_vector",
                        # NOTE(review): presumably fine because `query()` uses
                        # script scoring rather than an ANN index -- confirm.
                        "index": False,
                        "dimension": self.config.vector_dimension,
                    },
                }
            },
        }
        self.client.indices.create(index=index_name, body=index_body)
        print(self.client.indices.get(index=index_name))

    def _get_or_create_db(self):
        """Return the client that was created in ``__init__``."""
        return self.client

    def _get_or_create_collection(self, name):
        """No-op placeholder; the index itself is created in ``_initialize``."""

    def get(
        self, ids: Optional[List[str]] = None, where: Optional[Dict[str, Any]] = None, limit: Optional[int] = None
    ) -> Dict[str, Set[str]]:
        """
        Get existing doc ids present in vector database

        :param ids: list of doc ids to check for existence
        :type ids: Optional[List[str]]
        :param where: to filter data; only the ``app_id`` key is used
        :type where: Optional[Dict[str, Any]]
        :param limit: maximum number of hits requested from OpenSearch
        :type limit: Optional[int]
        :return: a dict with a single key ``"ids"`` holding the set of matching ids
        :rtype: Dict[str, Set[str]]
        """
        must: List[Dict[str, Any]] = []
        if ids:
            must.append({"ids": {"values": ids}})
        # `where` defaults to None, so it has to be checked before the membership test.
        if where and "app_id" in where:
            must.append({"term": {"metadata.app_id": where["app_id"]}})
        query = {"query": {"bool": {"must": must}}}

        # OpenSearch syntax is different from Elasticsearch
        response = self.client.search(index=self._get_index(), body=query, _source=False, size=limit)
        found_ids = {hit["_id"] for hit in response["hits"]["hits"]}
        return {"ids": found_ids}

    def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
        """add data in vector database

        :param documents: list of texts to add
        :type documents: List[str]
        :param metadatas: list of metadata associated with docs
        :type metadatas: List[object]
        :param ids: ids of docs
        :type ids: List[str]
        """
        embeddings = self.embedder.embedding_fn(documents)
        index_name = self._get_index()
        # One bulk action per document; distinct loop names avoid clobbering
        # the `embeddings` list and the `id` builtin.
        actions = [
            {
                "_index": index_name,
                "_id": doc_id,
                "_source": {"text": text, "metadata": metadata, "embeddings": embedding},
            }
            for doc_id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings)
        ]
        bulk(self.client, actions)
        # Refresh so the new documents are visible to the next search/count.
        self.client.indices.refresh(index=index_name)

    def query(self, input_query: List[str], n_results: int, where: Dict[str, Any]) -> List[str]:
        """
        query contents from vector data base based on vector similarity

        :param input_query: query passed on to the similarity search
        :type input_query: List[str]
        :param n_results: no of similar documents to fetch from database
        :type n_results: int
        :param where: Optional. to filter data; only the ``app_id`` key is used
        :type where: Dict[str, Any]
        :return: Database contents that are the result of the query
        :rtype: List[str]
        """
        # NOTE(review): the query is always embedded with OpenAI embeddings and
        # SSL is always on, regardless of `self.embedder` / `extra_params`.
        embeddings = OpenAIEmbeddings()
        docsearch = OpenSearchVectorSearch(
            index_name=self._get_index(),
            embedding_function=embeddings,
            opensearch_url=f"{self.config.opensearch_url}",
            http_auth=self.config.http_auth,
            use_ssl=True,
        )

        # Same `app_id` filter as `get()`; match everything when none is given.
        pre_filter: Dict[str, Any] = {"match_all": {}}
        if where and "app_id" in where:
            pre_filter = {"bool": {"must": [{"term": {"metadata.app_id": where["app_id"]}}]}}

        docs = docsearch.similarity_search(
            input_query,
            k=n_results,
            search_type="script_scoring",
            space_type="cosinesimil",
            vector_field="embeddings",
            text_field="text",
            metadata_field="metadata",
            pre_filter=pre_filter,
        )
        return [doc.page_content for doc in docs]

    def set_collection_name(self, name: str):
        """
        Set the name of the collection. A collection is an isolated space for vectors.

        :param name: Name of the collection.
        :type name: str
        :raises TypeError: if ``name`` is not a string
        """
        if not isinstance(name, str):
            raise TypeError("Collection name must be a string")
        self.config.collection_name = name

    def count(self) -> int:
        """
        Count number of documents/chunks embedded in the database.

        :return: number of documents
        :rtype: int
        """
        query = {"query": {"match_all": {}}}
        response = self.client.count(index=self._get_index(), body=query)
        return response["count"]

    def reset(self):
        """
        Resets the database. Deletes all embeddings irreversibly.
        """
        index_name = self._get_index()
        # Deleting the whole index removes every stored document.
        if self.client.indices.exists(index=index_name):
            self.client.indices.delete(index=index_name)

    def _get_index(self) -> str:
        """Get the OpenSearch index for a collection

        :return: OpenSearch index
        :rtype: str
        """
        return self.config.collection_name