[Bug fix] Fix embedding issue for opensearch and some other vector databases (#1163)

2024-01-12 14:15:39 +05:30
parent c020e65a50
commit 862ff6cca6
13 changed files with 40 additions and 95 deletions
--- a/embedchain/chunkers/base_chunker.py
+++ b/embedchain/chunkers/base_chunker.py
@@ -27,7 +27,7 @@ class BaseChunker(JSONSerializable):
        chunk_ids = []
        id_map = {}
        min_chunk_size = config.min_chunk_size if config is not None else 1
-        logging.info(f"[INFO] Skipping chunks smaller than {min_chunk_size} characters")
+        logging.info(f"Skipping chunks smaller than {min_chunk_size} characters")
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -369,7 +369,7 @@ class EmbedChain(JSONSerializable):
        metadatas = embeddings_data["metadatas"]
        ids = embeddings_data["ids"]
        new_doc_id = embeddings_data["doc_id"]
-        embeddings = embeddings_data.get("embeddings")
+
        if existing_doc_id and existing_doc_id == new_doc_id:
            print("Doc content has not changed. Skipping creating chunks and embeddings")
            return [], [], [], 0
@@ -433,13 +433,7 @@ class EmbedChain(JSONSerializable):
        # Count before, to calculate a delta in the end.
        chunks_before_addition = self.db.count()

-        self.db.add(
-            embeddings=embeddings,
-            documents=documents,
-            metadatas=metadatas,
-            ids=ids,
-            **kwargs,
-        )
+        self.db.add(documents=documents, metadatas=metadatas, ids=ids, **kwargs)
        count_new_chunks = self.db.count() - chunks_before_addition

        print(f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}")
--- a/embedchain/vectordb/chroma.py
+++ b/embedchain/vectordb/chroma.py
@@ -129,17 +129,13 @@ class ChromaDB(BaseVectorDB):

    def add(
        self,
-        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[object],
        ids: list[str],
-        **kwargs: Optional[dict[str, Any]],
    ) -> Any:
        """
        Add vectors to chroma database

-        :param embeddings: list of embeddings to add
-        :type embeddings: list[list[str]]
        :param documents: Documents
        :type documents: list[str]
        :param metadatas: Metadatas
--- a/embedchain/vectordb/elasticsearch.py
+++ b/embedchain/vectordb/elasticsearch.py
@@ -110,7 +110,6 @@ class ElasticsearchDB(BaseVectorDB):

    def add(
        self,
-        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[object],
        ids: list[str],
@@ -118,8 +117,6 @@ class ElasticsearchDB(BaseVectorDB):
    ) -> Any:
        """
        add data in vector database
-        :param embeddings: list of embeddings to add
-        :type embeddings: list[list[str]]
        :param documents: list of texts to add
        :type documents: list[str]
        :param metadatas: list of metadata associated with docs
--- a/embedchain/vectordb/opensearch.py
+++ b/embedchain/vectordb/opensearch.py
@@ -114,22 +114,10 @@ class OpenSearchDB(BaseVectorDB):
            result["metadatas"].append({"doc_id": doc_id})
        return result

-    def add(
-        self,
-        embeddings: list[list[str]],
-        documents: list[str],
-        metadatas: list[object],
-        ids: list[str],
-        **kwargs: Optional[dict[str, any]],
-    ):
-        """Add data in vector database.
+    def add(self, documents: list[str], metadatas: list[object], ids: list[str], **kwargs: Optional[dict[str, any]]):
+        """Adds documents to the opensearch index"""

-        Args:
-            embeddings (list[list[str]]): list of embeddings to add.
-            documents (list[str]): list of texts to add.
-            metadatas (list[object]): list of metadata associated with docs.
-            ids (list[str]): IDs of docs.
-        """
+        embeddings = self.embedder.embedding_fn(documents)
        for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
            batch_end = batch_start + self.BATCH_SIZE
            batch_documents = documents[batch_start:batch_end]
--- a/embedchain/vectordb/pinecone.py
+++ b/embedchain/vectordb/pinecone.py
@@ -88,7 +88,6 @@ class PineconeDB(BaseVectorDB):

    def add(
        self,
-        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[object],
        ids: list[str],
--- a/embedchain/vectordb/qdrant.py
+++ b/embedchain/vectordb/qdrant.py
@@ -122,15 +122,12 @@ class QdrantDB(BaseVectorDB):

    def add(
        self,
-        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[object],
        ids: list[str],
        **kwargs: Optional[dict[str, any]],
    ):
        """add data in vector database
-        :param embeddings: list of embeddings for the corresponding documents to be added
-        :type documents: list[list[float]]
        :param documents: list of texts to add
        :type documents: list[str]
        :param metadatas: list of metadata associated with docs
--- a/embedchain/vectordb/weaviate.py
+++ b/embedchain/vectordb/weaviate.py
@@ -1,6 +1,6 @@
 import copy
 import os
-from typing import Any, Optional, Union
+from typing import Optional, Union

 try:
    import weaviate
@@ -151,17 +151,8 @@ class WeaviateDB(BaseVectorDB):

        return {"ids": existing_ids}

-    def add(
-        self,
-        embeddings: list[list[float]],
-        documents: list[str],
-        metadatas: list[object],
-        ids: list[str],
-        **kwargs: Optional[dict[str, any]],
-    ):
+    def add(self, documents: list[str], metadatas: list[object], ids: list[str], **kwargs: Optional[dict[str, any]]):
        """add data in vector database
-        :param embeddings: list of embeddings for the corresponding documents to be added
-        :type documents: list[list[float]]
        :param documents: list of texts to add
        :type documents: list[str]
        :param metadatas: list of metadata associated with docs
@@ -191,12 +182,7 @@ class WeaviateDB(BaseVectorDB):
                )

    def query(
-        self,
-        input_query: list[str],
-        n_results: int,
-        where: dict[str, any],
-        citations: bool = False,
-        **kwargs: Optional[dict[str, Any]],
+        self, input_query: list[str], n_results: int, where: dict[str, any], citations: bool = False
    ) -> Union[list[tuple[str, dict]], list[str]]:
        """
        query contents from vector database based on vector similarity
--- a/embedchain/vectordb/zilliz.py
+++ b/embedchain/vectordb/zilliz.py
@@ -108,7 +108,6 @@ class ZillizVectorDB(BaseVectorDB):

    def add(
        self,
-        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[object],
        ids: list[str],