[Feature] Add support for hybrid search for pinecone vector database (#1259)

Deshraj Yadav
2024-02-15 13:20:14 -08:00
committed by GitHub
parent 0766a44ccf
commit 38b4e06963
18 changed files with 470 additions and 326 deletions


@@ -15,6 +15,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
metric: Optional[str] = "cosine",
pod_config: Optional[dict[str, any]] = None,
serverless_config: Optional[dict[str, any]] = None,
hybrid_search: bool = False,
**extra_params: dict[str, any],
):
self.metric = metric
@@ -22,6 +23,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
self.index_name = index_name
self.vector_dimension = vector_dimension
self.extra_params = extra_params
self.hybrid_search = hybrid_search
if pod_config is None and serverless_config is None:
# If no config is provided, use the default pod spec config
pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
@@ -33,4 +35,9 @@ class PineconeDBConfig(BaseVectorDbConfig):
if self.pod_config and self.serverless_config:
raise ValueError("Only one of pod_config or serverless_config can be provided.")
if self.hybrid_search and self.metric != "dotproduct":
raise ValueError(
"Hybrid search is only supported with dotproduct metric in Pinecone. See full docs here: https://docs.pinecone.io/docs/hybrid-search#limitations"
) # noqa:E501
super().__init__(collection_name=self.index_name, dir=None)
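
For reference, a minimal sketch of constructing the config with the new flag; `index_name` is taken from the attributes set above, and the `dotproduct` metric is required by the validation this commit adds (other constructor arguments are omitted):

from embedchain.config.vectordb.pinecone import PineconeDBConfig

# Hybrid search only works with the dotproduct metric; any other metric raises ValueError.
config = PineconeDBConfig(
    index_name="my-hybrid-index",
    metric="dotproduct",
    hybrid_search=True,
)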


@@ -237,46 +237,6 @@ class EmbedChain(JSONSerializable):
return source_hash
def add_local(
self,
source: Any,
data_type: Optional[DataType] = None,
metadata: Optional[dict[str, Any]] = None,
config: Optional[AddConfig] = None,
**kwargs: Optional[dict[str, Any]],
):
"""
Adds the data from the given URL to the vector db.
Loads the data, chunks it, create embedding for each chunk
and then stores the embedding to vector database.
Warning:
This method is deprecated and will be removed in future versions. Use `add` instead.
:param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
:type source: Any
:param data_type: Automatically detected, but can be forced with this argument. The type of the data to add,
defaults to None
:type data_type: Optional[DataType], optional
:param metadata: Metadata associated with the data source., defaults to None
:type metadata: Optional[dict[str, Any]], optional
:param config: The `AddConfig` instance to use as configuration options., defaults to None
:type config: Optional[AddConfig], optional
:raises ValueError: Invalid data type
:return: source_hash, a md5-hash of the source, in hexadecimal representation.
:rtype: str
"""
logging.warning(
"The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files." # noqa: E501
)
return self.add(
source=source,
data_type=data_type,
metadata=metadata,
config=config,
**kwargs,
)
def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
"""
Get id of existing document for a given source, based on the data type

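With `add_local` removed, existing callers would migrate to `add`, which the deprecated wrapper simply delegated to. A minimal sketch (the `App` entry point and the source path are illustrative, not part of this diff):

from embedchain import App

app = App()
# Before: source_hash = app.add_local("notes/meeting.txt")
source_hash = app.add("notes/meeting.txt")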

@@ -1,3 +1,4 @@
import logging
import os
from typing import Optional, Union
@@ -8,6 +9,8 @@ except ImportError:
"Pinecone requires extra dependencies. Install with `pip install --upgrade 'embedchain[pinecone]'`"
) from None
from pinecone_text.sparse import BM25Encoder
from embedchain.config.vectordb.pinecone import PineconeDBConfig
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.utils.misc import chunks
@@ -42,6 +45,14 @@ class PineconeDB(BaseVectorDB):
)
self.config = config
self._setup_pinecone_index()
# Setup BM25Encoder if sparse vectors are to be used
self.bm25_encoder = None
if self.config.hybrid_search:
# TODO: Add support for fitting BM25Encoder on any corpus
logging.info("Initializing BM25Encoder for sparse vectors..")
self.bm25_encoder = BM25Encoder.default()
# Call parent init here because embedder is needed
super().__init__(config=self.config)
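
A rough illustration of what the encoder produces, using only the `pinecone_text` calls that appear in this diff (`BM25Encoder.default()`, `encode_documents`, `encode_queries`):

from pinecone_text.sparse import BM25Encoder

# Default encoder with pretrained weights; no corpus fitting, matching the TODO above.
encoder = BM25Encoder.default()
sparse_doc = encoder.encode_documents("hybrid search combines dense and sparse retrieval")
sparse_query = encoder.encode_queries("what is hybrid search?")
# Each result is a sparse vector dict of the form {"indices": [...], "values": [...]},
# which Pinecone accepts alongside the dense embedding.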
@@ -119,12 +130,17 @@ class PineconeDB(BaseVectorDB):
docs = []
embeddings = self.embedder.embedding_fn(documents)
for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
# Insert sparse vectors as well if the user wants to do the hybrid search
sparse_vector_dict = (
{"sparse_values": self.bm25_encoder.encode_documents(text)} if self.bm25_encoder else {}
)
docs.append(
    {
        "id": id,
        "values": embedding,
        "metadata": {**metadata, "text": text},
        **sparse_vector_dict,
    },
)
for chunk in chunks(docs, self.BATCH_SIZE, desc="Adding chunks in batches"):
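
With hybrid search enabled, each record in the upsert batch therefore carries both a dense and a sparse representation, roughly like this (values and metadata keys are illustrative):

record = {
    "id": "doc-chunk-0",
    "values": [0.12, -0.03, 0.48],  # dense embedding from self.embedder
    "metadata": {"text": "chunk text", "app_id": "my-app"},
    "sparse_values": {"indices": [102, 985], "values": [0.7, 0.4]},  # BM25 output
}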
@@ -159,14 +175,19 @@ class PineconeDB(BaseVectorDB):
query_filter["app_id"] = {"$eq": app_id}
query_vector = self.embedder.embedding_fn([input_query])[0]
params = {
    "vector": query_vector,
    "filter": query_filter,
    "top_k": n_results,
    "include_metadata": True,
    **kwargs,
}
if self.bm25_encoder:
    sparse_query_vector = self.bm25_encoder.encode_queries(input_query)
    params["sparse_vector"] = sparse_query_vector
data = self.pinecone_index.query(**params)
return [
(metadata.get("text"), {**metadata, "score": doc.get("score")}) if citations else metadata.get("text")
for doc in data.get("matches", [])
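
Putting it together, enabling the feature from an app-level config might look like the sketch below; `App.from_config` is standard embedchain usage, but the exact nesting of the `vectordb` keys is an assumption here rather than something shown in this diff:

from embedchain import App

app = App.from_config(config={
    "vectordb": {
        "provider": "pinecone",
        "config": {
            "index_name": "my-hybrid-index",
            "metric": "dotproduct",  # required by the new PineconeDBConfig validation
            "hybrid_search": True,
        },
    },
})

app.add("https://www.example.com/article")
answer = app.query("What does the article say about hybrid search?")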