[Feature] Add support for hybrid search for pinecone vector database (#1259)

Deshraj Yadav
2024-02-15 13:20:14 -08:00
committed by GitHub
parent 0766a44ccf
commit 38b4e06963
18 changed files with 470 additions and 326 deletions


@@ -15,6 +15,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
metric: Optional[str] = "cosine",
pod_config: Optional[dict[str, any]] = None,
serverless_config: Optional[dict[str, any]] = None,
hybrid_search: bool = False,
**extra_params: dict[str, any],
):
self.metric = metric
@@ -22,6 +23,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
self.index_name = index_name
self.vector_dimension = vector_dimension
self.extra_params = extra_params
self.hybrid_search = hybrid_search
if pod_config is None and serverless_config is None:
# If no config is provided, use the default pod spec config
pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
@@ -33,4 +35,9 @@ class PineconeDBConfig(BaseVectorDbConfig):
if self.pod_config and self.serverless_config:
raise ValueError("Only one of pod_config or serverless_config can be provided.")
if self.hybrid_search and self.metric != "dotproduct":
raise ValueError(
"Hybrid search is only supported with dotproduct metric in Pinecone. See full docs here: https://docs.pinecone.io/docs/hybrid-search#limitations"
) # noqa:E501
super().__init__(collection_name=self.index_name, dir=None)
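
For reference, a minimal sketch of constructing the config with the new flag; `index_name` is taken from the attributes set above, and the `dotproduct` metric is required by the validation this commit adds (other constructor arguments are omitted):

from embedchain.config.vectordb.pinecone import PineconeDBConfig

# Hybrid search only works with the dotproduct metric; any other metric raises ValueError.
config = PineconeDBConfig(
    index_name="my-hybrid-index",
    metric="dotproduct",
    hybrid_search=True,
)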


@@ -237,46 +237,6 @@ class EmbedChain(JSONSerializable):
return source_hash
def add_local(
self,
source: Any,
data_type: Optional[DataType] = None,
metadata: Optional[dict[str, Any]] = None,
config: Optional[AddConfig] = None,
**kwargs: Optional[dict[str, Any]],
):
"""
Adds the data from the given URL to the vector db.
Loads the data, chunks it, create embedding for each chunk
and then stores the embedding to vector database.
Warning:
This method is deprecated and will be removed in future versions. Use `add` instead.
:param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
:type source: Any
:param data_type: Automatically detected, but can be forced with this argument. The type of the data to add,
defaults to None
:type data_type: Optional[DataType], optional
:param metadata: Metadata associated with the data source., defaults to None
:type metadata: Optional[dict[str, Any]], optional
:param config: The `AddConfig` instance to use as configuration options., defaults to None
:type config: Optional[AddConfig], optional
:raises ValueError: Invalid data type
:return: source_hash, a md5-hash of the source, in hexadecimal representation.
:rtype: str
"""
logging.warning(
"The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files." # noqa: E501
)
return self.add(
source=source,
data_type=data_type,
metadata=metadata,
config=config,
**kwargs,
)
def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
"""
Get id of existing document for a given source, based on the data type

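With `add_local` removed, existing callers would migrate to `add`, which the deprecated wrapper simply delegated to. A minimal sketch (the `App` entry point and the source path are illustrative, not part of this diff):

from embedchain import App

app = App()
# Before: source_hash = app.add_local("notes/meeting.txt")
source_hash = app.add("notes/meeting.txt")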

@@ -1,3 +1,4 @@
import logging
import os
from typing import Optional, Union
@@ -8,6 +9,8 @@ except ImportError:
"Pinecone requires extra dependencies. Install with `pip install --upgrade 'embedchain[pinecone]'`"
) from None
from pinecone_text.sparse import BM25Encoder
from embedchain.config.vectordb.pinecone import PineconeDBConfig
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.utils.misc import chunks
@@ -42,6 +45,14 @@ class PineconeDB(BaseVectorDB):
)
self.config = config
self._setup_pinecone_index()
# Setup BM25Encoder if sparse vectors are to be used
self.bm25_encoder = None
if self.config.hybrid_search:
# TODO: Add support for fitting BM25Encoder on any corpus
logging.info("Initializing BM25Encoder for sparse vectors..")
self.bm25_encoder = BM25Encoder.default()
# Call parent init here because embedder is needed
super().__init__(config=self.config)
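
A rough illustration of what the encoder produces, using only the `pinecone_text` calls that appear in this diff (`BM25Encoder.default()`, `encode_documents`, `encode_queries`):

from pinecone_text.sparse import BM25Encoder

# Default encoder with pretrained weights; no corpus fitting, matching the TODO above.
encoder = BM25Encoder.default()
sparse_doc = encoder.encode_documents("hybrid search combines dense and sparse retrieval")
sparse_query = encoder.encode_queries("what is hybrid search?")
# Each result is a sparse vector dict of the form {"indices": [...], "values": [...]},
# which Pinecone accepts alongside the dense embedding.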
@@ -119,12 +130,17 @@ class PineconeDB(BaseVectorDB):
docs = []
embeddings = self.embedder.embedding_fn(documents)
for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
# Insert sparse vectors as well if the user wants to do the hybrid search
sparse_vector_dict = (
{"sparse_values": self.bm25_encoder.encode_documents(text)} if self.bm25_encoder else {}
)
docs.append(
    {
        "id": id,
        "values": embedding,
        "metadata": {**metadata, "text": text},
        **sparse_vector_dict,
    },
)
for chunk in chunks(docs, self.BATCH_SIZE, desc="Adding chunks in batches"):
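
With hybrid search enabled, each record in the upsert batch therefore carries both a dense and a sparse representation, roughly like this (values and metadata keys are illustrative):

record = {
    "id": "doc-chunk-0",
    "values": [0.12, -0.03, 0.48],  # dense embedding from self.embedder
    "metadata": {"text": "chunk text", "app_id": "my-app"},
    "sparse_values": {"indices": [102, 985], "values": [0.7, 0.4]},  # BM25 output
}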
@@ -159,14 +175,19 @@ class PineconeDB(BaseVectorDB):
query_filter["app_id"] = {"$eq": app_id}
query_vector = self.embedder.embedding_fn([input_query])[0]
params = {
    "vector": query_vector,
    "filter": query_filter,
    "top_k": n_results,
    "include_metadata": True,
    **kwargs,
}
if self.bm25_encoder:
    sparse_query_vector = self.bm25_encoder.encode_queries(input_query)
    params["sparse_vector"] = sparse_query_vector
data = self.pinecone_index.query(**params)
return [
(metadata.get("text"), {**metadata, "score": doc.get("score")}) if citations else metadata.get("text")
for doc in data.get("matches", [])
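
Putting it together, enabling the feature from an app-level config might look like the sketch below; `App.from_config` is standard embedchain usage, but the exact nesting of the `vectordb` keys is an assumption here rather than something shown in this diff:

from embedchain import App

app = App.from_config(config={
    "vectordb": {
        "provider": "pinecone",
        "config": {
            "index_name": "my-hybrid-index",
            "metric": "dotproduct",  # required by the new PineconeDBConfig validation
            "hybrid_search": True,
        },
    },
})

app.add("https://www.example.com/article")
answer = app.query("What does the article say about hybrid search?")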