[Feature] Add support for hybrid search for pinecone vector database (#1259)
@@ -15,6 +15,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         metric: Optional[str] = "cosine",
         pod_config: Optional[dict[str, any]] = None,
         serverless_config: Optional[dict[str, any]] = None,
+        hybrid_search: bool = False,
         **extra_params: dict[str, any],
     ):
         self.metric = metric
@@ -22,6 +23,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         self.index_name = index_name
         self.vector_dimension = vector_dimension
         self.extra_params = extra_params
+        self.hybrid_search = hybrid_search
         if pod_config is None and serverless_config is None:
             # If no config is provided, use the default pod spec config
             pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
@@ -33,4 +35,9 @@ class PineconeDBConfig(BaseVectorDbConfig):
         if self.pod_config and self.serverless_config:
             raise ValueError("Only one of pod_config or serverless_config can be provided.")
 
+        if self.hybrid_search and self.metric != "dotproduct":
+            raise ValueError(
+                "Hybrid search is only supported with dotproduct metric in Pinecone. See full docs here: https://docs.pinecone.io/docs/hybrid-search#limitations"
+            )  # noqa:E501
+
         super().__init__(collection_name=self.index_name, dir=None)
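Putting the config changes together: hybrid search is opt-in and only valid with the dotproduct metric, otherwise the new validation raises a ValueError. A minimal usage sketch follows; the index_name and vector_dimension arguments are assumed from the attribute assignments shown above, since the full constructor signature is not part of this diff.

from embedchain.config.vectordb.pinecone import PineconeDBConfig

# Hedged sketch: parameter names follow the attributes assigned in the hunks above.
config = PineconeDBConfig(
    index_name="my-hybrid-index",  # assumed constructor argument
    vector_dimension=1536,         # assumed constructor argument
    metric="dotproduct",           # any other metric raises ValueError when hybrid_search=True
    hybrid_search=True,
)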
@@ -237,46 +237,6 @@ class EmbedChain(JSONSerializable):
 
         return source_hash
 
-    def add_local(
-        self,
-        source: Any,
-        data_type: Optional[DataType] = None,
-        metadata: Optional[dict[str, Any]] = None,
-        config: Optional[AddConfig] = None,
-        **kwargs: Optional[dict[str, Any]],
-    ):
-        """
-        Adds the data from the given URL to the vector db.
-        Loads the data, chunks it, create embedding for each chunk
-        and then stores the embedding to vector database.
-
-        Warning:
-            This method is deprecated and will be removed in future versions. Use `add` instead.
-
-        :param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
-        :type source: Any
-        :param data_type: Automatically detected, but can be forced with this argument. The type of the data to add,
-            defaults to None
-        :type data_type: Optional[DataType], optional
-        :param metadata: Metadata associated with the data source., defaults to None
-        :type metadata: Optional[dict[str, Any]], optional
-        :param config: The `AddConfig` instance to use as configuration options., defaults to None
-        :type config: Optional[AddConfig], optional
-        :raises ValueError: Invalid data type
-        :return: source_hash, a md5-hash of the source, in hexadecimal representation.
-        :rtype: str
-        """
-        logging.warning(
-            "The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files."  # noqa: E501
-        )
-        return self.add(
-            source=source,
-            data_type=data_type,
-            metadata=metadata,
-            config=config,
-            **kwargs,
-        )
-
     def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
         """
         Get id of existing document for a given source, based on the data type
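Since the deprecated `add_local` wrapper simply forwarded its arguments to `add`, callers migrate by renaming the call. A hedged sketch, assuming an existing `app` object created from embedchain's `App` class (not part of this diff):

# Before (removed in this commit):
# app.add_local("path/to/local-file.pdf", metadata={"source": "docs"})

# After: `add` handles both local and remote sources.
app.add("path/to/local-file.pdf", metadata={"source": "docs"})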
@@ -1,3 +1,4 @@
+import logging
 import os
 from typing import Optional, Union
 
@@ -8,6 +9,8 @@ except ImportError:
         "Pinecone requires extra dependencies. Install with `pip install --upgrade 'embedchain[pinecone]'`"
     ) from None
 
+from pinecone_text.sparse import BM25Encoder
+
 from embedchain.config.vectordb.pinecone import PineconeDBConfig
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.utils.misc import chunks
@@ -42,6 +45,14 @@ class PineconeDB(BaseVectorDB):
             )
         self.config = config
         self._setup_pinecone_index()
+
+        # Setup BM25Encoder if sparse vectors are to be used
+        self.bm25_encoder = None
+        if self.config.hybrid_search:
+            # TODO: Add support for fitting BM25Encoder on any corpus
+            logging.info("Initializing BM25Encoder for sparse vectors..")
+            self.bm25_encoder = BM25Encoder.default()
+
         # Call parent init here because embedder is needed
         super().__init__(config=self.config)
 
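`BM25Encoder` comes from the `pinecone_text` package. `BM25Encoder.default()` appears to load pre-fitted BM25 parameters rather than fitting on the user's own data, which is presumably what the TODO about fitting on an arbitrary corpus refers to. A rough sketch of what the encoder produces (token indices and weights are illustrative only):

from pinecone_text.sparse import BM25Encoder

encoder = BM25Encoder.default()  # pre-fitted parameters, no corpus fitting here
sparse = encoder.encode_documents("hybrid search mixes dense and sparse signals")
# `sparse` is a dict of the form {"indices": [...], "values": [...]},
# which is the sparse-vector format Pinecone expects.
print(sparse["indices"][:5], sparse["values"][:5])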
@@ -119,12 +130,17 @@ class PineconeDB(BaseVectorDB):
         docs = []
         embeddings = self.embedder.embedding_fn(documents)
         for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
+            # Insert sparse vectors as well if the user wants to do the hybrid search
+            sparse_vector_dict = (
+                {"sparse_values": self.bm25_encoder.encode_documents(text)} if self.bm25_encoder else {}
+            )
             docs.append(
                 {
                     "id": id,
                     "values": embedding,
                     "metadata": {**metadata, "text": text},
-                }
+                    **sparse_vector_dict,
+                },
             )
 
         for chunk in chunks(docs, self.BATCH_SIZE, desc="Adding chunks in batches"):
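With hybrid search enabled, each record upserted to Pinecone therefore carries the dense embedding under "values" and a BM25 sparse vector under "sparse_values". An illustrative record shape (all numbers are placeholders, and the real dense vector has the configured dimension):

record = {
    "id": "doc-0",
    "values": [0.01, -0.23, 0.77],  # dense embedding, truncated for the sketch
    "metadata": {"text": "chunk text", "app_id": "my-app"},
    "sparse_values": {"indices": [102, 2048], "values": [0.41, 0.12]},
}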
@@ -159,14 +175,19 @@ class PineconeDB(BaseVectorDB):
             query_filter["app_id"] = {"$eq": app_id}
 
         query_vector = self.embedder.embedding_fn([input_query])[0]
-        data = self.pinecone_index.query(
-            vector=query_vector,
-            filter=query_filter,
-            top_k=n_results,
-            include_metadata=True,
+        params = {
+            "vector": query_vector,
+            "filter": query_filter,
+            "top_k": n_results,
+            "include_metadata": True,
             **kwargs,
-        )
+        }
+
+        if self.bm25_encoder:
+            sparse_query_vector = self.bm25_encoder.encode_queries(input_query)
+            params["sparse_vector"] = sparse_query_vector
+
+        data = self.pinecone_index.query(**params)
         return [
             (metadata.get("text"), {**metadata, "score": doc.get("score")}) if citations else metadata.get("text")
             for doc in data.get("matches", [])
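On the query side, the dense query embedding and the BM25-encoded sparse query are sent in a single request, and Pinecone scores matches with the dotproduct metric across both. Mirroring the params dict built above, the resulting call is roughly as follows; the index handle, vectors, and filter values are placeholders:

sparse_query_vector = {"indices": [102, 2048], "values": [0.9, 0.3]}
matches = pinecone_index.query(
    vector=[0.01, -0.23, 0.77],         # dense query embedding (placeholder)
    sparse_vector=sparse_query_vector,  # omitted entirely when hybrid search is off
    filter={"app_id": {"$eq": "my-app"}},
    top_k=3,
    include_metadata=True,
)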