[Feature] Add support for metadata filtering on search API (#1245)

2024-02-06 15:42:51 -08:00
parent 8fe2c3effc
commit 4afef04f26
10 changed files with 173 additions and 104 deletions
--- a/docs/api-reference/app/search.mdx
+++ b/docs/api-reference/app/search.mdx
@@ -12,6 +12,13 @@ title: '🔍 search'
 <ParamField path="num_documents" type="int" optional>
    Number of relevant documents to fetch. Defaults to `3`
 </ParamField>
+<ParamField path="where" type="dict" optional>
+    Key-value pairs used to filter search results by metadata.
+</ParamField>
+<ParamField path="raw_filter" type="dict" optional>
+    Raw filter query, written in the native filter syntax of your vector database.
+    Currently, the `raw_filter` param is only supported for the Pinecone vector database.
+</ParamField>

 ### Returns

@@ -21,37 +28,84 @@ title: '🔍 search'

 ## Usage

+### Basic
+
 Refer to the following example on how to use the search api:

 ```python Code example
 from embedchain import App

-# Initialize app
 app = App()
-
-# Add data source
 app.add("https://www.forbes.com/profile/elon-musk")

-# Get relevant context using semantic search
 context = app.search("What is the net worth of Elon?", num_documents=2)
 print(context)
-# Context:
-# [
-#     {
-#         'context': 'Elon Musk PROFILEElon MuskCEO, Tesla$221.9BReal Time Net Worth ...',
-#         'metadata': {
-#             'source': 'https://www.forbes.com/profile/elon-musk',
-#             'document_id': 'some_document_id',
-#             'score': 0.404,
-#         }
-#     },
-#     {
-#         'context': 'company, which is now called X.Wealth HistoryHOVER TO REVEAL NET WORTH ...',
-#         'metadata': {
-#             'source': 'https://www.forbes.com/profile/elon-musk',
-#             'document_id': 'some_document_id',
-#             'score': 0.435,
-#         }
-#     }
-# ]
+```
+
+### Advanced
+
+#### Metadata filtering using `where` params
+
+Here is an advanced example of the `search()` API with metadata filtering on a Pinecone database:
+
+```python
+import os
+
+from embedchain import App
+
+os.environ["PINECONE_API_KEY"] = "xxx"
+
+config = {
+    "vectordb": {
+        "provider": "pinecone",
+        "config": {
+            "metric": "dotproduct",
+            "vector_dimension": 1536,
+            "index_name": "ec-test",
+            "serverless_config": {"cloud": "aws", "region": "us-west-2"},
+        },
+    }
+}
+
+app = App.from_config(config=config)
+
+app.add("https://www.forbes.com/profile/bill-gates", metadata={"type": "forbes", "person": "gates"})
+app.add("https://en.wikipedia.org/wiki/Bill_Gates", metadata={"type": "wiki", "person": "gates"})
+
+results = app.search("What is the net worth of Bill Gates?", where={"person": "gates"})
+print("Num of search results: ", len(results))
+```
+
+#### Metadata filtering using `raw_filter` params
+
+The following example filters on metadata by passing a raw filter query in the syntax used by the Pinecone vector database:
+
+```python
+import os
+
+from embedchain import App
+
+os.environ["PINECONE_API_KEY"] = "xxx"
+
+config = {
+    "vectordb": {
+        "provider": "pinecone",
+        "config": {
+            "metric": "dotproduct",
+            "vector_dimension": 1536,
+            "index_name": "ec-test",
+            "serverless_config": {"cloud": "aws", "region": "us-west-2"},
+        },
+    }
+}
+
+app = App.from_config(config=config)
+
+app.add("https://www.forbes.com/profile/bill-gates", metadata={"year": 2022, "person": "gates"})
+app.add("https://en.wikipedia.org/wiki/Bill_Gates", metadata={"year": 2024, "person": "gates"})
+
+print("Filter with person: gates and year > 2023")
+raw_filter = {"$and": [{"person": "gates"}, {"year": {"$gt": 2023}}]}
+results = app.search("What is the net worth of Bill Gates?", raw_filter=raw_filter)
+print("Num of search results: ", len(results))
 ```
--- a/docs/components/vector-databases.mdx
+++ b/docs/components/vector-databases.mdx
@@ -186,7 +186,7 @@ vectordb:
  config:
    metric: cosine
    vector_dimension: 1536
-    collection_name: my-pinecone-index
+    index_name: my-pinecone-index
    pod_config:
      environment: gcp-starter
      metadata_config:
@@ -201,7 +201,7 @@ vectordb:
  config:
    metric: cosine
    vector_dimension: 1536
-    collection_name: my-pinecone-index
+    index_name: my-pinecone-index
    serverless_config:
      cloud: aws
      region: us-west-2