[Feature] Add citations flag in query and chat functions of App to return context along with the answer (#859)

Deven Patel
2023-11-01 13:06:28 -07:00
committed by GitHub
parent 5022c1ae29
commit 930280f4ce
15 changed files with 279 additions and 112 deletions
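For orientation, here is a minimal usage sketch of the new flag at the App level. The App-level changes are among the 15 changed files but are not shown in the hunks below, so the exact return shape with `citations=True` is an assumption; only the flag itself comes from this commit.

```python
# Hypothetical usage sketch: only the `citations` flag comes from this commit;
# the exact App-level return shape is an assumption, since the App diff is not
# included in the hunks below.
from embedchain import App

app = App()
app.add("https://docs.embedchain.ai")

# Default behaviour: just the answer.
answer = app.query("What is Embedchain?")

# With citations=True the retrieved contexts (and their source metadata) are
# expected to come back along with the answer.
answer_with_context = app.query("What is Embedchain?", citations=True)
```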

View File

@@ -1,5 +1,5 @@
import logging
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, Union
from chromadb import Collection, QueryResult
from langchain.docstore.document import Document
@@ -192,8 +192,13 @@ class ChromaDB(BaseVectorDB):
]
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
Query contents from vector database based on vector similarity
@@ -205,9 +210,12 @@ class ChromaDB(BaseVectorDB):
:type where: Dict[str, Any]
:param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
:type skip_embedding: bool
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:raises InvalidDimensionException: Dimensions do not match.
:return: The content of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
try:
if skip_embedding:
@@ -236,10 +244,13 @@ class ChromaDB(BaseVectorDB):
contexts = []
for result in results_formatted:
context = result[0].page_content
metadata = result[0].metadata
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append((context, source, doc_id))
if citations:
metadata = result[0].metadata
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append((context, source, doc_id))
else:
contexts.append(context)
return contexts
def set_collection_name(self, name: str):
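A short sketch of the two return shapes of `ChromaDB.query` after this change. Here `db` is assumed to be an already initialised `ChromaDB` instance with an embedder set; the query text and filter are placeholders.

```python
# `db` is assumed to be an initialised ChromaDB instance with an embedder set.
plain = db.query(
    input_query=["what is embedchain?"],
    n_results=2,
    where={},
    skip_embedding=False,
)
# citations=False (default) -> List[str] of context strings

cited = db.query(
    input_query=["what is embedchain?"],
    n_results=2,
    where={},
    skip_embedding=False,
    citations=True,
)
# citations=True -> List[Tuple[str, str, str]] of (context, source_url, doc_id)
```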

View File

@@ -1,5 +1,5 @@
import logging
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, Union
try:
from elasticsearch import Elasticsearch
@@ -136,8 +136,13 @@ class ElasticsearchDB(BaseVectorDB):
self.client.indices.refresh(index=self._get_index())
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
query contents from vector database based on vector similarity
@@ -150,8 +155,11 @@ class ElasticsearchDB(BaseVectorDB):
:param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
:type skip_embedding: bool
:return: The context of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
if skip_embedding:
query_vector = input_query
@@ -175,14 +183,17 @@ class ElasticsearchDB(BaseVectorDB):
_source = ["text", "metadata.url", "metadata.doc_id"]
response = self.client.search(index=self._get_index(), query=query, _source=_source, size=n_results)
docs = response["hits"]["hits"]
contents = []
contexts = []
for doc in docs:
context = doc["_source"]["text"]
metadata = doc["_source"]["metadata"]
source = metadata["url"]
doc_id = metadata["doc_id"]
contents.append(tuple((context, source, doc_id)))
return contents
if citations:
metadata = doc["_source"]["metadata"]
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append(tuple((context, source, doc_id)))
else:
contexts.append(context)
return contexts
def set_collection_name(self, name: str):
"""

View File

@@ -1,5 +1,5 @@
import logging
from typing import Dict, List, Optional, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple, Union
try:
from opensearchpy import OpenSearch
@@ -146,8 +146,13 @@ class OpenSearchDB(BaseVectorDB):
self.client.indices.refresh(index=self._get_index())
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
query contents from vector database based on vector similarity
@@ -159,8 +164,11 @@ class OpenSearchDB(BaseVectorDB):
:type where: Dict[str, any]
:param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
:type skip_embedding: bool
:return: The content of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
# TODO(rupeshbansal, deshraj): Add support for skip embeddings here if already exists
embeddings = OpenAIEmbeddings()
@@ -188,13 +196,16 @@ class OpenSearchDB(BaseVectorDB):
k=n_results,
)
contents = []
contexts = []
for doc in docs:
context = doc.page_content
source = doc.metadata["url"]
doc_id = doc.metadata["doc_id"]
contents.append(tuple((context, source, doc_id)))
return contents
if citations:
source = doc.metadata["url"]
doc_id = doc.metadata["doc_id"]
contexts.append(tuple((context, source, doc_id)))
else:
contexts.append(context)
return contexts
def set_collection_name(self, name: str):
"""

View File

@@ -1,5 +1,5 @@
import os
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
try:
import pinecone
@@ -119,8 +119,13 @@ class PineconeDB(BaseVectorDB):
self.client.upsert(docs[i : i + self.BATCH_SIZE])
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
query contents from vector database based on vector similarity
:param input_query: list of query string
@@ -131,22 +136,28 @@ class PineconeDB(BaseVectorDB):
:type where: Dict[str, any]
:param skip_embedding: Optional. if True, input_query is already embedded
:type skip_embedding: bool
:return: The content of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
if not skip_embedding:
query_vector = self.embedder.embedding_fn([input_query])[0]
else:
query_vector = input_query
data = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True)
contents = []
contexts = []
for doc in data["matches"]:
metadata = doc["metadata"]
context = metadata["text"]
source = metadata["url"]
doc_id = metadata["doc_id"]
contents.append(tuple((context, source, doc_id)))
return contents
if citations:
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append(tuple((context, source, doc_id)))
else:
contexts.append(context)
return contexts
def set_collection_name(self, name: str):
"""

View File

@@ -1,7 +1,7 @@
import copy
import os
import uuid
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
try:
from qdrant_client import QdrantClient
@@ -161,8 +161,13 @@ class QdrantDB(BaseVectorDB):
)
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
query contents from vector database based on vector similarity
:param input_query: list of query string
@@ -174,8 +179,11 @@ class QdrantDB(BaseVectorDB):
:param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
generated or not
:type skip_embedding: bool
:return: The context of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
if not skip_embedding:
query_vector = self.embedder.embedding_fn([input_query])[0]
@@ -202,14 +210,17 @@ class QdrantDB(BaseVectorDB):
limit=n_results,
)
response = []
contexts = []
for result in results:
context = result.payload["text"]
metadata = result.payload["metadata"]
source = metadata["url"]
doc_id = metadata["doc_id"]
response.append(tuple((context, source, doc_id)))
return response
if citations:
metadata = result.payload["metadata"]
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append(tuple((context, source, doc_id)))
else:
contexts.append(context)
return contexts
def count(self) -> int:
response = self.client.get_collection(collection_name=self.collection_name)

View File

@@ -1,6 +1,6 @@
import copy
import os
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
try:
import weaviate
@@ -58,10 +58,14 @@ class WeaviateDB(BaseVectorDB):
raise ValueError("Embedder not set. Please set an embedder with `set_embedder` before initialization.")
self.index_name = self._get_index_name()
self.metadata_keys = {"data_type", "doc_id", "url", "hash", "app_id", "text"}
self.metadata_keys = {"data_type", "doc_id", "url", "hash", "app_id"}
if not self.client.schema.exists(self.index_name):
# id is a reserved field in Weaviate, hence we had to change the name of the id field to identifier
# The none vectorizer is crucial as we have our own custom embedding function
"""
TODO: wait for weaviate to add indexing on `object[]` data-type so that we can add filter while querying.
Once that is done, change `dataType` of "metadata" field to `object[]` and update the query below.
"""
class_obj = {
"classes": [
{
@@ -106,10 +110,6 @@ class WeaviateDB(BaseVectorDB):
"name": "app_id",
"dataType": ["text"],
},
{
"name": "text",
"dataType": ["text"],
},
],
},
]
@@ -195,8 +195,13 @@ class WeaviateDB(BaseVectorDB):
batch.add_reference(obj_uuid, self.index_name, "metadata", metadata_uuid, self.index_name + "_metadata")
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
query contents from vector database based on vector similarity
:param input_query: list of query string
@@ -208,15 +213,23 @@ class WeaviateDB(BaseVectorDB):
:param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
generated or not
:type skip_embedding: bool
:return: The context of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
if not skip_embedding:
query_vector = self.embedder.embedding_fn([input_query])[0]
else:
query_vector = input_query
keys = set(where.keys() if where is not None else set())
data_fields = ["text"]
if citations:
data_fields.append(weaviate.LinkTo("metadata", self.index_name + "_metadata", list(self.metadata_keys)))
if len(keys.intersection(self.metadata_keys)) != 0:
weaviate_where_operands = []
for key in keys:
@@ -247,7 +260,18 @@ class WeaviateDB(BaseVectorDB):
.with_limit(n_results)
.do()
)
contexts = results["data"]["Get"].get(self.index_name)
docs = results["data"]["Get"].get(self.index_name)
contexts = []
for doc in docs:
context = doc["text"]
if citations:
metadata = doc["metadata"][0]
source = metadata["url"]
doc_id = metadata["doc_id"]
contexts.append((context, source, doc_id))
else:
contexts.append(context)
return contexts
def set_collection_name(self, name: str):
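In the Weaviate implementation, metadata lives in a separate cross-referenced class, so the extra properties are only fetched (via `weaviate.LinkTo`) when citations are requested, and `text` no longer needs to be listed in `metadata_keys`. A rough sketch of a single returned hit as handled in the loop above; the values are placeholders:

```python
# Placeholder hit illustrating the shape handled in the loop above; the values
# are made up, and the "metadata" reference is only present when citations=True.
doc = {
    "text": "Embedchain is a framework for building RAG applications ...",
    "metadata": [{"url": "https://docs.embedchain.ai", "doc_id": "doc-1"}],
}

context = doc["text"]
metadata = doc["metadata"][0]
source, doc_id = metadata["url"], metadata["doc_id"]
```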

View File

@@ -1,5 +1,5 @@
import logging
from typing import Dict, List, Optional, Tuple
from typing import Dict, List, Optional, Tuple, Union
from embedchain.config import ZillizDBConfig
from embedchain.helper.json_serializable import register_deserializable
@@ -127,8 +127,13 @@ class ZillizVectorDB(BaseVectorDB):
self.client.flush(self.config.collection_name)
def query(
self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
) -> List[Tuple[str, str, str]]:
self,
input_query: List[str],
n_results: int,
where: Dict[str, any],
skip_embedding: bool,
citations: bool = False,
) -> Union[List[Tuple[str, str, str]], List[str]]:
"""
Query contents from vector database based on vector similarity
@@ -139,8 +144,11 @@ class ZillizVectorDB(BaseVectorDB):
:param where: to filter data
:type where: str
:raises InvalidDimensionException: Dimensions do not match.
:return: The context of the document that matched your query, url of the source, doc_id
:rtype: List[Tuple[str,str,str]]
:param citations: if True, return the source URL and doc_id along with each context.
:type citations: bool, defaults to False
:return: The content of the documents that matched your query,
along with the source URL and doc_id if the citations flag is True
:rtype: List[str] if citations=False, otherwise List[Tuple[str, str, str]]
"""
if self.collection.is_empty:
@@ -170,14 +178,17 @@ class ZillizVectorDB(BaseVectorDB):
output_fields=output_fields,
)
doc_list = []
contexts = []
for query in query_result:
data = query[0]["entity"]
context = data["text"]
source = data["url"]
doc_id = data["doc_id"]
doc_list.append(tuple((context, source, doc_id)))
return doc_list
if citations:
source = data["url"]
doc_id = data["doc_id"]
contexts.append(tuple((context, source, doc_id)))
else:
contexts.append(context)
return contexts
def count(self) -> int:
"""