[Feature] Update db.query to return source of context (#831)

2023-10-25 22:20:32 -07:00
parent a27eeb3255
commit d77e8da3f3
13 changed files with 195 additions and 73 deletions
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -500,13 +500,17 @@ class EmbedChain(JSONSerializable):

            db_query = ClipProcessor.get_text_features(query=input_query)

-        contents = self.db.query(
+        contexts = self.db.query(
            input_query=db_query,
            n_results=query_config.number_documents,
            where=where,
            skip_embedding=(hasattr(config, "query_type") and config.query_type == "Images"),
        )
-        return contents
+
+        if len(contexts) > 0 and isinstance(contexts[0], tuple):
+            contexts = list(map(lambda x: x[0], contexts))
+
+        return contexts

    def query(self, input_query: str, config: BaseLlmConfig = None, dry_run=False, where: Optional[Dict] = None) -> str:
        """
--- a/embedchain/factory.py
+++ b/embedchain/factory.py
@@ -41,15 +41,15 @@ class LlmFactory:

 class EmbedderFactory:
    provider_to_class = {
+        "azure_openai": "embedchain.embedder.openai.OpenAIEmbedder",
        "gpt4all": "embedchain.embedder.gpt4all.GPT4AllEmbedder",
        "huggingface": "embedchain.embedder.huggingface.HuggingFaceEmbedder",
-        "vertexai": "embedchain.embedder.vertexai.VertexAIEmbedder",
-        "azure_openai": "embedchain.embedder.openai.OpenAIEmbedder",
        "openai": "embedchain.embedder.openai.OpenAIEmbedder",
+        "vertexai": "embedchain.embedder.vertexai.VertexAIEmbedder",
    }
    provider_to_config_class = {
-        "openai": "embedchain.config.embedder.base.BaseEmbedderConfig",
        "azure_openai": "embedchain.config.embedder.base.BaseEmbedderConfig",
+        "openai": "embedchain.config.embedder.base.BaseEmbedderConfig",
    }

    @classmethod
@@ -72,16 +72,18 @@ class VectorDBFactory:
        "elasticsearch": "embedchain.vectordb.elasticsearch.ElasticsearchDB",
        "opensearch": "embedchain.vectordb.opensearch.OpenSearchDB",
        "pinecone": "embedchain.vectordb.pinecone.PineconeDB",
-        "weaviate": "embedchain.vectordb.weaviate.WeaviateDB",
        "qdrant": "embedchain.vectordb.qdrant.QdrantDB",
+        "weaviate": "embedchain.vectordb.weaviate.WeaviateDB",
+        "zilliz": "embedchain.vectordb.zilliz.ZillizVectorDB",
    }
    provider_to_config_class = {
        "chroma": "embedchain.config.vectordb.chroma.ChromaDbConfig",
        "elasticsearch": "embedchain.config.vectordb.elasticsearch.ElasticsearchDBConfig",
        "opensearch": "embedchain.config.vectordb.opensearch.OpenSearchDBConfig",
        "pinecone": "embedchain.config.vectordb.pinecone.PineconeDBConfig",
-        "weaviate": "embedchain.config.vectordb.weaviate.WeaviateDBConfig",
        "qdrant": "embedchain.config.vectordb.qdrant.QdrantDBConfig",
+        "weaviate": "embedchain.config.vectordb.weaviate.WeaviateDBConfig",
+        "zilliz": "embedchain.config.vectordb.zilliz.ZillizDBConfig",
    }

    @classmethod
--- a/embedchain/vectordb/chroma.py
+++ b/embedchain/vectordb/chroma.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 from chromadb import Collection, QueryResult
 from langchain.docstore.document import Document
@@ -191,7 +191,9 @@ class ChromaDB(BaseVectorDB):
            )
        ]

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        Query contents from vector database based on vector similarity

@@ -204,8 +206,8 @@ class ChromaDB(BaseVectorDB):
        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
        :type skip_embedding: bool
        :raises InvalidDimensionException: Dimensions do not match.
-        :return: The content of the document that matched your query.
-        :rtype: List[str]
+        :return: The content of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """
        try:
            if skip_embedding:
@@ -231,8 +233,14 @@ class ChromaDB(BaseVectorDB):
                " embeddings, is used to retrieve an embedding from the database."
            ) from None
        results_formatted = self._format_result(result)
-        contents = [result[0].page_content for result in results_formatted]
-        return contents
+        contexts = []
+        for result in results_formatted:
+            context = result[0].page_content
+            metadata = result[0].metadata
+            source = metadata["url"]
+            doc_id = metadata["doc_id"]
+            contexts.append((context, source, doc_id))
+        return contexts

    def set_collection_name(self, name: str):
        """
--- a/embedchain/vectordb/elasticsearch.py
+++ b/embedchain/vectordb/elasticsearch.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple

 try:
    from elasticsearch import Elasticsearch
@@ -135,7 +135,9 @@ class ElasticsearchDB(BaseVectorDB):
        bulk(self.client, docs)
        self.client.indices.refresh(index=self._get_index())

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        query contents from vector data base based on vector similarity

@@ -147,8 +149,9 @@ class ElasticsearchDB(BaseVectorDB):
        :type where: Dict[str, any]
        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
        :type skip_embedding: bool
-        :return: Database contents that are the result of the query
-        :rtype: List[str]
+        :return: The context of the document that matched your query, url of the source, doc_id
+
+        :rtype: List[Tuple[str,str,str]]
        """
        if skip_embedding:
            query_vector = input_query
@@ -156,6 +159,7 @@ class ElasticsearchDB(BaseVectorDB):
            input_query_vector = self.embedder.embedding_fn(input_query)
            query_vector = input_query_vector[0]

+        # `https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-script-score-query.html`
        query = {
            "script_score": {
                "query": {"bool": {"must": [{"exists": {"field": "text"}}]}},
@@ -167,11 +171,17 @@ class ElasticsearchDB(BaseVectorDB):
        }
        if "app_id" in where:
            app_id = where["app_id"]
-            query["script_score"]["query"]["bool"]["must"] = [{"term": {"metadata.app_id": app_id}}]
-        _source = ["text"]
+            query["script_score"]["query"] = {"match": {"metadata.app_id": app_id}}
+        _source = ["text", "metadata.url", "metadata.doc_id"]
        response = self.client.search(index=self._get_index(), query=query, _source=_source, size=n_results)
        docs = response["hits"]["hits"]
-        contents = [doc["_source"]["text"] for doc in docs]
+        contents = []
+        for doc in docs:
+            context = doc["_source"]["text"]
+            metadata = doc["_source"]["metadata"]
+            source = metadata["url"]
+            doc_id = metadata["doc_id"]
+            contents.append(tuple((context, source, doc_id)))
        return contents

    def set_collection_name(self, name: str):
--- a/embedchain/vectordb/opensearch.py
+++ b/embedchain/vectordb/opensearch.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple

 try:
    from opensearchpy import OpenSearch
@@ -145,7 +145,9 @@ class OpenSearchDB(BaseVectorDB):
        bulk(self.client, docs)
        self.client.indices.refresh(index=self._get_index())

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        query contents from vector data base based on vector similarity

@@ -157,8 +159,8 @@ class OpenSearchDB(BaseVectorDB):
        :type where: Dict[str, any]
        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
        :type skip_embedding: bool
-        :return: Database contents that are the result of the query
-        :rtype: List[str]
+        :return: The content of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """
        # TODO(rupeshbansal, deshraj): Add support for skip embeddings here if already exists
        embeddings = OpenAIEmbeddings()
@@ -185,7 +187,13 @@ class OpenSearchDB(BaseVectorDB):
            pre_filter=pre_filter,
            k=n_results,
        )
-        contents = [doc.page_content for doc in docs]
+
+        contents = []
+        for doc in docs:
+            context = doc.page_content
+            source = doc.metadata["url"]
+            doc_id = doc.metadata["doc_id"]
+            contents.append(tuple((context, source, doc_id)))
        return contents

    def set_collection_name(self, name: str):
--- a/embedchain/vectordb/pinecone.py
+++ b/embedchain/vectordb/pinecone.py
@@ -1,5 +1,5 @@
 import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 try:
    import pinecone
@@ -118,7 +118,9 @@ class PineconeDB(BaseVectorDB):
        for i in range(0, len(docs), self.BATCH_SIZE):
            self.client.upsert(docs[i : i + self.BATCH_SIZE])

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        query contents from vector database based on vector similarity
        :param input_query: list of query string
@@ -129,16 +131,22 @@ class PineconeDB(BaseVectorDB):
        :type where: Dict[str, any]
        :param skip_embedding: Optional. if True, input_query is already embedded
        :type skip_embedding: bool
-        :return: Database contents that are the result of the query
-        :rtype: List[str]
+        :return: The content of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """
        if not skip_embedding:
            query_vector = self.embedder.embedding_fn([input_query])[0]
        else:
            query_vector = input_query
-        contents = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True)
-        embeddings = list(map(lambda content: content["metadata"]["text"], contents["matches"]))
-        return embeddings
+        data = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True)
+        contents = []
+        for doc in data["matches"]:
+            metadata = doc["metadata"]
+            context = metadata["text"]
+            source = metadata["url"]
+            doc_id = metadata["doc_id"]
+            contents.append(tuple((context, source, doc_id)))
+        return contents

    def set_collection_name(self, name: str):
        """
--- a/embedchain/vectordb/qdrant.py
+++ b/embedchain/vectordb/qdrant.py
@@ -1,7 +1,7 @@
 import copy
 import os
 import uuid
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 try:
    from qdrant_client import QdrantClient
@@ -160,7 +160,9 @@ class QdrantDB(BaseVectorDB):
                ),
            )

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        query contents from vector database based on vector similarity
        :param input_query: list of query string
@@ -172,8 +174,8 @@ class QdrantDB(BaseVectorDB):
        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
        generated or not
        :type skip_embedding: bool
-        :return: Database contents that are the result of the query
-        :rtype: List[str]
+        :return: The context of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """
        if not skip_embedding:
            query_vector = self.embedder.embedding_fn([input_query])[0]
@@ -199,9 +201,14 @@ class QdrantDB(BaseVectorDB):
            query_vector=query_vector,
            limit=n_results,
        )
+
        response = []
        for result in results:
-            response.append(result.payload.get("text", ""))
+            context = result.payload["text"]
+            metadata = result.payload["metadata"]
+            source = metadata["url"]
+            doc_id = metadata["doc_id"]
+            response.append(tuple((context, source, doc_id)))
        return response

    def count(self) -> int:
@@ -211,3 +218,15 @@ class QdrantDB(BaseVectorDB):
    def reset(self):
        self.client.delete_collection(collection_name=self.collection_name)
        self._initialize()
+
+    def set_collection_name(self, name: str):
+        """
+        Set the name of the collection. A collection is an isolated space for vectors.
+
+        :param name: Name of the collection.
+        :type name: str
+        """
+        if not isinstance(name, str):  # guard: anything other than a str is rejected with TypeError
+            raise TypeError("Collection name must be a string")
+        self.config.collection_name = name  # record the new name on the config before re-resolving the collection
+        self.collection_name = self._get_or_create_collection()  # NOTE(review): presumably the helper reads config.collection_name - confirm
--- a/embedchain/vectordb/weaviate.py
+++ b/embedchain/vectordb/weaviate.py
@@ -1,6 +1,6 @@
 import copy
 import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 try:
    import weaviate
@@ -194,7 +194,9 @@ class WeaviateDB(BaseVectorDB):
                )
                batch.add_reference(obj_uuid, self.index_name, "metadata", metadata_uuid, self.index_name + "_metadata")

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        query contents from vector database based on vector similarity
        :param input_query: list of query string
@@ -206,14 +208,15 @@ class WeaviateDB(BaseVectorDB):
        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
        generated or not
        :type skip_embedding: bool
-        :return: Database contents that are the result of the query
-        :rtype: List[str]
+        :return: The context of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """
        if not skip_embedding:
            query_vector = self.embedder.embedding_fn([input_query])[0]
        else:
            query_vector = input_query
        keys = set(where.keys() if where is not None else set())
+        data_fields = ["text"]
        if len(keys.intersection(self.metadata_keys)) != 0:
            weaviate_where_operands = []
            for key in keys:
@@ -231,7 +234,7 @@ class WeaviateDB(BaseVectorDB):
                weaviate_where_clause = {"operator": "And", "operands": weaviate_where_operands}

            results = (
-                self.client.query.get(self.index_name, ["text"])
+                self.client.query.get(self.index_name, data_fields)
                .with_where(weaviate_where_clause)
                .with_near_vector({"vector": query_vector})
                .with_limit(n_results)
@@ -239,16 +242,13 @@ class WeaviateDB(BaseVectorDB):
            )
        else:
            results = (
-                self.client.query.get(self.index_name, ["text"])
+                self.client.query.get(self.index_name, data_fields)
                .with_near_vector({"vector": query_vector})
                .with_limit(n_results)
                .do()
            )
-        matched_tokens = []
-        for result in results["data"]["Get"].get(self.index_name):
-            matched_tokens.append(result["text"])
-
-        return matched_tokens
+        contexts = results["data"]["Get"].get(self.index_name)
+        return contexts

    def set_collection_name(self, name: str):
        """
--- a/embedchain/vectordb/zilliz.py
+++ b/embedchain/vectordb/zilliz.py
@@ -1,4 +1,5 @@
-from typing import Dict, List, Optional
+import logging
+from typing import Dict, List, Optional, Tuple

 from embedchain.config import ZillizDBConfig
 from embedchain.helper.json_serializable import register_deserializable
@@ -61,6 +62,7 @@ class ZillizVectorDB(BaseVectorDB):
        :type name: str
        """
        if utility.has_collection(name):
+            logging.info(f"[ZillizDB]: found an existing collection {name}, make sure the auto-id is disabled.")
            self.collection = Collection(name)
        else:
            fields = [
@@ -124,7 +126,9 @@ class ZillizVectorDB(BaseVectorDB):
        self.collection.flush()
        self.client.flush(self.config.collection_name)

-    def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
+    def query(
+        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
+    ) -> List[Tuple[str, str, str]]:
        """
        Query contents from vector data base based on vector similarity

@@ -135,8 +139,8 @@ class ZillizVectorDB(BaseVectorDB):
        :param where: to filter data
        :type where: str
        :raises InvalidDimensionException: Dimensions do not match.
-        :return: The content of the document that matched your query.
-        :rtype: List[str]
+        :return: The context of the document that matched your query, url of the source, doc_id
+        :rtype: List[Tuple[str,str,str]]
        """

        if self.collection.is_empty:
@@ -145,13 +149,14 @@ class ZillizVectorDB(BaseVectorDB):
        if not isinstance(where, str):
            where = None

+        output_fields = ["text", "url", "doc_id"]
        if skip_embedding:
            query_vector = input_query
            query_result = self.client.search(
                collection_name=self.config.collection_name,
                data=query_vector,
                limit=n_results,
-                output_fields=["text"],
+                output_fields=output_fields,
            )

        else:
@@ -162,13 +167,16 @@ class ZillizVectorDB(BaseVectorDB):
                collection_name=self.config.collection_name,
                data=[query_vector],
                limit=n_results,
-                output_fields=["text"],
+                output_fields=output_fields,
            )

        doc_list = []
        for query in query_result:
-            doc_list.append(query[0]["entity"]["text"])
-
+            data = query[0]["entity"]
+            context = data["text"]
+            source = data["url"]
+            doc_id = data["doc_id"]
+            doc_list.append(tuple((context, source, doc_id)))
        return doc_list

    def count(self) -> int: