Add GPT4Vision Image loader (#1089)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
@@ -132,7 +132,6 @@ class ChromaDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Any:
         """
@@ -146,13 +145,8 @@ class ChromaDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the embeddings are assumed to be already generated.
-        :type skip_embedding: bool
         """
         size = len(documents)
-        if skip_embedding and (embeddings is None or len(embeddings) != len(documents)):
-            raise ValueError("Cannot add documents to chromadb with inconsistent embeddings")
-
         if len(documents) != size or len(metadatas) != size or len(ids) != size:
             raise ValueError(
                 "Cannot add documents to chromadb with inconsistent sizes. Documents size: {}, Metadata size: {},"
@@ -160,19 +154,11 @@ class ChromaDB(BaseVectorDB):
             )
 
         for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in chromadb"):
-            if skip_embedding:
-                self.collection.add(
-                    embeddings=embeddings[i : i + self.BATCH_SIZE],
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
-            else:
-                self.collection.add(
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
+            self.collection.add(
+                documents=documents[i : i + self.BATCH_SIZE],
+                metadatas=metadatas[i : i + self.BATCH_SIZE],
+                ids=ids[i : i + self.BATCH_SIZE],
+            )
 
     def _format_result(self, results: QueryResult) -> list[tuple[Document, float]]:
         """
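Note on the hunk above: with `skip_embedding` gone, `ChromaDB.add` always lets Chroma embed each batch, and the three parallel lists are sliced with the same window so they stay aligned. A minimal, runnable sketch of that slicing pattern (the `BATCH_SIZE` value here is a placeholder, not the class's real constant):

```python
# Sketch of the batch-slicing pattern in ChromaDB.add.
BATCH_SIZE = 100  # placeholder; the real class defines self.BATCH_SIZE

def iter_batches(documents, metadatas, ids, batch_size=BATCH_SIZE):
    # The same [i : i + batch_size] window keeps the three lists aligned.
    for i in range(0, len(documents), batch_size):
        yield (
            documents[i : i + batch_size],
            metadatas[i : i + batch_size],
            ids[i : i + batch_size],
        )

docs = [f"doc {n}" for n in range(250)]
metas = [{"n": n} for n in range(250)]
doc_ids = [f"id-{n}" for n in range(250)]
print(sum(1 for _ in iter_batches(docs, metas, doc_ids)))  # -> 3 batches
```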
@@ -197,7 +183,6 @@ class ChromaDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -210,8 +195,6 @@ class ChromaDB(BaseVectorDB):
         :type n_results: int
         :param where: to filter data
         :type where: Dict[str, Any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :raises InvalidDimensionException: Dimensions do not match.
@@ -220,24 +203,14 @@ class ChromaDB(BaseVectorDB):
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
         try:
-            if skip_embedding:
-                result = self.collection.query(
-                    query_embeddings=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
-            else:
-                result = self.collection.query(
-                    query_texts=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
+            result = self.collection.query(
+                query_texts=[
+                    input_query,
+                ],
+                n_results=n_results,
+                where=self._generate_where_clause(where),
+                **kwargs,
+            )
         except InvalidDimensionException as e:
             raise InvalidDimensionException(
                 e.message()
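Note: after this change the query path always passes raw text via `query_texts` and lets the collection embed it; the `query_embeddings` branch is gone. A runnable sketch of the simplified call shape against an in-memory chromadb client (collection name and documents are placeholders; the default embedding function must be available locally):

```python
import chromadb

client = chromadb.Client()  # in-memory instance
collection = client.get_or_create_collection("demo")  # placeholder name
collection.add(
    documents=["alpha doc", "beta doc"],
    metadatas=[{"app_id": "demo"}, {"app_id": "demo"}],
    ids=["id-1", "id-2"],
)
# Raw text in, Chroma embeds it; no query_embeddings branch anymore.
result = collection.query(query_texts=["alpha"], n_results=1)
print(result["documents"])
```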
@@ -114,7 +114,6 @@ class ElasticsearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ) -> Any:
         """
@@ -127,12 +126,9 @@ class ElasticsearchDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         """
 
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         for chunk in chunks(
             list(zip(ids, documents, metadatas, embeddings)), self.BATCH_SIZE, desc="Inserting batches in elasticsearch"
@@ -161,7 +157,6 @@ class ElasticsearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -174,8 +169,6 @@ class ElasticsearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :return: The context of the document that matched your query, url of the source, doc_id
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
@@ -183,11 +176,8 @@ class ElasticsearchDB(BaseVectorDB):
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if skip_embedding:
-            query_vector = input_query
-        else:
-            input_query_vector = self.embedder.embedding_fn(input_query)
-            query_vector = input_query_vector[0]
+        input_query_vector = self.embedder.embedding_fn(input_query)
+        query_vector = input_query_vector[0]
 
         # `https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-script-score-query.html`
         query = {
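Note: the insertion path here batches through a `chunks` helper that is not shown in this diff. A sketch of what such a generator could look like, assumed from the call site above (fixed-size slices plus a tqdm progress bar fed by `desc`):

```python
from typing import Any, Iterator, List, Sequence

from tqdm import tqdm

def chunks(items: Sequence[Any], batch_size: int, desc: str = "") -> Iterator[List[Any]]:
    # Yield fixed-size slices of `items`, driving a tqdm progress bar.
    for i in tqdm(range(0, len(items), batch_size), desc=desc):
        yield list(items[i : i + batch_size])

for batch in chunks(list(range(10)), 4, desc="demo"):
    print(batch)  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
```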
@@ -120,7 +120,6 @@ class OpenSearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add data in vector database.
@@ -130,17 +129,11 @@ class OpenSearchDB(BaseVectorDB):
             documents (List[str]): List of texts to add.
             metadatas (List[object]): List of metadata associated with docs.
             ids (List[str]): IDs of docs.
-            skip_embedding (bool): If True, then embeddings are assumed to be already generated.
         """
         for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
             batch_end = batch_start + self.BATCH_SIZE
             batch_documents = documents[batch_start:batch_end]
-
-            # Generate embeddings for the batch if not skipping embedding
-            if not skip_embedding:
-                batch_embeddings = self.embedder.embedding_fn(batch_documents)
-            else:
-                batch_embeddings = embeddings[batch_start:batch_end]
+            batch_embeddings = embeddings[batch_start:batch_end]
 
             # Create document entries for bulk upload
             batch_entries = [
@@ -166,7 +159,6 @@ class OpenSearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,15 +171,12 @@ class OpenSearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        # TODO(rupeshbansal, deshraj): Add support for skip embeddings here if already exists
        embeddings = OpenAIEmbeddings()
         docsearch = OpenSearchVectorSearch(
             index_name=self._get_index(),
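Note: the "Create document entries for bulk upload" step builds one dict per document from the aligned batch slices. A hedged sketch of a plausible entry shape; the field names (`"_index"`, `"text"`, `"embeddings"`, `"metadata"`) and the index name are assumptions for illustration, not confirmed by the diff:

```python
batch_ids = ["id-1", "id-2"]
batch_documents = ["alpha", "beta"]
batch_metadatas = [{"url": "a"}, {"url": "b"}]
batch_embeddings = [[0.1, 0.2], [0.3, 0.4]]

# One dict per document; field names here are guesses for illustration only.
batch_entries = [
    {
        "_index": "embedchain_store",  # placeholder index name
        "_id": doc_id,
        "text": text,
        "embeddings": embedding,
        "metadata": metadata,
    }
    for doc_id, text, metadata, embedding in zip(
        batch_ids, batch_documents, batch_metadatas, batch_embeddings
    )
]
print(len(batch_entries))  # -> 2
```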
@@ -92,7 +92,6 @@ class PineconeDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -124,7 +123,6 @@ class PineconeDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -136,18 +134,13 @@ class PineconeDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. if True, input_query is already embedded
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         data = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True, **kwargs)
         contexts = []
         for doc in data["matches"]:
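Note: `PineconeDB.query` now always embeds `input_query` before calling `self.client.query`. A runnable sketch of how the matches payload can be unpacked, assuming each match keeps the chunk text under `metadata["text"]` (an assumption; the diff only shows iteration over `data["matches"]`):

```python
# Stand-in response shaped like a Pinecone query result.
data = {
    "matches": [
        {"id": "doc-1", "score": 0.92, "metadata": {"text": "first chunk"}},
        {"id": "doc-2", "score": 0.87, "metadata": {"text": "second chunk"}},
    ]
}
contexts = [match["metadata"]["text"] for match in data["matches"]]
print(contexts)
```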
@@ -126,7 +126,6 @@ class QdrantDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -138,12 +137,8 @@ class QdrantDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         payloads = []
         qdrant_ids = []
@@ -167,7 +162,6 @@ class QdrantDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,20 +173,13 @@ class QdrantDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
-
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
 
         qdrant_must_filters = []
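Note: `QdrantDB.query` derives its filter from the keys of `where`. A sketch of one way to turn that dict into Qdrant "must" conditions using the qdrant_client models; the one-to-one key-to-payload mapping is an assumption based on the `qdrant_must_filters` name above:

```python
from qdrant_client.http import models

where = {"app_id": "demo"}  # placeholder filter
qdrant_must_filters = [
    models.FieldCondition(key=key, match=models.MatchValue(value=value))
    for key, value in (where or {}).items()
]
query_filter = models.Filter(must=qdrant_must_filters)
print(query_filter)
```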
@@ -157,7 +157,6 @@ class WeaviateDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -169,14 +168,8 @@ class WeaviateDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
-
-        print("Adding documents to Weaviate...")
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
         self.client.batch.configure(batch_size=self.BATCH_SIZE, timeout_retries=3)  # Configure batch
         with self.client.batch as batch:  # Initialize a batch process
             for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
@@ -202,7 +195,6 @@ class WeaviateDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -214,20 +206,13 @@ class WeaviateDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
-
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
         data_fields = ["text"]
 
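Note: `WeaviateDB.add` configures the client batch and streams one object per `(id, text, metadata, embedding)` tuple through it. A sketch of that batch pattern with the weaviate-client v3 API; the URL, class name `Document`, properties, and vector are placeholders, and a reachable Weaviate instance is required to actually run it:

```python
import weaviate

client = weaviate.Client("http://localhost:8080")  # placeholder URL
client.batch.configure(batch_size=100, timeout_retries=3)
with client.batch as batch:
    # One object per (text, metadata, vector) triple, mirroring the zip above.
    batch.add_data_object(
        data_object={"text": "hello world"},  # placeholder properties
        class_name="Document",                # placeholder class
        vector=[0.1, 0.2, 0.3],
    )
```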
@@ -112,12 +112,10 @@ class ZillizVectorDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add to database"""
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         for id, doc, metadata, embedding in zip(ids, documents, metadatas, embeddings):
             data = {**metadata, "id": id, "text": doc, "embeddings": embedding}
@@ -132,7 +130,6 @@ class ZillizVectorDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -160,27 +157,16 @@ class ZillizVectorDB(BaseVectorDB):
             where = None
 
         output_fields = ["*"]
-
-        if skip_embedding:
-            query_vector = input_query
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=query_vector,
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
-        else:
-            input_query_vector = self.embedder.embedding_fn([input_query])
-            query_vector = input_query_vector[0]
-
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=[query_vector],
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
+        input_query_vector = self.embedder.embedding_fn([input_query])
+        query_vector = input_query_vector[0]
+
+        query_result = self.client.search(
+            collection_name=self.config.collection_name,
+            data=[query_vector],
+            limit=n_results,
+            output_fields=output_fields,
+            **kwargs,
+        )
         query_result = query_result[0]
         contexts = []
         for query in query_result:
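Note: `ZillizVectorDB.query` now always embeds the query, wraps the vector in a list for `client.search`, and then unwraps `query_result[0]` since there is a single query vector. A sketch of that call shape with pymilvus's `MilvusClient` (Zilliz is hosted Milvus); the uri, token, collection name, and vector are placeholders, and a reachable Zilliz/Milvus instance is required:

```python
from pymilvus import MilvusClient

client = MilvusClient(uri="https://example.zillizcloud.com", token="***")  # placeholders
query_result = client.search(
    collection_name="embedchain_store",  # placeholder name
    data=[[0.1, 0.2, 0.3]],  # one query vector, hence the [query_vector] wrap
    limit=3,
    output_fields=["*"],
)
hits = query_result[0]  # results for the first (and only) query vector
print(hits)
```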