Add GPT4Vision Image loader (#1089)

commit c62663f2e4 (parent 367d6b70e2)
Author: Sidharth Mohanty
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
Date: 2024-01-02 03:57:23 +05:30
Committed by: GitHub
29 changed files with 291 additions and 714 deletions
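
Alongside the new image loader, the diff below drops the `skip_embedding` flag from every vector database integration, so `add` and `query` always embed through the configured embedder. A minimal sketch of the shared interface after the change, assembled from the hunks that follow (method bodies are stand-ins, not the real implementations):

    # Sketch of the interface the vector DB classes share after this commit.
    # Parameter names are taken from the hunks below; the bodies are placeholders.
    from typing import Any, Dict, List, Optional, Tuple, Union

    class VectorDBSketch:
        def add(
            self,
            documents: List[str],
            metadatas: List[object],
            ids: List[str],
            **kwargs: Optional[Dict[str, Any]],
        ) -> Any:
            """Embeddings are always produced via self.embedder.embedding_fn(documents)."""

        def query(
            self,
            input_query: List[str],
            n_results: int,
            where: Dict[str, Any],
            citations: bool = False,
            **kwargs: Optional[Dict[str, Any]],
        ) -> Union[List[Tuple[str, Dict]], List[str]]:
            """The query is embedded internally; citations=True also returns source metadata."""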

@@ -132,7 +132,6 @@ class ChromaDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Any:
         """
@@ -146,13 +145,8 @@ class ChromaDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the embeddings are assumed to be already generated.
-        :type skip_embedding: bool
         """
         size = len(documents)
-        if skip_embedding and (embeddings is None or len(embeddings) != len(documents)):
-            raise ValueError("Cannot add documents to chromadb with inconsistent embeddings")
         if len(documents) != size or len(metadatas) != size or len(ids) != size:
             raise ValueError(
                 "Cannot add documents to chromadb with inconsistent sizes. Documents size: {}, Metadata size: {},"
@@ -160,19 +154,11 @@ class ChromaDB(BaseVectorDB):
             )
         for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in chromadb"):
-            if skip_embedding:
-                self.collection.add(
-                    embeddings=embeddings[i : i + self.BATCH_SIZE],
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
-            else:
-                self.collection.add(
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
+            self.collection.add(
+                documents=documents[i : i + self.BATCH_SIZE],
+                metadatas=metadatas[i : i + self.BATCH_SIZE],
+                ids=ids[i : i + self.BATCH_SIZE],
+            )

     def _format_result(self, results: QueryResult) -> list[tuple[Document, float]]:
         """
@@ -197,7 +183,6 @@ class ChromaDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -210,8 +195,6 @@ class ChromaDB(BaseVectorDB):
         :type n_results: int
         :param where: to filter data
         :type where: Dict[str, Any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :raises InvalidDimensionException: Dimensions do not match.
@@ -220,24 +203,14 @@ class ChromaDB(BaseVectorDB):
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
         try:
-            if skip_embedding:
-                result = self.collection.query(
-                    query_embeddings=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
-            else:
-                result = self.collection.query(
-                    query_texts=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
+            result = self.collection.query(
+                query_texts=[
+                    input_query,
+                ],
+                n_results=n_results,
+                where=self._generate_where_clause(where),
+                **kwargs,
+            )
         except InvalidDimensionException as e:
             raise InvalidDimensionException(
                 e.message()
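
With `query_embeddings` gone, ChromaDB receives raw text and embeds it via the embedding function attached to the collection. A standalone sketch of that behavior (collection name, documents, and query are illustrative; embedchain wires its own client settings and embedder):

    import chromadb

    client = chromadb.Client()  # in-memory client; embedchain supplies its own settings
    collection = client.get_or_create_collection(name="demo")  # uses the default embedding function

    # add() without an `embeddings` argument: Chroma embeds the documents itself
    collection.add(
        documents=["embedchain indexes data for retrieval"],
        metadatas=[{"source": "example"}],
        ids=["doc-1"],
    )

    # query() with `query_texts`: the query string is embedded the same way
    result = collection.query(query_texts=["what does embedchain do?"], n_results=1)
    print(result["documents"])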

@@ -114,7 +114,6 @@ class ElasticsearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ) -> Any:
         """
@@ -127,12 +126,9 @@ class ElasticsearchDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         """
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)

         for chunk in chunks(
             list(zip(ids, documents, metadatas, embeddings)), self.BATCH_SIZE, desc="Inserting batches in elasticsearch"
@@ -161,7 +157,6 @@ class ElasticsearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -174,8 +169,6 @@ class ElasticsearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :return: The context of the document that matched your query, url of the source, doc_id
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
@@ -183,11 +176,8 @@ class ElasticsearchDB(BaseVectorDB):
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if skip_embedding:
-            query_vector = input_query
-        else:
-            input_query_vector = self.embedder.embedding_fn(input_query)
-            query_vector = input_query_vector[0]
+        input_query_vector = self.embedder.embedding_fn(input_query)
+        query_vector = input_query_vector[0]
         # `https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-script-score-query.html`
         query = {
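
The `query_vector` feeds a `script_score` query of the kind described in the Elastic docs linked above. A hedged sketch of such a request body (the `embeddings` field name and cosine scoring are assumptions based on that documentation page, not on this diff):

    # Hypothetical request body; the `embeddings` field name is an assumption.
    query_vector = [0.1, 0.2, 0.3]  # stands in for self.embedder.embedding_fn(input_query)[0]
    query = {
        "script_score": {
            "query": {"match_all": {}},  # the real code applies the `where` filter here
            "script": {
                # cosineSimilarity returns [-1, 1]; adding 1.0 keeps scores non-negative
                "source": "cosineSimilarity(params.input_query_vector, 'embeddings') + 1.0",
                "params": {"input_query_vector": query_vector},
            },
        }
    }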

@@ -120,7 +120,6 @@ class OpenSearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add data in vector database.
@@ -130,17 +129,11 @@ class OpenSearchDB(BaseVectorDB):
             documents (List[str]): List of texts to add.
             metadatas (List[object]): List of metadata associated with docs.
             ids (List[str]): IDs of docs.
-            skip_embedding (bool): If True, then embeddings are assumed to be already generated.
         """
         for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
             batch_end = batch_start + self.BATCH_SIZE
             batch_documents = documents[batch_start:batch_end]
-            # Generate embeddings for the batch if not skipping embedding
-            if not skip_embedding:
-                batch_embeddings = self.embedder.embedding_fn(batch_documents)
-            else:
-                batch_embeddings = embeddings[batch_start:batch_end]
+            batch_embeddings = embeddings[batch_start:batch_end]

             # Create document entries for bulk upload
             batch_entries = [
@@ -166,7 +159,6 @@ class OpenSearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,15 +171,12 @@ class OpenSearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        # TODO(rupeshbansal, deshraj): Add support for skip embeddings here if already exists
         embeddings = OpenAIEmbeddings()
         docsearch = OpenSearchVectorSearch(
             index_name=self._get_index(),
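
The query path hands off to LangChain's `OpenSearchVectorSearch` driven by `OpenAIEmbeddings`, which embeds each query at search time. A rough sketch of that pairing (URL and index name are placeholders; import locations vary across LangChain versions, only the class names come from the diff):

    # Sketch only: URL, credentials, and index name are placeholders.
    from langchain_community.embeddings import OpenAIEmbeddings
    from langchain_community.vectorstores import OpenSearchVectorSearch

    docsearch = OpenSearchVectorSearch(
        opensearch_url="https://localhost:9200",
        index_name="embedchain_store",          # placeholder for self._get_index()
        embedding_function=OpenAIEmbeddings(),  # embeds the query on every search call
    )

    # similarity_search embeds the query text and runs a k-NN search on the index
    docs = docsearch.similarity_search("what does embedchain do?", k=3)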

@@ -92,7 +92,6 @@ class PineconeDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -124,7 +123,6 @@ class PineconeDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -136,18 +134,13 @@ class PineconeDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. if True, input_query is already embedded
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         data = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True, **kwargs)
         contexts = []
         for doc in data["matches"]:
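
`include_metadata=True` makes every entry of `data["matches"]` carry its stored metadata, which is where the context text and citation fields come from. A hedged sketch of consuming that response (the `text` metadata key and the citation shape are assumptions; the diff truncates before the loop body):

    # Hypothetical continuation of the truncated loop; metadata keys are assumptions.
    citations = True
    data = {  # stub standing in for self.client.query(..., include_metadata=True)
        "matches": [{"metadata": {"text": "example chunk", "url": "https://example.com"}, "score": 0.87}]
    }

    contexts = []
    for doc in data["matches"]:
        metadata = doc["metadata"]          # present because include_metadata=True
        context = metadata.get("text", "")  # assumed key holding the chunk text
        if citations:
            contexts.append((context, {**metadata, "score": doc["score"]}))
        else:
            contexts.append(context)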

@@ -126,7 +126,6 @@ class QdrantDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -138,12 +137,8 @@ class QdrantDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)

         payloads = []
         qdrant_ids = []
@@ -167,7 +162,6 @@ class QdrantDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,20 +173,13 @@ class QdrantDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
         qdrant_must_filters = []
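
Each `where` key becomes a Qdrant `must` condition. A plausible sketch of that conversion and the follow-up search (payload key layout and the collection name are assumptions; the diff cuts off before the filter bodies):

    # Hypothetical reconstruction of the truncated filter-building step.
    from qdrant_client import QdrantClient, models

    client = QdrantClient(":memory:")  # placeholder client
    client.create_collection(
        collection_name="demo",
        vectors_config=models.VectorParams(size=3, distance=models.Distance.COSINE),
    )
    where = {"app_id": "demo"}      # example caller-supplied filter
    query_vector = [0.1, 0.2, 0.3]  # stands in for self.embedder.embedding_fn([input_query])[0]

    qdrant_must_filters = [
        models.FieldCondition(key=key, match=models.MatchValue(value=value))
        for key, value in where.items()
    ]
    results = client.search(
        collection_name="demo",
        query_vector=query_vector,
        query_filter=models.Filter(must=qdrant_must_filters),
        limit=3,
    )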

@@ -157,7 +157,6 @@ class WeaviateDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -169,14 +168,8 @@ class WeaviateDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
         print("Adding documents to Weaviate...")
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)

         self.client.batch.configure(batch_size=self.BATCH_SIZE, timeout_retries=3)  # Configure batch
         with self.client.batch as batch:  # Initialize a batch process
             for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
@@ -202,7 +195,6 @@ class WeaviateDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -214,20 +206,13 @@ class WeaviateDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
         data_fields = ["text"]
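
Because embeddings are computed up front in `add`, each object is pushed with an explicit vector inside the batch context shown above. A minimal sketch with the v3 Weaviate Python client (class name, payload, and vector are placeholders):

    # Sketch with the Weaviate v3 Python client; names and values are placeholders.
    import weaviate

    client = weaviate.Client("http://localhost:8080")
    client.batch.configure(batch_size=100, timeout_retries=3)

    with client.batch as batch:
        batch.add_data_object(
            data_object={"text": "example chunk"},  # payload fields, queried via data_fields
            class_name="EmbedchainDemo",            # placeholder class name
            uuid=None,                              # let Weaviate assign an id
            vector=[0.1, 0.2, 0.3],                 # precomputed embedding, as in add() above
        )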

@@ -112,12 +112,10 @@ class ZillizVectorDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add to database"""
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)

         for id, doc, metadata, embedding in zip(ids, documents, metadatas, embeddings):
             data = {**metadata, "id": id, "text": doc, "embeddings": embedding}
@@ -132,7 +130,6 @@ class ZillizVectorDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -160,27 +157,16 @@ class ZillizVectorDB(BaseVectorDB):
             where = None

         output_fields = ["*"]
-        if skip_embedding:
-            query_vector = input_query
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=query_vector,
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
+        input_query_vector = self.embedder.embedding_fn([input_query])
+        query_vector = input_query_vector[0]
-        else:
-            input_query_vector = self.embedder.embedding_fn([input_query])
-            query_vector = input_query_vector[0]
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=[query_vector],
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
+        query_result = self.client.search(
+            collection_name=self.config.collection_name,
+            data=[query_vector],
+            limit=n_results,
+            output_fields=output_fields,
+            **kwargs,
+        )

         query_result = query_result[0]
         contexts = []
         for query in query_result:
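
`MilvusClient.search` returns one hit list per query vector, hence `query_result = query_result[0]`. A hedged sketch of reading those hits (the `text` field follows the `data` dict built in `add` above; the hit layout and citation shape are assumptions and may differ across pymilvus versions):

    # Hypothetical continuation; hit layout follows MilvusClient.search in pymilvus 2.x.
    citations = False
    query_result = [  # stub standing in for self.client.search(...)[0]
        {"id": "doc-1", "distance": 0.12, "entity": {"text": "example chunk", "url": "https://example.com"}},
    ]

    contexts = []
    for hit in query_result:
        entity = hit.get("entity", {})    # stored fields requested via output_fields
        context = entity.get("text", "")  # assumed key holding the chunk text
        if citations:
            contexts.append((context, {**entity, "score": hit.get("distance")}))
        else:
            contexts.append(context)
    print(contexts)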