Add GPT4Vision Image loader (#1089)
Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
@@ -132,7 +132,6 @@ class ChromaDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Any:
         """
@@ -146,13 +145,8 @@ class ChromaDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the embeddings are assumed to be already generated.
-        :type skip_embedding: bool
         """
         size = len(documents)
-        if skip_embedding and (embeddings is None or len(embeddings) != len(documents)):
-            raise ValueError("Cannot add documents to chromadb with inconsistent embeddings")
-
         if len(documents) != size or len(metadatas) != size or len(ids) != size:
             raise ValueError(
                 "Cannot add documents to chromadb with inconsistent sizes. Documents size: {}, Metadata size: {},"
@@ -160,19 +154,11 @@ class ChromaDB(BaseVectorDB):
             )
 
         for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in chromadb"):
-            if skip_embedding:
-                self.collection.add(
-                    embeddings=embeddings[i : i + self.BATCH_SIZE],
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
-            else:
-                self.collection.add(
-                    documents=documents[i : i + self.BATCH_SIZE],
-                    metadatas=metadatas[i : i + self.BATCH_SIZE],
-                    ids=ids[i : i + self.BATCH_SIZE],
-                )
+            self.collection.add(
+                documents=documents[i : i + self.BATCH_SIZE],
+                metadatas=metadatas[i : i + self.BATCH_SIZE],
+                ids=ids[i : i + self.BATCH_SIZE],
+            )
 
     def _format_result(self, results: QueryResult) -> list[tuple[Document, float]]:
         """
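Note on the hunk above: with `skip_embedding` gone, `ChromaDB.add` always lets Chroma embed each batch, and the three parallel lists are sliced with the same window so they stay aligned. A minimal, runnable sketch of that slicing pattern (the `BATCH_SIZE` value here is a placeholder, not the class's real constant):

```python
# Sketch of the batch-slicing pattern in ChromaDB.add.
BATCH_SIZE = 100  # placeholder; the real class defines self.BATCH_SIZE

def iter_batches(documents, metadatas, ids, batch_size=BATCH_SIZE):
    # The same [i : i + batch_size] window keeps the three lists aligned.
    for i in range(0, len(documents), batch_size):
        yield (
            documents[i : i + batch_size],
            metadatas[i : i + batch_size],
            ids[i : i + batch_size],
        )

docs = [f"doc {n}" for n in range(250)]
metas = [{"n": n} for n in range(250)]
doc_ids = [f"id-{n}" for n in range(250)]
print(sum(1 for _ in iter_batches(docs, metas, doc_ids)))  # -> 3 batches
```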
@@ -197,7 +183,6 @@ class ChromaDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -210,8 +195,6 @@ class ChromaDB(BaseVectorDB):
         :type n_results: int
         :param where: to filter data
         :type where: Dict[str, Any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :raises InvalidDimensionException: Dimensions do not match.
@@ -220,24 +203,14 @@ class ChromaDB(BaseVectorDB):
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
         try:
-            if skip_embedding:
-                result = self.collection.query(
-                    query_embeddings=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
-            else:
-                result = self.collection.query(
-                    query_texts=[
-                        input_query,
-                    ],
-                    n_results=n_results,
-                    where=self._generate_where_clause(where),
-                    **kwargs,
-                )
+            result = self.collection.query(
+                query_texts=[
+                    input_query,
+                ],
+                n_results=n_results,
+                where=self._generate_where_clause(where),
+                **kwargs,
+            )
         except InvalidDimensionException as e:
             raise InvalidDimensionException(
                 e.message()
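Note: after this change the query path always passes raw text via `query_texts` and lets the collection embed it; the `query_embeddings` branch is gone. A runnable sketch of the simplified call shape against an in-memory chromadb client (collection name and documents are placeholders; the default embedding function must be available locally):

```python
import chromadb

client = chromadb.Client()  # in-memory instance
collection = client.get_or_create_collection("demo")  # placeholder name
collection.add(
    documents=["alpha doc", "beta doc"],
    metadatas=[{"app_id": "demo"}, {"app_id": "demo"}],
    ids=["id-1", "id-2"],
)
# Raw text in, Chroma embeds it; no query_embeddings branch anymore.
result = collection.query(query_texts=["alpha"], n_results=1)
print(result["documents"])
```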
@@ -114,7 +114,6 @@ class ElasticsearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ) -> Any:
         """
@@ -127,12 +126,9 @@ class ElasticsearchDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         """
 
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         for chunk in chunks(
             list(zip(ids, documents, metadatas, embeddings)), self.BATCH_SIZE, desc="Inserting batches in elasticsearch"
@@ -161,7 +157,6 @@ class ElasticsearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -174,8 +169,6 @@ class ElasticsearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :return: The context of the document that matched your query, url of the source, doc_id
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
@@ -183,11 +176,8 @@ class ElasticsearchDB(BaseVectorDB):
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if skip_embedding:
-            query_vector = input_query
-        else:
-            input_query_vector = self.embedder.embedding_fn(input_query)
-            query_vector = input_query_vector[0]
+        input_query_vector = self.embedder.embedding_fn(input_query)
+        query_vector = input_query_vector[0]
 
         # `https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-script-score-query.html`
         query = {
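Note: the insertion path here batches through a `chunks` helper that is not shown in this diff. A sketch of what such a generator could look like, assumed from the call site above (fixed-size slices plus a tqdm progress bar fed by `desc`):

```python
from typing import Any, Iterator, List, Sequence

from tqdm import tqdm

def chunks(items: Sequence[Any], batch_size: int, desc: str = "") -> Iterator[List[Any]]:
    # Yield fixed-size slices of `items`, driving a tqdm progress bar.
    for i in tqdm(range(0, len(items), batch_size), desc=desc):
        yield list(items[i : i + batch_size])

for batch in chunks(list(range(10)), 4, desc="demo"):
    print(batch)  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]
```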
@@ -120,7 +120,6 @@ class OpenSearchDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add data in vector database.
@@ -130,17 +129,11 @@ class OpenSearchDB(BaseVectorDB):
             documents (List[str]): List of texts to add.
             metadatas (List[object]): List of metadata associated with docs.
             ids (List[str]): IDs of docs.
-            skip_embedding (bool): If True, then embeddings are assumed to be already generated.
         """
         for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
             batch_end = batch_start + self.BATCH_SIZE
             batch_documents = documents[batch_start:batch_end]
-
-            # Generate embeddings for the batch if not skipping embedding
-            if not skip_embedding:
-                batch_embeddings = self.embedder.embedding_fn(batch_documents)
-            else:
-                batch_embeddings = embeddings[batch_start:batch_end]
+            batch_embeddings = embeddings[batch_start:batch_end]
 
             # Create document entries for bulk upload
             batch_entries = [
@@ -166,7 +159,6 @@ class OpenSearchDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,15 +171,12 @@ class OpenSearchDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. If True, then the input_query is assumed to be already embedded.
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        # TODO(rupeshbansal, deshraj): Add support for skip embeddings here if already exists
        embeddings = OpenAIEmbeddings()
         docsearch = OpenSearchVectorSearch(
             index_name=self._get_index(),
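Note: the "Create document entries for bulk upload" step builds one dict per document from the aligned batch slices. A hedged sketch of a plausible entry shape; the field names (`"_index"`, `"text"`, `"embeddings"`, `"metadata"`) and the index name are assumptions for illustration, not confirmed by the diff:

```python
batch_ids = ["id-1", "id-2"]
batch_documents = ["alpha", "beta"]
batch_metadatas = [{"url": "a"}, {"url": "b"}]
batch_embeddings = [[0.1, 0.2], [0.3, 0.4]]

# One dict per document; field names here are guesses for illustration only.
batch_entries = [
    {
        "_index": "embedchain_store",  # placeholder index name
        "_id": doc_id,
        "text": text,
        "embeddings": embedding,
        "metadata": metadata,
    }
    for doc_id, text, metadata, embedding in zip(
        batch_ids, batch_documents, batch_metadatas, batch_embeddings
    )
]
print(len(batch_entries))  # -> 2
```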
@@ -92,7 +92,6 @@ class PineconeDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -124,7 +123,6 @@ class PineconeDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -136,18 +134,13 @@ class PineconeDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: Optional. if True, input_query is already embedded
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         data = self.client.query(vector=query_vector, filter=where, top_k=n_results, include_metadata=True, **kwargs)
         contexts = []
         for doc in data["matches"]:
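Note: `PineconeDB.query` now always embeds `input_query` before calling `self.client.query`. A runnable sketch of how the matches payload can be unpacked, assuming each match keeps the chunk text under `metadata["text"]` (an assumption; the diff only shows iteration over `data["matches"]`):

```python
# Stand-in response shaped like a Pinecone query result.
data = {
    "matches": [
        {"id": "doc-1", "score": 0.92, "metadata": {"text": "first chunk"}},
        {"id": "doc-2", "score": 0.87, "metadata": {"text": "second chunk"}},
    ]
}
contexts = [match["metadata"]["text"] for match in data["matches"]]
print(contexts)
```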
@@ -126,7 +126,6 @@ class QdrantDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -138,12 +137,8 @@ class QdrantDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         payloads = []
         qdrant_ids = []
@@ -167,7 +162,6 @@ class QdrantDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -179,20 +173,13 @@ class QdrantDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
-
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
 
         qdrant_must_filters = []
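Note: `QdrantDB.query` derives its filter from the keys of `where`. A sketch of one way to turn that dict into Qdrant "must" conditions using the qdrant_client models; the one-to-one key-to-payload mapping is an assumption based on the `qdrant_must_filters` name above:

```python
from qdrant_client.http import models

where = {"app_id": "demo"}  # placeholder filter
qdrant_must_filters = [
    models.FieldCondition(key=key, match=models.MatchValue(value=value))
    for key, value in (where or {}).items()
]
query_filter = models.Filter(must=qdrant_must_filters)
print(query_filter)
```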
@@ -157,7 +157,6 @@ class WeaviateDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """add data in vector database
@@ -169,14 +168,8 @@ class WeaviateDB(BaseVectorDB):
         :type metadatas: List[object]
         :param ids: ids of docs
         :type ids: List[str]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         """
-
-        print("Adding documents to Weaviate...")
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
         self.client.batch.configure(batch_size=self.BATCH_SIZE, timeout_retries=3)  # Configure batch
         with self.client.batch as batch:  # Initialize a batch process
             for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
@@ -202,7 +195,6 @@ class WeaviateDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -214,20 +206,13 @@ class WeaviateDB(BaseVectorDB):
         :type n_results: int
         :param where: Optional. to filter data
         :type where: Dict[str, any]
-        :param skip_embedding: A boolean flag indicating if the embedding for the documents to be added is to be
-        generated or not
-        :type skip_embedding: bool
         :param citations: we use citations boolean param to return context along with the answer.
         :type citations: bool, default is False.
         :return: The content of the document that matched your query,
         along with url of the source and doc_id (if citations flag is true)
         :rtype: List[str], if citations=False, otherwise List[Tuple[str, str, str]]
         """
-        if not skip_embedding:
-            query_vector = self.embedder.embedding_fn([input_query])[0]
-        else:
-            query_vector = input_query
-
+        query_vector = self.embedder.embedding_fn([input_query])[0]
         keys = set(where.keys() if where is not None else set())
         data_fields = ["text"]
 
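Note: `WeaviateDB.add` configures the client batch and streams one object per `(id, text, metadata, embedding)` tuple through it. A sketch of that batch pattern with the weaviate-client v3 API; the URL, class name `Document`, properties, and vector are placeholders, and a reachable Weaviate instance is required to actually run it:

```python
import weaviate

client = weaviate.Client("http://localhost:8080")  # placeholder URL
client.batch.configure(batch_size=100, timeout_retries=3)
with client.batch as batch:
    # One object per (text, metadata, vector) triple, mirroring the zip above.
    batch.add_data_object(
        data_object={"text": "hello world"},  # placeholder properties
        class_name="Document",                # placeholder class
        vector=[0.1, 0.2, 0.3],
    )
```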
@@ -112,12 +112,10 @@ class ZillizVectorDB(BaseVectorDB):
         documents: List[str],
         metadatas: List[object],
         ids: List[str],
-        skip_embedding: bool,
         **kwargs: Optional[Dict[str, any]],
     ):
         """Add to database"""
-        if not skip_embedding:
-            embeddings = self.embedder.embedding_fn(documents)
+        embeddings = self.embedder.embedding_fn(documents)
 
         for id, doc, metadata, embedding in zip(ids, documents, metadatas, embeddings):
             data = {**metadata, "id": id, "text": doc, "embeddings": embedding}
@@ -132,7 +130,6 @@ class ZillizVectorDB(BaseVectorDB):
         input_query: List[str],
         n_results: int,
         where: Dict[str, any],
-        skip_embedding: bool,
         citations: bool = False,
         **kwargs: Optional[Dict[str, Any]],
     ) -> Union[List[Tuple[str, Dict]], List[str]]:
@@ -160,27 +157,16 @@ class ZillizVectorDB(BaseVectorDB):
             where = None
 
         output_fields = ["*"]
-
-        if skip_embedding:
-            query_vector = input_query
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=query_vector,
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
-        else:
-            input_query_vector = self.embedder.embedding_fn([input_query])
-            query_vector = input_query_vector[0]
-
-            query_result = self.client.search(
-                collection_name=self.config.collection_name,
-                data=[query_vector],
-                limit=n_results,
-                output_fields=output_fields,
-                **kwargs,
-            )
+        input_query_vector = self.embedder.embedding_fn([input_query])
+        query_vector = input_query_vector[0]
+
+        query_result = self.client.search(
+            collection_name=self.config.collection_name,
+            data=[query_vector],
+            limit=n_results,
+            output_fields=output_fields,
+            **kwargs,
+        )
         query_result = query_result[0]
         contexts = []
         for query in query_result:
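Note: `ZillizVectorDB.query` now always embeds the query, wraps the vector in a list for `client.search`, and then unwraps `query_result[0]` since there is a single query vector. A sketch of that call shape with pymilvus's `MilvusClient` (Zilliz is hosted Milvus); the uri, token, collection name, and vector are placeholders, and a reachable Zilliz/Milvus instance is required:

```python
from pymilvus import MilvusClient

client = MilvusClient(uri="https://example.zillizcloud.com", token="***")  # placeholders
query_result = client.search(
    collection_name="embedchain_store",  # placeholder name
    data=[[0.1, 0.2, 0.3]],  # one query vector, hence the [query_vector] wrap
    limit=3,
    output_fields=["*"],
)
hits = query_result[0]  # results for the first (and only) query vector
print(hits)
```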