Add support for image dataset (#571)

Co-authored-by: Rupesh Bansal <rupeshbansal@Shankars-MacBook-Air.local>
This commit is contained in:
Rupesh Bansal
2023-10-04 09:50:40 +05:30
committed by GitHub
parent 55e9a1cbd6
commit d0af018b8d
19 changed files with 498 additions and 31 deletions

View File

@@ -1,5 +1,5 @@
import logging
from typing import Dict, List, Optional, Set
from typing import Any, Dict, List, Optional, Set
try:
from elasticsearch import Elasticsearch
@@ -100,9 +100,10 @@ class ElasticsearchDB(BaseVectorDB):
ids = [doc["_id"] for doc in docs]
return {"ids": set(ids)}
def add(self, documents: List[str], metadatas: List[object], ids: List[str]):
"""add data in vector database
def add(self, embeddings: List[List[float]], documents: List[str], metadatas: List[object],
ids: List[str], skip_embedding: bool) -> Any:
"""
add data in vector database
:param documents: list of texts to add
:type documents: List[str]
:param metadatas: list of metadata associated with docs
@@ -112,7 +113,9 @@ class ElasticsearchDB(BaseVectorDB):
"""
docs = []
embeddings = self.embedder.embedding_fn(documents)
if not skip_embedding:
embeddings = self.embedder.embedding_fn(documents)
for id, text, metadata, embeddings in zip(ids, documents, metadatas, embeddings):
docs.append(
{
@@ -124,7 +127,7 @@ class ElasticsearchDB(BaseVectorDB):
bulk(self.client, docs)
self.client.indices.refresh(index=self._get_index())
def query(self, input_query: List[str], n_results: int, where: Dict[str, any]) -> List[str]:
def query(self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool) -> List[str]:
"""
query contents from vector data base based on vector similarity
@@ -137,8 +140,12 @@ class ElasticsearchDB(BaseVectorDB):
:return: Database contents that are the result of the query
:rtype: List[str]
"""
input_query_vector = self.embedder.embedding_fn(input_query)
query_vector = input_query_vector[0]
if skip_embedding:
query_vector = input_query
else:
input_query_vector = self.embedder.embedding_fn(input_query)
query_vector = input_query_vector[0]
query = {
"script_score": {
"query": {"bool": {"must": [{"exists": {"field": "text"}}]}},