[BugFix] Fix issue of chunks not getting embedded in opensearch index (#983)

Deshraj Yadav
2023-11-29 21:56:05 -08:00
committed by GitHub
parent 406c46e7f4
commit b02e8feeda
3 changed files with 32 additions and 31 deletions
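
The bug, visible in the old `add` loop below: when `skip_embedding` was False, the method reassigned `embeddings` to the embeddings of just the current batch, then zipped over `embeddings[i : i + self.BATCH_SIZE]`, slicing that batch-sized list by the absolute offset `i`. Every batch after the first therefore zipped against an empty slice, so its chunks were indexed without embeddings (the inner loop variable, also named `embeddings`, shadowed the list as well). A minimal standalone sketch of the off-by-slice pattern, with illustrative names rather than embedchain's API:

# Standalone sketch of the bug pattern (illustrative names, not embedchain's API):
# a batch-sized list is sliced again by an absolute offset.
BATCH_SIZE = 2
documents = ["d0", "d1", "d2", "d3"]

def fake_embedding_fn(batch):
    # Stand-in for self.embedder.embedding_fn: one vector per document.
    return [[float(len(text))] for text in batch]

for i in range(0, len(documents), BATCH_SIZE):
    # Old code: `embeddings` is reassigned to a *batch-sized* list...
    embeddings = fake_embedding_fn(documents[i : i + BATCH_SIZE])
    # ...but then sliced again by the absolute offset `i`. For i=0 this
    # works; for i=2 it yields [] and the zip produces nothing, so the
    # second batch is indexed with no embeddings at all.
    pairs = list(zip(documents[i : i + BATCH_SIZE], embeddings[i : i + BATCH_SIZE]))
    print(i, pairs)
# Output: 0 [('d0', [2.0]), ('d1', [2.0])]
#         2 []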

View File

@@ -1,6 +1,6 @@
-import hashlib
 import logging
 import os
+import hashlib

 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader

View File

@@ -122,41 +122,42 @@ class OpenSearchDB(BaseVectorDB):
         ids: List[str],
         skip_embedding: bool,
     ):
-        """add data in vector database
-        :param embeddings: list of embeddings to add
-        :type embeddings: List[List[str]]
-        :param documents: list of texts to add
-        :type documents: List[str]
-        :param metadatas: list of metadata associated with docs
-        :type metadatas: List[object]
-        :param ids: ids of docs
-        :type ids: List[str]
-        :param skip_embedding: Optional. If True, then the embeddings are assumed to be already generated.
-        :type skip_embedding: bool
+        """Add data in vector database.
+
+        Args:
+            embeddings (List[List[str]]): List of embeddings to add.
+            documents (List[str]): List of texts to add.
+            metadatas (List[object]): List of metadata associated with docs.
+            ids (List[str]): IDs of docs.
+            skip_embedding (bool): If True, then embeddings are assumed to be already generated.
         """
-        for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
-            if not skip_embedding:
-                embeddings = self.embedder.embedding_fn(documents[i : i + self.BATCH_SIZE])
-            docs = []
-            for id, text, metadata, embeddings in zip(
-                ids[i : i + self.BATCH_SIZE],
-                documents[i : i + self.BATCH_SIZE],
-                metadatas[i : i + self.BATCH_SIZE],
-                embeddings[i : i + self.BATCH_SIZE],
-            ):
-                docs.append(
-                    {
-                        "_index": self._get_index(),
-                        "_id": id,
-                        "_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
-                    }
-                )
-            bulk(self.client, docs)
+        for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
+            batch_end = batch_start + self.BATCH_SIZE
+            batch_documents = documents[batch_start:batch_end]
+
+            # Generate embeddings for the batch if not skipping embedding
+            if not skip_embedding:
+                batch_embeddings = self.embedder.embedding_fn(batch_documents)
+            else:
+                batch_embeddings = embeddings[batch_start:batch_end]
+
+            # Create document entries for bulk upload
+            batch_entries = [
+                {
+                    "_index": self._get_index(),
+                    "_id": doc_id,
+                    "_source": {"text": text, "metadata": metadata, "embeddings": embedding},
+                }
+                for doc_id, text, metadata, embedding in zip(
+                    ids[batch_start:batch_end], batch_documents, metadatas[batch_start:batch_end], batch_embeddings
+                )
+            ]
+
+            # Perform bulk operation
+            bulk(self.client, batch_entries)
+
         self.client.indices.refresh(index=self._get_index())
-        # Sleep for 0.1 seconds to avoid rate limiting
+        # Sleep to avoid rate limiting
         time.sleep(0.1)

     def query(
@@ -250,7 +251,7 @@ class OpenSearchDB(BaseVectorDB):
         """
         # Delete all data from the database
         if self.client.indices.exists(index=self._get_index()):
-            # delete index in Es
+            # delete index in ES
             self.client.indices.delete(index=self._get_index())

     def delete(self, where):
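
For contrast with the sketch above, the fixed loop computes `batch_start`/`batch_end` once per iteration and slices documents, ids, metadatas, and any precomputed embeddings by the same bounds, so the zipped lists always line up. Again a standalone illustration, not the class itself:

# Standalone illustration of the fixed pattern: slice everything by the
# same [batch_start:batch_end] bounds so documents and embeddings always
# refer to the same batch.
BATCH_SIZE = 2
documents = ["d0", "d1", "d2", "d3"]
precomputed = [[0.0], [1.0], [2.0], [3.0]]  # e.g. when skip_embedding=True

for batch_start in range(0, len(documents), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch_documents = documents[batch_start:batch_end]
    batch_embeddings = precomputed[batch_start:batch_end]
    print(list(zip(batch_documents, batch_embeddings)))
# Output: [('d0', [0.0]), ('d1', [1.0])]
#         [('d2', [2.0]), ('d3', [3.0])]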

View File

@@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "embedchain" name = "embedchain"
version = "0.1.23" version = "0.1.24"
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data" description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
authors = [ authors = [
"Taranjeet Singh <taranjeet@embedchain.ai>", "Taranjeet Singh <taranjeet@embedchain.ai>",