[BugFix] Fix issue of chunks not getting embedded in opensearch index (#983)
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import hashlib
|
|
||||||
|
|
||||||
from embedchain.helpers.json_serializable import register_deserializable
|
from embedchain.helpers.json_serializable import register_deserializable
|
||||||
from embedchain.loaders.base_loader import BaseLoader
|
from embedchain.loaders.base_loader import BaseLoader
|
||||||
|
|||||||
@@ -122,41 +122,42 @@ class OpenSearchDB(BaseVectorDB):
|
|||||||
ids: List[str],
|
ids: List[str],
|
||||||
skip_embedding: bool,
|
skip_embedding: bool,
|
||||||
):
|
):
|
||||||
"""add data in vector database
|
"""Add data to the vector database.
|
||||||
|
|
||||||
:param embeddings: list of embeddings to add
|
Args:
|
||||||
:type embeddings: List[List[str]]
|
embeddings (List[List[str]]): List of embeddings to add.
|
||||||
:param documents: list of texts to add
|
documents (List[str]): List of texts to add.
|
||||||
:type documents: List[str]
|
metadatas (List[object]): List of metadata associated with docs.
|
||||||
:param metadatas: list of metadata associated with docs
|
ids (List[str]): IDs of docs.
|
||||||
:type metadatas: List[object]
|
skip_embedding (bool): If True, then embeddings are assumed to be already generated.
|
||||||
:param ids: ids of docs
|
|
||||||
:type ids: List[str]
|
|
||||||
:param skip_embedding: Optional. If True, then the embeddings are assumed to be already generated.
|
|
||||||
:type skip_embedding: bool
|
|
||||||
"""
|
"""
|
||||||
|
for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
|
||||||
|
batch_end = batch_start + self.BATCH_SIZE
|
||||||
|
batch_documents = documents[batch_start:batch_end]
|
||||||
|
|
||||||
for i in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
|
# Generate embeddings for the batch if not skipping embedding
|
||||||
if not skip_embedding:
|
if not skip_embedding:
|
||||||
embeddings = self.embedder.embedding_fn(documents[i : i + self.BATCH_SIZE])
|
batch_embeddings = self.embedder.embedding_fn(batch_documents)
|
||||||
|
else:
|
||||||
|
batch_embeddings = embeddings[batch_start:batch_end]
|
||||||
|
|
||||||
docs = []
|
# Create document entries for bulk upload
|
||||||
for id, text, metadata, embeddings in zip(
|
batch_entries = [
|
||||||
ids[i : i + self.BATCH_SIZE],
|
{
|
||||||
documents[i : i + self.BATCH_SIZE],
|
"_index": self._get_index(),
|
||||||
metadatas[i : i + self.BATCH_SIZE],
|
"_id": doc_id,
|
||||||
embeddings[i : i + self.BATCH_SIZE],
|
"_source": {"text": text, "metadata": metadata, "embeddings": embedding},
|
||||||
):
|
}
|
||||||
docs.append(
|
for doc_id, text, metadata, embedding in zip(
|
||||||
{
|
ids[batch_start:batch_end], batch_documents, metadatas[batch_start:batch_end], batch_embeddings
|
||||||
"_index": self._get_index(),
|
|
||||||
"_id": id,
|
|
||||||
"_source": {"text": text, "metadata": metadata, "embeddings": embeddings},
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
bulk(self.client, docs)
|
]
|
||||||
|
|
||||||
|
# Perform bulk operation
|
||||||
|
bulk(self.client, batch_entries)
|
||||||
self.client.indices.refresh(index=self._get_index())
|
self.client.indices.refresh(index=self._get_index())
|
||||||
# Sleep for 0.1 seconds to avoid rate limiting
|
|
||||||
|
# Sleep to avoid rate limiting
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
def query(
|
def query(
|
||||||
@@ -250,7 +251,7 @@ class OpenSearchDB(BaseVectorDB):
|
|||||||
"""
|
"""
|
||||||
# Delete all data from the database
|
# Delete all data from the database
|
||||||
if self.client.indices.exists(index=self._get_index()):
|
if self.client.indices.exists(index=self._get_index()):
|
||||||
# delete index in Es
|
# delete the index in OpenSearch
|
||||||
self.client.indices.delete(index=self._get_index())
|
self.client.indices.delete(index=self._get_index())
|
||||||
|
|
||||||
def delete(self, where):
|
def delete(self, where):
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "embedchain"
|
name = "embedchain"
|
||||||
version = "0.1.23"
|
version = "0.1.24"
|
||||||
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
||||||
authors = [
|
authors = [
|
||||||
"Taranjeet Singh <taranjeet@embedchain.ai>",
|
"Taranjeet Singh <taranjeet@embedchain.ai>",
|
||||||
|
|||||||
Reference in New Issue
Block a user