[Bug fix] Fix embedding issue for opensearch and some other vector databases (#1163)
This commit is contained in:
@@ -27,7 +27,7 @@ class BaseChunker(JSONSerializable):
|
||||
chunk_ids = []
|
||||
id_map = {}
|
||||
min_chunk_size = config.min_chunk_size if config is not None else 1
|
||||
logging.info(f"[INFO] Skipping chunks smaller than {min_chunk_size} characters")
|
||||
logging.info(f"Skipping chunks smaller than {min_chunk_size} characters")
|
||||
data_result = loader.load_data(src)
|
||||
data_records = data_result["data"]
|
||||
doc_id = data_result["doc_id"]
|
||||
|
||||
@@ -369,7 +369,7 @@ class EmbedChain(JSONSerializable):
|
||||
metadatas = embeddings_data["metadatas"]
|
||||
ids = embeddings_data["ids"]
|
||||
new_doc_id = embeddings_data["doc_id"]
|
||||
embeddings = embeddings_data.get("embeddings")
|
||||
|
||||
if existing_doc_id and existing_doc_id == new_doc_id:
|
||||
print("Doc content has not changed. Skipping creating chunks and embeddings")
|
||||
return [], [], [], 0
|
||||
@@ -433,13 +433,7 @@ class EmbedChain(JSONSerializable):
|
||||
# Count before, to calculate a delta in the end.
|
||||
chunks_before_addition = self.db.count()
|
||||
|
||||
self.db.add(
|
||||
embeddings=embeddings,
|
||||
documents=documents,
|
||||
metadatas=metadatas,
|
||||
ids=ids,
|
||||
**kwargs,
|
||||
)
|
||||
self.db.add(documents=documents, metadatas=metadatas, ids=ids, **kwargs)
|
||||
count_new_chunks = self.db.count() - chunks_before_addition
|
||||
|
||||
print(f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}")
|
||||
|
||||
@@ -129,17 +129,13 @@ class ChromaDB(BaseVectorDB):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
**kwargs: Optional[dict[str, Any]],
|
||||
) -> Any:
|
||||
"""
|
||||
Add vectors to chroma database
|
||||
|
||||
:param embeddings: list of embeddings to add
|
||||
:type embeddings: list[list[str]]
|
||||
:param documents: Documents
|
||||
:type documents: list[str]
|
||||
:param metadatas: Metadatas
|
||||
|
||||
@@ -110,7 +110,6 @@ class ElasticsearchDB(BaseVectorDB):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
@@ -118,8 +117,6 @@ class ElasticsearchDB(BaseVectorDB):
|
||||
) -> Any:
|
||||
"""
|
||||
add data in vector database
|
||||
:param embeddings: list of embeddings to add
|
||||
:type embeddings: list[list[str]]
|
||||
:param documents: list of texts to add
|
||||
:type documents: list[str]
|
||||
:param metadatas: list of metadata associated with docs
|
||||
|
||||
@@ -114,22 +114,10 @@ class OpenSearchDB(BaseVectorDB):
|
||||
result["metadatas"].append({"doc_id": doc_id})
|
||||
return result
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[str]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
**kwargs: Optional[dict[str, any]],
|
||||
):
|
||||
"""Add data in vector database.
|
||||
def add(self, documents: list[str], metadatas: list[object], ids: list[str], **kwargs: Optional[dict[str, any]]):
|
||||
"""Adds documents to the opensearch index"""
|
||||
|
||||
Args:
|
||||
embeddings (list[list[str]]): list of embeddings to add.
|
||||
documents (list[str]): list of texts to add.
|
||||
metadatas (list[object]): list of metadata associated with docs.
|
||||
ids (list[str]): IDs of docs.
|
||||
"""
|
||||
embeddings = self.embedder.embedding_fn(documents)
|
||||
for batch_start in tqdm(range(0, len(documents), self.BATCH_SIZE), desc="Inserting batches in opensearch"):
|
||||
batch_end = batch_start + self.BATCH_SIZE
|
||||
batch_documents = documents[batch_start:batch_end]
|
||||
|
||||
@@ -88,7 +88,6 @@ class PineconeDB(BaseVectorDB):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
|
||||
@@ -122,15 +122,12 @@ class QdrantDB(BaseVectorDB):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
**kwargs: Optional[dict[str, any]],
|
||||
):
|
||||
"""add data in vector database
|
||||
:param embeddings: list of embeddings for the corresponding documents to be added
|
||||
:type documents: list[list[float]]
|
||||
:param documents: list of texts to add
|
||||
:type documents: list[str]
|
||||
:param metadatas: list of metadata associated with docs
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import copy
|
||||
import os
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Optional, Union
|
||||
|
||||
try:
|
||||
import weaviate
|
||||
@@ -151,17 +151,8 @@ class WeaviateDB(BaseVectorDB):
|
||||
|
||||
return {"ids": existing_ids}
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
**kwargs: Optional[dict[str, any]],
|
||||
):
|
||||
def add(self, documents: list[str], metadatas: list[object], ids: list[str], **kwargs: Optional[dict[str, any]]):
|
||||
"""add data in vector database
|
||||
:param embeddings: list of embeddings for the corresponding documents to be added
|
||||
:type documents: list[list[float]]
|
||||
:param documents: list of texts to add
|
||||
:type documents: list[str]
|
||||
:param metadatas: list of metadata associated with docs
|
||||
@@ -191,12 +182,7 @@ class WeaviateDB(BaseVectorDB):
|
||||
)
|
||||
|
||||
def query(
|
||||
self,
|
||||
input_query: list[str],
|
||||
n_results: int,
|
||||
where: dict[str, any],
|
||||
citations: bool = False,
|
||||
**kwargs: Optional[dict[str, Any]],
|
||||
self, input_query: list[str], n_results: int, where: dict[str, any], citations: bool = False
|
||||
) -> Union[list[tuple[str, dict]], list[str]]:
|
||||
"""
|
||||
query contents from vector database based on vector similarity
|
||||
|
||||
@@ -108,7 +108,6 @@ class ZillizVectorDB(BaseVectorDB):
|
||||
|
||||
def add(
|
||||
self,
|
||||
embeddings: list[list[float]],
|
||||
documents: list[str],
|
||||
metadatas: list[object],
|
||||
ids: list[str],
|
||||
|
||||
Reference in New Issue
Block a user