From ef69c91b602a5bfc11cebcbf690545b2020fc26d Mon Sep 17 00:00:00 2001 From: berwin joule Date: Thu, 14 Mar 2024 02:01:46 +0800 Subject: [PATCH] [Bug fix]: fix Cannot add documents to chromadb with inconsistent sizes. (#1314) --- embedchain/embedchain.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 3e7331f5..386ccdb6 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -378,11 +378,13 @@ class EmbedChain(JSONSerializable): # Chunk documents into batches of 2048 and handle each batch # helps wigth large loads of embeddings that hit OpenAI limits document_batches = [documents[i : i + 2048] for i in range(0, len(documents), 2048)] - for batch in document_batches: + metadata_batches = [metadatas[i : i + 2048] for i in range(0, len(metadatas), 2048)] + id_batches = [ids[i : i + 2048] for i in range(0, len(ids), 2048)] + for batch_docs, batch_meta, batch_ids in zip(document_batches, metadata_batches, id_batches): try: # Add only valid batches - if batch: - self.db.add(documents=batch, metadatas=metadatas, ids=ids, **kwargs) + if batch_docs: + self.db.add(documents=batch_docs, metadatas=batch_meta, ids=batch_ids, **kwargs) except Exception as e: print(f"Failed to add batch due to a bad request: {e}") # Handle the error, e.g., by logging, retrying, or skipping