From e40f1e50618aedce59af811ab6d19bb85a1ce492 Mon Sep 17 00:00:00 2001 From: Taranjeet Singh Date: Fri, 23 Jun 2023 17:08:46 +0530 Subject: [PATCH] Skip creating embedding if it already exists This commit adds support for skipping embedding generation for chunks that are already present in the vector db. --- embedchain/embedchain.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index d6cd2f5f..d68c7394 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -97,6 +97,24 @@ class EmbedChain: documents = embeddings_data["documents"] metadatas = embeddings_data["metadatas"] ids = embeddings_data["ids"] + # get existing ids, and discard doc if any common id exist. + existing_docs = self.collection.get( + ids=ids, + # where={"url": url} + ) + existing_ids = set(existing_docs["ids"]) + + if len(existing_ids): + data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)} + data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids} + + if not data_dict: + print(f"All data from {url} already exists in the database.") + return + + ids = list(data_dict.keys()) + documents, metadatas = zip(*data_dict.values()) + self.collection.add( documents=documents, metadatas=metadatas,