Skip creating embedding if it already exists

This commit adds support for skipping embedding generation
for chunks that are already present in the vector database.
This commit is contained in:
Taranjeet Singh
2023-06-23 17:08:46 +05:30
parent 21527e417a
commit e40f1e5061

View File

@@ -97,6 +97,24 @@ class EmbedChain:
documents = embeddings_data["documents"]
metadatas = embeddings_data["metadatas"]
ids = embeddings_data["ids"]
# Fetch the ids that already exist in the collection and discard any docs whose id is among them.
existing_docs = self.collection.get(
ids=ids,
# where={"url": url}
)
existing_ids = set(existing_docs["ids"])
if len(existing_ids):
data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)}
data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
if not data_dict:
print(f"All data from {url} already exists in the database.")
return
ids = list(data_dict.keys())
documents, metadatas = zip(*data_dict.values())
self.collection.add(
documents=documents,
metadatas=metadatas,