bug: Prevent clashing chunk IDs (#160)

This commit inserts a repeated chunk only once,
preventing the Chroma duplicate ID error.
This commit is contained in:
Hao (Harin) Wu
2023-07-07 21:59:47 -07:00
committed by GitHub
parent ae1e21833c
commit 996211e23e

View File

@@ -14,6 +14,7 @@ class BaseChunker:
""" """
documents = [] documents = []
ids = [] ids = []
idMap = {}
datas = loader.load_data(src) datas = loader.load_data(src)
metadatas = [] metadatas = []
for data in datas: for data in datas:
@@ -25,9 +26,11 @@ class BaseChunker:
for chunk in chunks: for chunk in chunks:
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
ids.append(chunk_id) if (idMap.get(chunk_id) is None):
documents.append(chunk) idMap[chunk_id] = True
metadatas.append(meta_data) ids.append(chunk_id)
documents.append(chunk)
metadatas.append(meta_data)
return { return {
"documents": documents, "documents": documents,
"ids": ids, "ids": ids,