bug: Prevent clashing chunk IDs (#160)

This commit inserts each repeated chunk only once,
preventing the Chroma duplicate ID error.
This commit is contained in:
Hao (Harin) Wu
2023-07-07 21:59:47 -07:00
committed by GitHub
parent ae1e21833c
commit 996211e23e

View File

@@ -14,6 +14,7 @@ class BaseChunker:
"""
documents = []
ids = []
idMap = {}
datas = loader.load_data(src)
metadatas = []
for data in datas:
@@ -25,9 +26,11 @@ class BaseChunker:
for chunk in chunks:
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
ids.append(chunk_id)
documents.append(chunk)
metadatas.append(meta_data)
if (idMap.get(chunk_id) is None):
idMap[chunk_id] = True
ids.append(chunk_id)
documents.append(chunk)
metadatas.append(meta_data)
return {
"documents": documents,
"ids": ids,