bug: Prevent clashing chunk IDs (#160)
This commit inserts a repeated chunk only once, preventing the Chroma duplicate ID error.
This commit is contained in:
@@ -14,6 +14,7 @@ class BaseChunker:
|
|||||||
"""
|
"""
|
||||||
documents = []
|
documents = []
|
||||||
ids = []
|
ids = []
|
||||||
|
idMap = {}
|
||||||
datas = loader.load_data(src)
|
datas = loader.load_data(src)
|
||||||
metadatas = []
|
metadatas = []
|
||||||
for data in datas:
|
for data in datas:
|
||||||
@@ -25,9 +26,11 @@ class BaseChunker:
|
|||||||
|
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
|
||||||
ids.append(chunk_id)
|
if (idMap.get(chunk_id) is None):
|
||||||
documents.append(chunk)
|
idMap[chunk_id] = True
|
||||||
metadatas.append(meta_data)
|
ids.append(chunk_id)
|
||||||
|
documents.append(chunk)
|
||||||
|
metadatas.append(meta_data)
|
||||||
return {
|
return {
|
||||||
"documents": documents,
|
"documents": documents,
|
||||||
"ids": ids,
|
"ids": ids,
|
||||||
|
|||||||
Reference in New Issue
Block a user