diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py index 36e58064..f7b5cb09 100644 --- a/embedchain/chunkers/base_chunker.py +++ b/embedchain/chunkers/base_chunker.py @@ -44,6 +44,7 @@ class BaseChunker(JSONSerializable): for chunk in chunks: chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest() + chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id if idMap.get(chunk_id) is None: idMap[chunk_id] = True chunk_ids.append(chunk_id) diff --git a/tests/chunkers/test_base_chunker.py b/tests/chunkers/test_base_chunker.py index 6f89cd03..343653ee 100644 --- a/tests/chunkers/test_base_chunker.py +++ b/tests/chunkers/test_base_chunker.py @@ -44,8 +44,8 @@ def test_create_chunks(chunker, text_splitter_mock, loader_mock, app_id, data_ty result = chunker.create_chunks(loader_mock, "test_src", app_id) expected_ids = [ - hashlib.sha256(("Chunk 1" + "URL 1").encode()).hexdigest(), - hashlib.sha256(("Chunk 2" + "URL 1").encode()).hexdigest(), + f"{app_id}--" + hashlib.sha256(("Chunk 1" + "URL 1").encode()).hexdigest(), + f"{app_id}--" + hashlib.sha256(("Chunk 2" + "URL 1").encode()).hexdigest(), ] assert result["documents"] == ["Chunk 1", "Chunk 2"]