Clean JSON data before loading (#895)

Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
Deven Patel
2023-11-01 21:52:34 -07:00
committed by GitHub
parent 930280f4ce
commit df314dc6d1

View File

@@ -6,6 +6,7 @@ import re
import requests
from embedchain.loaders.base_loader import BaseLoader
from embedchain.utils import clean_string
# Pattern for an https:// URL whose path ends in ".json".
# Kept as a raw string so that `\/` and `\.` reach the regex engine verbatim
# instead of being parsed as (invalid) string escapes, which newer Python
# versions warn about. The pattern text itself is unchanged.
# NOTE(review): `A-z` also covers the ASCII punctuation between "Z" and "a",
# and the bare `.` separators match any character - confirm that this
# looseness is intended before tightening it.
VALID_URL_PATTERN = r"^https:\/\/[0-9A-z.]+.[0-9A-z.]+.[a-z]+\/.*\.json$"
@@ -49,7 +50,7 @@ class JSONLoader(BaseLoader):
docs = loader.load_data(json_data)
for doc in docs:
doc_content = doc.text
doc_content = clean_string(doc.text)
data.append({"content": doc_content, "meta_data": {"url": content}})
data_content.append(doc_content)
doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()