From df314dc6d118bcc8bb01c8e3b3ac335cbfb986c9 Mon Sep 17 00:00:00 2001
From: Deven Patel
Date: Wed, 1 Nov 2023 21:52:34 -0700
Subject: [PATCH] Clean json data before loading (#895)

Co-authored-by: Deven Patel
---
 embedchain/loaders/json.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/embedchain/loaders/json.py b/embedchain/loaders/json.py
index 23d094b6..a76d0a0c 100644
--- a/embedchain/loaders/json.py
+++ b/embedchain/loaders/json.py
@@ -6,6 +6,7 @@ import re
 import requests
 
 from embedchain.loaders.base_loader import BaseLoader
+from embedchain.utils import clean_string
 
 VALID_URL_PATTERN = "^https:\/\/[0-9A-z.]+.[0-9A-z.]+.[a-z]+\/.*\.json$"
 
@@ -49,7 +50,7 @@ class JSONLoader(BaseLoader):
 
         docs = loader.load_data(json_data)
         for doc in docs:
-            doc_content = doc.text
+            doc_content = clean_string(doc.text)
             data.append({"content": doc_content, "meta_data": {"url": content}})
             data_content.append(doc_content)
         doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest()