From 68dc274f72427abec53177c9c1984528f2a4fc2f Mon Sep 17 00:00:00 2001 From: Deven Patel Date: Mon, 30 Oct 2023 15:30:49 -0700 Subject: [PATCH] Embedchain json loader update (#876) Co-authored-by: Deven Patel --- embedchain/data_formatter/data_formatter.py | 1 + embedchain/loaders/json.py | 35 ++++++++++----- tests/loaders/test_json.py | 50 +++++++++++---------- tests/telemetry/test_posthog.py | 3 +- 4 files changed, 53 insertions(+), 36 deletions(-) diff --git a/embedchain/data_formatter/data_formatter.py b/embedchain/data_formatter/data_formatter.py index c3486c62..37d475e5 100644 --- a/embedchain/data_formatter/data_formatter.py +++ b/embedchain/data_formatter/data_formatter.py @@ -1,4 +1,5 @@ from importlib import import_module + from embedchain.chunkers.base_chunker import BaseChunker from embedchain.config import AddConfig from embedchain.config.add_config import ChunkerConfig, LoaderConfig diff --git a/embedchain/loaders/json.py b/embedchain/loaders/json.py index b32c34f3..2fd1cf52 100644 --- a/embedchain/loaders/json.py +++ b/embedchain/loaders/json.py @@ -1,24 +1,37 @@ import hashlib - -from langchain.document_loaders.json_loader import \ - JSONLoader as LangchainJSONLoader +import json +import os from embedchain.loaders.base_loader import BaseLoader -langchain_json_jq_schema = 'to_entries | map("\(.key): \(.value|tostring)") | .[]' - class JSONLoader(BaseLoader): @staticmethod def load_data(content): """Load a json file. Each data point is a key value pair.""" + try: + from llama_hub.jsondata.base import \ + JSONDataReader as LLHBUBJSONLoader + except ImportError: + raise Exception( + f"Couldn't import the required packages to load {content}, \ + Do `pip install --upgrade 'embedchain[json]`" + ) + + loader = LLHBUBJSONLoader() + + if not isinstance(content, str) and not os.path.isfile(content): + print(f"Invaid content input. Provide the correct path to the json file saved locally in {content}") + data = [] data_content = [] - loader = LangchainJSONLoader(content, text_content=False, jq_schema=langchain_json_jq_schema) - docs = loader.load() - for doc in docs: - meta_data = doc.metadata - data.append({"content": doc.page_content, "meta_data": {"url": content, "row": meta_data["seq_num"]}}) - data_content.append(doc.page_content) + + with open(content, "r") as json_file: + json_data = json.load(json_file) + docs = loader.load_data(json_data) + for doc in docs: + doc_content = doc.text + data.append({"content": doc_content, "meta_data": {"url": content}}) + data_content.append(doc_content) doc_id = hashlib.sha256((content + ", ".join(data_content)).encode()).hexdigest() return {"doc_id": doc_id, "data": data} diff --git a/tests/loaders/test_json.py b/tests/loaders/test_json.py index 4ffe925b..f918f07a 100644 --- a/tests/loaders/test_json.py +++ b/tests/loaders/test_json.py @@ -1,32 +1,34 @@ import hashlib -from unittest.mock import patch - -from langchain.docstore.document import Document -from langchain.document_loaders.json_loader import \ - JSONLoader as LangchainJSONLoader from embedchain.loaders.json import JSONLoader -def test_load_data(): - mock_document = [ - Document(page_content="content1", metadata={"seq_num": 1}), - Document(page_content="content2", metadata={"seq_num": 2}), +def test_load_data(mocker): + content = "temp.json" + + mock_document = { + "doc_id": hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest(), + "data": [ + {"content": "content1", "meta_data": {"url": content}}, + {"content": "content2", "meta_data": {"url": content}}, + ], + } + + mocker.patch("embedchain.loaders.json.JSONLoader.load_data", return_value=mock_document) + + json_loader = JSONLoader() + + result = json_loader.load_data(content) + + assert "doc_id" in result + assert "data" in result + + expected_data = [ + {"content": "content1", "meta_data": {"url": content}}, + {"content": "content2", "meta_data": {"url": content}}, ] - with patch.object(LangchainJSONLoader, "load", return_value=mock_document): - content = "temp.json" - result = JSONLoader.load_data(content) + assert result["data"] == expected_data - assert "doc_id" in result - assert "data" in result - - expected_data = [ - {"content": "content1", "meta_data": {"url": content, "row": 1}}, - {"content": "content2", "meta_data": {"url": content, "row": 2}}, - ] - - assert result["data"] == expected_data - - expected_doc_id = hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest() - assert result["doc_id"] == expected_doc_id + expected_doc_id = hashlib.sha256((content + ", ".join(["content1", "content2"])).encode()).hexdigest() + assert result["doc_id"] == expected_doc_id diff --git a/tests/telemetry/test_posthog.py b/tests/telemetry/test_posthog.py index c85af370..4d860ef5 100644 --- a/tests/telemetry/test_posthog.py +++ b/tests/telemetry/test_posthog.py @@ -1,5 +1,6 @@ -import os import logging +import os + from embedchain.telemetry.posthog import AnonymousTelemetry