[Feature Improvement] Update JSON Loader to support loading data from more sources (#898)

Co-authored-by: Deven Patel <deven298@yahoo.com>
2023-11-03 10:00:27 -07:00
parent e2546a653d
commit 53037b5ed8
6 changed files with 166 additions and 67 deletions
--- a/tests/embedchain/test_embedchain.py
+++ b/tests/embedchain/test_embedchain.py
@@ -1,61 +1,65 @@
 import os
-import unittest
-from unittest.mock import patch
+
+import pytest
+from chromadb.api.models.Collection import Collection

 from embedchain import App
 from embedchain.config import AppConfig, ChromaDbConfig
+from embedchain.embedchain import EmbedChain
+from embedchain.llm.base import BaseLlm
+
+os.environ["OPENAI_API_KEY"] = "test-api-key"


-class TestChromaDbHostsLoglevel(unittest.TestCase):
-    os.environ["OPENAI_API_KEY"] = "test_key"
+@pytest.fixture
+def app_instance():
+    """Build an App configured with DEBUG logging and metrics collection turned off."""
+    return App(AppConfig(log_level="DEBUG", collect_metrics=False))

-    @patch("chromadb.api.models.Collection.Collection.add")
-    @patch("embedchain.embedchain.EmbedChain.retrieve_from_database")
-    @patch("embedchain.llm.base.BaseLlm.get_answer_from_llm")
-    @patch("embedchain.llm.base.BaseLlm.get_llm_model_answer")
-    def test_whole_app(
-        self,
-        _mock_add,
-        _mock_ec_retrieve_from_database,
-        _mock_get_answer_from_llm,
-        mock_ec_get_llm_model_answer,
-    ):
-        """
-        Test if the `App` instance is initialized without a config that does not contain default hosts and ports.
-        """
-        config = AppConfig(log_level="DEBUG", collect_metrics=False)

-        app = App(config)
+def test_whole_app(app_instance, mocker):
+    knowledge = "lorem ipsum dolor sit amet, consectetur adipiscing"

-        knowledge = "lorem ipsum dolor sit amet, consectetur adipiscing"
+    mocker.patch.object(EmbedChain, "add")
+    mocker.patch.object(EmbedChain, "retrieve_from_database")
+    mocker.patch.object(BaseLlm, "get_answer_from_llm", return_value=knowledge)
+    mocker.patch.object(BaseLlm, "get_llm_model_answer", return_value=knowledge)
+    mocker.patch.object(BaseLlm, "generate_prompt")

-        app.add(knowledge, data_type="text")
+    app_instance.add(knowledge, data_type="text")
+    app_instance.query("What text did I give you?")
+    app_instance.chat("What text did I give you?")

-        app.query("What text did I give you?")
-        app.chat("What text did I give you?")
+    assert BaseLlm.generate_prompt.call_count == 2
+    app_instance.reset()

-        self.assertEqual(mock_ec_get_llm_model_answer.call_args[1]["documents"], [knowledge])

-    def test_add_after_reset(self):
-        """
-        Test if the `App` instance is correctly reconstructed after a reset.
-        """
-        config = AppConfig(log_level="DEBUG", collect_metrics=False)
-        chroma_config = {"allow_reset": True}
-        app = App(config=config, db_config=ChromaDbConfig(**chroma_config))
-        app.reset()
+def test_add_after_reset(app_instance, mocker):
+    config = AppConfig(log_level="DEBUG", collect_metrics=False)
+    chroma_config = {"allow_reset": True}

-        # Make sure the client is still healthy
-        app.db.client.heartbeat()
-        # Make sure the collection exists, and can be added to
-        app.db.collection.add(
-            embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
-            metadatas=[
-                {"chapter": "3", "verse": "16"},
-                {"chapter": "3", "verse": "5"},
-                {"chapter": "29", "verse": "11"},
-            ],
-            ids=["id1", "id2", "id3"],
-        )
+    app_instance = App(config=config, db_config=ChromaDbConfig(**chroma_config))
+    app_instance.reset()

-        app.reset()
+    app_instance.db.client.heartbeat()
+
+    mocker.patch.object(Collection, "add")
+
+    app_instance.db.collection.add(
+        embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
+        metadatas=[
+            {"chapter": "3", "verse": "16"},
+            {"chapter": "3", "verse": "5"},
+            {"chapter": "29", "verse": "11"},
+        ],
+        ids=["id1", "id2", "id3"],
+    )
+
+    app_instance.reset()
+
+
+def test_add_with_incorrect_content(app_instance, mocker):
+    """Adding a list of dicts with data_type="json" is expected to raise ValueError."""
+    invalid_payload = [{"foo": "bar"}]
+    with pytest.raises(ValueError):
+        app_instance.add(invalid_payload, data_type="json")
--- a/tests/loaders/test_json.py
+++ b/tests/loaders/test_json.py
@@ -40,7 +40,7 @@ def test_load_data(mocker):
 def test_load_data_url(mocker):
    content = "https://example.com/posts.json"

-    mocker.patch("os.path.isfile", return_value=False)  # Mocking os.path.isfile to simulate a URL case
+    mocker.patch("os.path.isfile", return_value=False)
    mocker.patch(
        "llama_hub.jsondata.base.JSONDataReader.load_data",
        return_value=[Document(text="content1"), Document(text="content2")],
@@ -68,11 +68,11 @@ def test_load_data_url(mocker):
    assert result["doc_id"] == expected_doc_id


-def test_load_data_invalid_content(mocker):
+def test_load_data_invalid_string_content(mocker):
    mocker.patch("os.path.isfile", return_value=False)
    mocker.patch("requests.get")

-    content = "123"
+    content = "123: 345}"

    with pytest.raises(ValueError, match="Invalid content to load json data from"):
        JSONLoader.load_data(content)
@@ -89,3 +89,30 @@ def test_load_data_invalid_url(mocker):

    with pytest.raises(ValueError, match=f"Invalid content to load json data from: {content}"):
        JSONLoader.load_data(content)
+
+
+def test_load_data_from_json_string(mocker):
+    """Loading a raw JSON string should tag each document with the string's SHA-256 hex digest."""
+    content = '{"foo": "bar"}'
+    texts = ["content1", "content2"]
+
+    # The expected "url" metadata is the hex digest of the raw string itself.
+    content_url_str = hashlib.sha256(content.encode("utf-8")).hexdigest()
+
+    # Report "not a file on disk" and stub the JSON reader to return two documents.
+    mocker.patch("os.path.isfile", return_value=False)
+    mocker.patch(
+        "llama_hub.jsondata.base.JSONDataReader.load_data",
+        return_value=[Document(text=text) for text in texts],
+    )
+
+    result = JSONLoader.load_data(content)
+
+    assert "doc_id" in result
+    assert "data" in result
+
+    expected_data = [{"content": text, "meta_data": {"url": content_url_str}} for text in texts]
+    assert result["data"] == expected_data
+
+    expected_doc_id = hashlib.sha256((content_url_str + ", ".join(texts)).encode()).hexdigest()
+    assert result["doc_id"] == expected_doc_id