[Feature Improvement] Update JSON Loader to support loading data from more sources (#898)

Co-authored-by: Deven Patel <deven298@yahoo.com>
This commit is contained in:
Deven Patel
2023-11-03 10:00:27 -07:00
committed by GitHub
parent e2546a653d
commit 53037b5ed8
6 changed files with 166 additions and 67 deletions

View File

@@ -1,61 +1,65 @@
import os
import unittest
from unittest.mock import patch
import pytest
from chromadb.api.models.Collection import Collection
from embedchain import App
from embedchain.config import AppConfig, ChromaDbConfig
from embedchain.embedchain import EmbedChain
from embedchain.llm.base import BaseLlm
os.environ["OPENAI_API_KEY"] = "test-api-key"
class TestChromaDbHostsLoglevel(unittest.TestCase):
os.environ["OPENAI_API_KEY"] = "test_key"
@pytest.fixture
def app_instance():
    """Return an `App` built from an `AppConfig` with DEBUG logging and metrics collection off."""
    return App(AppConfig(log_level="DEBUG", collect_metrics=False))
@patch("chromadb.api.models.Collection.Collection.add")
@patch("embedchain.embedchain.EmbedChain.retrieve_from_database")
@patch("embedchain.llm.base.BaseLlm.get_answer_from_llm")
@patch("embedchain.llm.base.BaseLlm.get_llm_model_answer")
def test_whole_app(
self,
_mock_add,
_mock_ec_retrieve_from_database,
_mock_get_answer_from_llm,
mock_ec_get_llm_model_answer,
):
"""
Test that an `App` instance can add text, then query and chat, with the database and LLM calls patched out.
"""
config = AppConfig(log_level="DEBUG", collect_metrics=False)
app = App(config)
def test_whole_app(app_instance, mocker):
knowledge = "lorem ipsum dolor sit amet, consectetur adipiscing"
knowledge = "lorem ipsum dolor sit amet, consectetur adipiscing"
mocker.patch.object(EmbedChain, "add")
mocker.patch.object(EmbedChain, "retrieve_from_database")
mocker.patch.object(BaseLlm, "get_answer_from_llm", return_value=knowledge)
mocker.patch.object(BaseLlm, "get_llm_model_answer", return_value=knowledge)
mocker.patch.object(BaseLlm, "generate_prompt")
app.add(knowledge, data_type="text")
app_instance.add(knowledge, data_type="text")
app_instance.query("What text did I give you?")
app_instance.chat("What text did I give you?")
app.query("What text did I give you?")
app.chat("What text did I give you?")
assert BaseLlm.generate_prompt.call_count == 2
app_instance.reset()
self.assertEqual(mock_ec_get_llm_model_answer.call_args[1]["documents"], [knowledge])
def test_add_after_reset(self):
"""
Test if the `App` instance is correctly reconstructed after a reset.
"""
config = AppConfig(log_level="DEBUG", collect_metrics=False)
chroma_config = {"allow_reset": True}
app = App(config=config, db_config=ChromaDbConfig(**chroma_config))
app.reset()
def test_add_after_reset(app_instance, mocker):
config = AppConfig(log_level="DEBUG", collect_metrics=False)
chroma_config = {"allow_reset": True}
# Make sure the client is still healthy
app.db.client.heartbeat()
# Make sure the collection exists, and can be added to
app.db.collection.add(
embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
metadatas=[
{"chapter": "3", "verse": "16"},
{"chapter": "3", "verse": "5"},
{"chapter": "29", "verse": "11"},
],
ids=["id1", "id2", "id3"],
)
app_instance = App(config=config, db_config=ChromaDbConfig(**chroma_config))
app_instance.reset()
app.reset()
app_instance.db.client.heartbeat()
mocker.patch.object(Collection, "add")
app_instance.db.collection.add(
embeddings=[[1.1, 2.3, 3.2], [4.5, 6.9, 4.4], [1.1, 2.3, 3.2]],
metadatas=[
{"chapter": "3", "verse": "16"},
{"chapter": "3", "verse": "5"},
{"chapter": "29", "verse": "11"},
],
ids=["id1", "id2", "id3"],
)
app_instance.reset()
def test_add_with_incorrect_content(app_instance, mocker):
    """Expect `ValueError` when a list of dicts is added with `data_type="json"`."""
    bad_payload = [{"foo": "bar"}]

    with pytest.raises(ValueError):
        app_instance.add(bad_payload, data_type="json")

View File

@@ -40,7 +40,7 @@ def test_load_data(mocker):
def test_load_data_url(mocker):
content = "https://example.com/posts.json"
mocker.patch("os.path.isfile", return_value=False) # Mocking os.path.isfile to simulate a URL case
mocker.patch("os.path.isfile", return_value=False)
mocker.patch(
"llama_hub.jsondata.base.JSONDataReader.load_data",
return_value=[Document(text="content1"), Document(text="content2")],
@@ -68,11 +68,11 @@ def test_load_data_url(mocker):
assert result["doc_id"] == expected_doc_id
def test_load_data_invalid_content(mocker):
def test_load_data_invalid_string_content(mocker):
mocker.patch("os.path.isfile", return_value=False)
mocker.patch("requests.get")
content = "123"
content = "123: 345}"
with pytest.raises(ValueError, match="Invalid content to load json data from"):
JSONLoader.load_data(content)
@@ -89,3 +89,30 @@ def test_load_data_invalid_url(mocker):
with pytest.raises(ValueError, match=f"Invalid content to load json data from: {content}"):
JSONLoader.load_data(content)
def test_load_data_from_json_string(mocker):
    """Expect a raw JSON string to load, with each entry's `url` being the SHA-256 hex digest of that string."""
    content = '{"foo": "bar"}'
    content_digest = hashlib.sha256(content.encode("utf-8")).hexdigest()
    texts = ["content1", "content2"]

    # Make os.path.isfile report False for the content string.
    mocker.patch("os.path.isfile", return_value=False)
    # Stub the underlying reader so it yields one Document per expected text.
    mocker.patch(
        "llama_hub.jsondata.base.JSONDataReader.load_data",
        return_value=[Document(text=text) for text in texts],
    )

    result = JSONLoader.load_data(content)

    assert "doc_id" in result
    assert "data" in result
    assert result["data"] == [{"content": text, "meta_data": {"url": content_digest}} for text in texts]

    # The doc id is expected to hash the digest followed by the comma-joined texts.
    doc_id_seed = content_digest + ", ".join(texts)
    assert result["doc_id"] == hashlib.sha256(doc_id_seed.encode()).hexdigest()