Add support for image dataset (#571)

Co-authored-by: Rupesh Bansal <rupeshbansal@Shankars-MacBook-Air.local>
This commit is contained in:
Rupesh Bansal
2023-10-04 09:50:40 +05:30
committed by GitHub
parent 55e9a1cbd6
commit d0af018b8d
19 changed files with 498 additions and 31 deletions

View File

@@ -186,6 +186,34 @@ class TestChromaDbCollection(unittest.TestCase):
# Should still be 1, not 2.
self.assertEqual(app.db.count(), 1)
def test_add_with_skip_embedding(self):
    """
    Test adding a record with caller-supplied embeddings (skip_embedding=True).

    After the add, the record must be counted, be retrievable by id through
    `get`, and be returned by `query` when the query vector is also passed
    with skip_embedding=True.
    """
    # Start with a clean app
    self.app_with_settings.reset()

    # Collection should be empty when created
    self.assertEqual(self.app_with_settings.db.count(), 0)

    self.app_with_settings.db.add(
        embeddings=[[0, 0, 0]],
        documents=["document"],
        metadatas=[{"value": "somevalue"}],
        ids=["id"],
        skip_embedding=True,
    )
    # After adding, should contain one item
    self.assertEqual(self.app_with_settings.db.count(), 1)

    # Validate if the get utility of the database is working as expected
    data = self.app_with_settings.db.get(["id"], limit=1)
    expected_value = {
        "documents": ["document"],
        "embeddings": None,
        "ids": ["id"],
        "metadatas": [{"value": "somevalue"}],
    }
    self.assertEqual(data, expected_value)

    # Validate if the query utility of the database is working as expected
    data = self.app_with_settings.db.query(input_query=[0, 0, 0], where={}, n_results=1, skip_embedding=True)
    expected_value = ["document"]
    self.assertEqual(data, expected_value)
def test_collections_are_persistent(self):
"""
Test that a collection can be picked up later.

View File

@@ -1,14 +1,109 @@
import os
import unittest
from unittest.mock import patch
from embedchain.config import ElasticsearchDBConfig
from embedchain import App
from embedchain.config import AppConfig, ElasticsearchDBConfig
from embedchain.vectordb.elasticsearch import ElasticsearchDB
from embedchain.embedder.gpt4all import GPT4AllEmbedder
class TestEsDB(unittest.TestCase):
def setUp(self):
    """Store an ElasticsearchDBConfig built with a placeholder URL on the test case."""
    mock_url = "http://mock-url.net"
    self.es_config = ElasticsearchDBConfig(es_url=mock_url)
@patch("embedchain.vectordb.elasticsearch.Elasticsearch")
def test_setUp(self, mock_client):
    """Check that `db.client` equals the return value of the patched Elasticsearch class."""
    es_config = ElasticsearchDBConfig(es_url="https://localhost:9200")
    self.db = ElasticsearchDB(config=es_config)
    self.vector_dim = 384
    self.app = App(
        config=AppConfig(collection_name=False, collect_metrics=False),
        db=self.db,
    )

    # The patched class's return value stands in for the client instance.
    self.assertEqual(self.db.client, mock_client.return_value)
@patch("embedchain.vectordb.elasticsearch.Elasticsearch")
def test_query(self, mock_client):
    """
    Add two documents with skip_embedding=False, then check that `query`
    returns the texts carried by the mocked Elasticsearch search hits.
    """
    self.db = ElasticsearchDB(config=ElasticsearchDBConfig(es_url="https://localhost:9200"))
    self.app = App(
        config=AppConfig(collection_name=False, collect_metrics=False),
        db=self.db,
        embedder=GPT4AllEmbedder(),
    )

    # The patched class's return value stands in for the client instance.
    es_client = mock_client.return_value
    self.assertEqual(self.db.client, es_client)

    # Dummy records to store.
    vectors = [[1, 2, 3], [4, 5, 6]]
    texts = ["This is a document.", "This is another document."]
    metas = [{}, {}]
    doc_ids = ["doc_1", "doc_2"]
    self.db.add(vectors, texts, metas, doc_ids, skip_embedding=False)

    # Canned search response: one hit per (text, score) pair.
    scored_hits = (("This is a document.", 0.9), ("This is another document.", 0.8))
    es_client.search.return_value = {
        "hits": {
            "hits": [{"_source": {"text": text}, "_score": score} for text, score in scored_hits]
        }
    }

    # Query the database and compare against the texts in the canned hits.
    results = self.db.query(["This is a document"], n_results=2, where={}, skip_embedding=False)
    self.assertEqual(results, ["This is a document.", "This is another document."])
@patch("embedchain.vectordb.elasticsearch.Elasticsearch")
def test_query_with_skip_embedding(self, mock_client):
    """
    Add two documents with skip_embedding=True, then check that `query`
    (also with skip_embedding=True) returns the texts carried by the mocked
    Elasticsearch search hits.
    """
    self.db = ElasticsearchDB(config=ElasticsearchDBConfig(es_url="https://localhost:9200"))
    self.app = App(
        config=AppConfig(collection_name=False, collect_metrics=False),
        db=self.db,
    )

    # The patched class's return value stands in for the client instance.
    es_client = mock_client.return_value
    self.assertEqual(self.db.client, es_client)

    # Dummy records to store.
    vectors = [[1, 2, 3], [4, 5, 6]]
    texts = ["This is a document.", "This is another document."]
    metas = [{}, {}]
    doc_ids = ["doc_1", "doc_2"]
    self.db.add(vectors, texts, metas, doc_ids, skip_embedding=True)

    # Canned search response: one hit per (text, score) pair.
    scored_hits = (("This is a document.", 0.9), ("This is another document.", 0.8))
    es_client.search.return_value = {
        "hits": {
            "hits": [{"_source": {"text": text}, "_score": score} for text, score in scored_hits]
        }
    }

    # Query the database and compare against the texts in the canned hits.
    results = self.db.query(["This is a document"], n_results=2, where={}, skip_embedding=True)
    self.assertEqual(results, ["This is a document.", "This is another document."])
def test_init_without_url(self):
# Make sure it's not loaded from env