diff --git a/docs/advanced/query_configuration.mdx b/docs/advanced/query_configuration.mdx index fe389a06..d141b865 100644 --- a/docs/advanced/query_configuration.mdx +++ b/docs/advanced/query_configuration.mdx @@ -10,6 +10,7 @@ title: '🔍 Query configurations' | embedding_fn| embedding function | chromadb.utils.embedding_functions | \{text-embedding-ada-002\} | | db | vector database (experimental) | BaseVectorDB | ChromaDB | | collection_name | initial collection name for the database | string | embedchain_store | +| collect_metrics | collect anonymous telemetry data to improve embedchain | boolean | true | ## AddConfig diff --git a/embedchain/config/apps/AppConfig.py b/embedchain/config/apps/AppConfig.py index 1e08040e..995a45ff 100644 --- a/embedchain/config/apps/AppConfig.py +++ b/embedchain/config/apps/AppConfig.py @@ -1,4 +1,5 @@ import os +from typing import Optional try: from chromadb.utils import embedding_functions @@ -16,7 +17,15 @@ class AppConfig(BaseAppConfig): Config to initialize an embedchain custom `App` instance, with extra config options. """ - def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None): + def __init__( + self, + log_level=None, + host=None, + port=None, + id=None, + collection_name=None, + collect_metrics: Optional[bool] = None, + ): """ :param log_level: Optional. (String) Debug level ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']. @@ -24,6 +33,7 @@ class AppConfig(BaseAppConfig): :param port: Optional. Port for the database server. :param id: Optional. ID of the app. Document metadata will have this id. :param collection_name: Optional. Collection name for the database. + :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain. 
""" super().__init__( log_level=log_level, @@ -32,6 +42,7 @@ class AppConfig(BaseAppConfig): port=port, id=id, collection_name=collection_name, + collect_metrics=collect_metrics, ) @staticmethod diff --git a/embedchain/config/apps/BaseAppConfig.py b/embedchain/config/apps/BaseAppConfig.py index 4b85c1a4..5706fd49 100644 --- a/embedchain/config/apps/BaseAppConfig.py +++ b/embedchain/config/apps/BaseAppConfig.py @@ -19,6 +19,7 @@ class BaseAppConfig(BaseConfig): port=None, id=None, collection_name=None, + collect_metrics: bool = True, db_type: VectorDatabases = None, vector_dim: VectorDimensions = None, es_config: ElasticsearchDBConfig = None, @@ -32,6 +33,7 @@ class BaseAppConfig(BaseConfig): :param port: Optional. Port for the database server. :param id: Optional. ID of the app. Document metadata will have this id. :param collection_name: Optional. Collection name for the database. + :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain. :param db_type: Optional. type of Vector database to use :param vector_dim: Vector dimension generated by embedding fn :param es_config: Optional. 
elasticsearch database config to be used for connection @@ -49,6 +51,7 @@ class BaseAppConfig(BaseConfig): es_config=es_config, ) self.id = id + self.collect_metrics = True if (collect_metrics is True or collect_metrics is None) else False return @staticmethod diff --git a/embedchain/config/apps/CustomAppConfig.py b/embedchain/config/apps/CustomAppConfig.py index 188c29d7..677c6aeb 100644 --- a/embedchain/config/apps/CustomAppConfig.py +++ b/embedchain/config/apps/CustomAppConfig.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional from chromadb.api.types import Documents, Embeddings from dotenv import load_dotenv @@ -30,6 +30,7 @@ class CustomAppConfig(BaseAppConfig): provider: Providers = None, open_source_app_config=None, deployment_name=None, + collect_metrics: Optional[bool] = None, db_type: VectorDatabases = None, es_config: ElasticsearchDBConfig = None, ): @@ -45,6 +46,7 @@ class CustomAppConfig(BaseAppConfig): :param collection_name: Optional. Collection name for the database. :param provider: Optional. (Providers): LLM Provider to use. :param open_source_app_config: Optional. Config instance needed for open source apps. + :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain. :param db_type: Optional. type of Vector database to use. :param es_config: Optional. 
elasticsearch database config to be used for connection """ @@ -65,6 +67,7 @@ class CustomAppConfig(BaseAppConfig): port=port, id=id, collection_name=collection_name, + collect_metrics=collect_metrics, db_type=db_type, vector_dim=CustomAppConfig.get_vector_dimension(embedding_function=embedding_fn), es_config=es_config, diff --git a/embedchain/config/apps/OpenSourceAppConfig.py b/embedchain/config/apps/OpenSourceAppConfig.py index 8666f125..c2c8e9ed 100644 --- a/embedchain/config/apps/OpenSourceAppConfig.py +++ b/embedchain/config/apps/OpenSourceAppConfig.py @@ -1,3 +1,5 @@ +from typing import Optional + from chromadb.utils import embedding_functions from .BaseAppConfig import BaseAppConfig @@ -8,7 +10,16 @@ class OpenSourceAppConfig(BaseAppConfig): Config to initialize an embedchain custom `OpenSourceApp` instance, with extra config options. """ - def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None, model=None): + def __init__( + self, + log_level=None, + host=None, + port=None, + id=None, + collection_name=None, + collect_metrics: Optional[bool] = None, + model=None, + ): """ :param log_level: Optional. (String) Debug level ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']. @@ -16,6 +27,7 @@ class OpenSourceAppConfig(BaseAppConfig): :param collection_name: Optional. Collection name for the database. :param host: Optional. Hostname for the database server. :param port: Optional. Port for the database server. + :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain. :param model: Optional. GPT4ALL uses the model to instantiate the class. So unlike `App`, it has to be provided before querying. 
""" @@ -28,6 +40,7 @@ class OpenSourceAppConfig(BaseAppConfig): port=port, id=id, collection_name=collection_name, + collect_metrics=collect_metrics, ) @staticmethod diff --git a/embedchain/embedchain.py b/embedchain/embedchain.py index 8cfca455..a59cbee0 100644 --- a/embedchain/embedchain.py +++ b/embedchain/embedchain.py @@ -1,9 +1,14 @@ +import importlib.metadata import logging import os +import threading +from typing import Optional +import requests from dotenv import load_dotenv from langchain.docstore.document import Document from langchain.memory import ConversationBufferMemory +from tenacity import retry, stop_after_attempt, wait_fixed from embedchain.chunkers.base_chunker import BaseChunker from embedchain.config import AddConfig, ChatConfig, QueryConfig @@ -36,6 +41,10 @@ class EmbedChain: self.is_docs_site_instance = False self.online = False + # Send anonymous telemetry + thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",)) + thread_telemetry.start() + def add(self, data_type, url, metadata=None, config: AddConfig = None): """ Adds the data from the given URL to the vector db. @@ -53,10 +62,21 @@ class EmbedChain: data_formatter = DataFormatter(data_type, config) self.user_asks.append([data_type, url, metadata]) - self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata) + documents, _metadatas, _ids, new_chunks = self.load_and_embed( + data_formatter.loader, data_formatter.chunker, url, metadata + ) if data_type in ("docs_site",): self.is_docs_site_instance = True + # Send anonymous telemetry + if self.config.collect_metrics: + # it's quicker to check the variable twice than to count words when they won't be submitted. 
+ word_count = sum([len(document.split(" ")) for document in documents]) + + extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks} + thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata)) + thread_telemetry.start() + def add_local(self, data_type, content, metadata=None, config: AddConfig = None): """ Adds the data you supply to the vector db. @@ -90,6 +110,7 @@ class EmbedChain: :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders. :param metadata: Optional. Metadata associated with the data source. + :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks """ embeddings_data = chunker.create_chunks(loader, src) documents = embeddings_data["documents"] @@ -109,7 +130,8 @@ class EmbedChain: if not data_dict: print(f"All data from {src} already exists in the database.") - return + # Make sure to return a matching return type + return [], [], [], 0 ids = list(data_dict.keys()) documents, metadatas = zip(*data_dict.values()) @@ -126,8 +148,10 @@ class EmbedChain: # Add metadata to each document metadatas_with_metadata = [{**meta, **metadata} for meta in metadatas] - self.db.add(documents=documents, metadatas=list(metadatas_with_metadata), ids=ids) - print((f"Successfully saved {src}. New chunks count: " f"{self.count() - chunks_before_addition}")) + self.db.add(documents=documents, metadatas=metadatas_with_metadata, ids=ids) + count_new_chunks = self.count() - chunks_before_addition + print((f"Successfully saved {src}. 
New chunks count: {count_new_chunks}")) + return list(documents), metadatas_with_metadata, ids, count_new_chunks def _format_result(self, results): return [ @@ -240,6 +264,10 @@ class EmbedChain: answer = self.get_answer_from_llm(prompt, config) + # Send anonymous telemetry + thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",)) + thread_telemetry.start() + if isinstance(answer, str): logging.info(f"Answer: {answer}") return answer @@ -297,6 +325,10 @@ class EmbedChain: memory.chat_memory.add_user_message(input_query) + # Send anonymous telemetry + thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",)) + thread_telemetry.start() + if isinstance(answer, str): memory.chat_memory.add_ai_message(answer) logging.info(f"Answer: {answer}") @@ -321,7 +353,7 @@ class EmbedChain: """ self.collection = self.config.db._get_or_create_collection(collection_name) - def count(self): + def count(self) -> int: """ Count the number of embeddings. @@ -334,4 +366,27 @@ class EmbedChain: Resets the database. Deletes all embeddings irreversibly. `App` has to be reinitialized after using this method. 
""" + # Send anonymous telemetry + thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",)) + thread_telemetry.start() + self.db.reset() + + @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) + def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None): + if not self.config.collect_metrics: + return + + with threading.Lock(): + url = "https://api.embedchain.ai/api/v1/telemetry/" + metadata = { + "app_id": self.config.id, + "version": importlib.metadata.version(__package__ or __name__), + "method": method, + "language": "py", + } + if extra_metadata: + metadata.update(extra_metadata) + + response = requests.post(url, json={"metadata": metadata}) + response.raise_for_status() diff --git a/tests/embedchain/test_add.py b/tests/embedchain/test_add.py index 9373afb1..777aedfd 100644 --- a/tests/embedchain/test_add.py +++ b/tests/embedchain/test_add.py @@ -3,13 +3,14 @@ import unittest from unittest.mock import MagicMock, patch from embedchain import App +from embedchain.config import AppConfig class TestApp(unittest.TestCase): os.environ["OPENAI_API_KEY"] = "test_key" def setUp(self): - self.app = App() + self.app = App(config=AppConfig(collect_metrics=False)) @patch("chromadb.api.models.Collection.Collection.add", MagicMock) def test_add(self): diff --git a/tests/embedchain/test_chat.py b/tests/embedchain/test_chat.py index 29032e4d..07fdcf48 100644 --- a/tests/embedchain/test_chat.py +++ b/tests/embedchain/test_chat.py @@ -3,13 +3,14 @@ import unittest from unittest.mock import patch from embedchain import App +from embedchain.config import AppConfig class TestApp(unittest.TestCase): os.environ["OPENAI_API_KEY"] = "test_key" def setUp(self): - self.app = App() + self.app = App(config=AppConfig(collect_metrics=False)) @patch("embedchain.embedchain.memory", autospec=True) @patch.object(App, "retrieve_from_database", return_value=["Test context"]) diff --git a/tests/embedchain/test_embedchain.py 
b/tests/embedchain/test_embedchain.py index 7d7e924e..bc483d2b 100644 --- a/tests/embedchain/test_embedchain.py +++ b/tests/embedchain/test_embedchain.py @@ -25,7 +25,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase): """ Test if the `App` instance is initialized without a config that does not contain default hosts and ports. """ - config = AppConfig(log_level="DEBUG") + config = AppConfig(log_level="DEBUG", collect_metrics=False) app = App(config) diff --git a/tests/embedchain/test_generate_prompt.py b/tests/embedchain/test_generate_prompt.py index af91635e..bc26d2d5 100644 --- a/tests/embedchain/test_generate_prompt.py +++ b/tests/embedchain/test_generate_prompt.py @@ -2,12 +2,12 @@ import unittest from string import Template from embedchain import App -from embedchain.embedchain import QueryConfig +from embedchain.config import AppConfig, QueryConfig class TestGeneratePrompt(unittest.TestCase): def setUp(self): - self.app = App() + self.app = App(config=AppConfig(collect_metrics=False)) def test_generate_prompt_with_template(self): """ diff --git a/tests/embedchain/test_query.py b/tests/embedchain/test_query.py index da84ffe7..5c47ce8c 100644 --- a/tests/embedchain/test_query.py +++ b/tests/embedchain/test_query.py @@ -3,14 +3,14 @@ import unittest from unittest.mock import MagicMock, patch from embedchain import App -from embedchain.embedchain import QueryConfig +from embedchain.config import AppConfig, QueryConfig class TestApp(unittest.TestCase): os.environ["OPENAI_API_KEY"] = "test_key" def setUp(self): - self.app = App() + self.app = App(config=AppConfig(collect_metrics=False)) @patch("chromadb.api.models.Collection.Collection.add", MagicMock) def test_query(self): diff --git a/tests/vectordb/test_chroma_db.py b/tests/vectordb/test_chroma_db.py index 37a62d57..37252e4b 100644 --- a/tests/vectordb/test_chroma_db.py +++ b/tests/vectordb/test_chroma_db.py @@ -39,7 +39,7 @@ class TestChromaDbHostsInit(unittest.TestCase): host = "test-host" port = "1234" - 
config = AppConfig(host=host, port=port) + config = AppConfig(host=host, port=port, collect_metrics=False) _app = App(config) @@ -54,7 +54,7 @@ class TestChromaDbHostsNone(unittest.TestCase): Test if the `App` instance is initialized without default hosts and ports. """ - _app = App() + _app = App(config=AppConfig(collect_metrics=False)) self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None) self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None) @@ -68,7 +68,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase): """ config = AppConfig(log_level="DEBUG") - _app = App(config) + _app = App(config=AppConfig(collect_metrics=False)) self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None) self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None) @@ -82,7 +82,7 @@ class TestChromaDbDuplicateHandling: # Start with a clean app App().reset() - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.collection.add(embeddings=[[0, 0, 0]], ids=["0"]) app.collection.add(embeddings=[[0, 0, 0]], ids=["0"]) assert "Insert of existing embedding ID: 0" in caplog.text @@ -97,7 +97,7 @@ class TestChromaDbDuplicateHandling: # Start with a clean app App().reset() - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.set_collection("test_collection_1") app.collection.add(embeddings=[[0, 0, 0]], ids=["0"]) app.set_collection("test_collection_2") @@ -111,7 +111,7 @@ class TestChromaDbCollection(unittest.TestCase): """ Test if the `App` instance is initialized with the correct default collection name. """ - app = App() + app = App(config=AppConfig(collect_metrics=False)) self.assertEqual(app.collection.name, "embedchain_store") @@ -119,7 +119,7 @@ class TestChromaDbCollection(unittest.TestCase): """ Test if the `App` instance is initialized with the correct custom collection name. 
""" - config = AppConfig(collection_name="test_collection") + config = AppConfig(collection_name="test_collection", collect_metrics=False) app = App(config) self.assertEqual(app.collection.name, "test_collection") @@ -128,7 +128,7 @@ class TestChromaDbCollection(unittest.TestCase): """ Test if the `App` collection is correctly switched using the `set_collection` method. """ - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.set_collection("test_collection") self.assertEqual(app.collection.name, "test_collection") @@ -140,7 +140,7 @@ class TestChromaDbCollection(unittest.TestCase): # Start with a clean app App().reset() - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.set_collection("test_collection_1") # Collection should be empty when created self.assertEqual(app.count(), 0) @@ -166,12 +166,12 @@ class TestChromaDbCollection(unittest.TestCase): # Start with a clean app App().reset() - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.set_collection("test_collection_1") app.collection.add(embeddings=[[0, 0, 0]], ids=["0"]) del app - app = App() + app = App(config=AppConfig(collect_metrics=False)) app.set_collection("test_collection_1") self.assertEqual(app.count(), 1) @@ -185,8 +185,8 @@ class TestChromaDbCollection(unittest.TestCase): App().reset() # Create two apps - app1 = App(AppConfig(collection_name="test_collection_1")) - app2 = App(AppConfig(collection_name="test_collection_2")) + app1 = App(AppConfig(collection_name="test_collection_1", collect_metrics=False)) + app2 = App(AppConfig(collection_name="test_collection_2", collect_metrics=False)) # app2 has been created last, but adding to app1 will still write to collection 1. 
app1.collection.add(embeddings=[0, 0, 0], ids=["0"]) @@ -211,8 +211,8 @@ class TestChromaDbCollection(unittest.TestCase): App().reset() # Create two apps - app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1")) - app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2")) + app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False)) + app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False)) # Add data app1.collection.add(embeddings=[[0, 0, 0], [1, 1, 1]], ids=["0", "1"]) @@ -231,10 +231,10 @@ class TestChromaDbCollection(unittest.TestCase): # Create four apps. # app1, which we are about to reset, shares an app with one, and an id with the other, none with the last. - app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1")) - app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2")) - app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1")) - app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4")) + app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False)) + app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False)) + app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1", collect_metrics=False)) + app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4", collect_metrics=False)) # Each one of them get data app1.collection.add(embeddings=[0, 0, 0], ids=["1"]) @@ -246,10 +246,10 @@ class TestChromaDbCollection(unittest.TestCase): app1.reset() # Reinstantiate them - app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1")) - app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2")) - app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3")) - app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3")) 
+ app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False)) + app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False)) + app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3", collect_metrics=False)) + app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3", collect_metrics=False)) # All should be empty self.assertEqual(app1.count(), 0)