feat: anonymous telemetry (#423)

This commit is contained in:
cachho
2023-08-12 01:27:11 +02:00
committed by GitHub
parent 1e0d967bb5
commit 163f437582
12 changed files with 126 additions and 38 deletions

View File

@@ -10,6 +10,7 @@ title: '🔍 Query configurations'
| embedding_fn| embedding function | chromadb.utils.embedding_functions | \{text-embedding-ada-002\} |
| db | vector database (experimental) | BaseVectorDB | ChromaDB |
| collection_name | initial collection name for the database | string | embedchain_store |
| collect_metrics | collect anonymous telemetry data to improve embedchain | boolean | true |
## AddConfig

View File

@@ -1,4 +1,5 @@
import os
from typing import Optional
try:
from chromadb.utils import embedding_functions
@@ -16,7 +17,15 @@ class AppConfig(BaseAppConfig):
Config to initialize an embedchain custom `App` instance, with extra config options.
"""
def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None):
def __init__(
self,
log_level=None,
host=None,
port=None,
id=None,
collection_name=None,
collect_metrics: Optional[bool] = None,
):
"""
:param log_level: Optional. (String) Debug level
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
@@ -24,6 +33,7 @@ class AppConfig(BaseAppConfig):
:param port: Optional. Port for the database server.
:param id: Optional. ID of the app. Document metadata will have this id.
:param collection_name: Optional. Collection name for the database.
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
"""
super().__init__(
log_level=log_level,
@@ -32,6 +42,7 @@ class AppConfig(BaseAppConfig):
port=port,
id=id,
collection_name=collection_name,
collect_metrics=collect_metrics,
)
@staticmethod

View File

@@ -19,6 +19,7 @@ class BaseAppConfig(BaseConfig):
port=None,
id=None,
collection_name=None,
collect_metrics: bool = True,
db_type: VectorDatabases = None,
vector_dim: VectorDimensions = None,
es_config: ElasticsearchDBConfig = None,
@@ -32,6 +33,7 @@ class BaseAppConfig(BaseConfig):
:param port: Optional. Port for the database server.
:param id: Optional. ID of the app. Document metadata will have this id.
:param collection_name: Optional. Collection name for the database.
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
:param db_type: Optional. type of Vector database to use
:param vector_dim: Vector dimension generated by embedding fn
:param es_config: Optional. elasticsearch database config to be used for connection
@@ -49,6 +51,7 @@ class BaseAppConfig(BaseConfig):
es_config=es_config,
)
self.id = id
self.collect_metrics = True if (collect_metrics is True or collect_metrics is None) else False
return
@staticmethod

View File

@@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Optional
from chromadb.api.types import Documents, Embeddings
from dotenv import load_dotenv
@@ -30,6 +30,7 @@ class CustomAppConfig(BaseAppConfig):
provider: Providers = None,
open_source_app_config=None,
deployment_name=None,
collect_metrics: Optional[bool] = None,
db_type: VectorDatabases = None,
es_config: ElasticsearchDBConfig = None,
):
@@ -45,6 +46,7 @@ class CustomAppConfig(BaseAppConfig):
:param collection_name: Optional. Collection name for the database.
:param provider: Optional. (Providers): LLM Provider to use.
:param open_source_app_config: Optional. Config instance needed for open source apps.
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
:param db_type: Optional. type of Vector database to use.
:param es_config: Optional. elasticsearch database config to be used for connection
"""
@@ -65,6 +67,7 @@ class CustomAppConfig(BaseAppConfig):
port=port,
id=id,
collection_name=collection_name,
collect_metrics=collect_metrics,
db_type=db_type,
vector_dim=CustomAppConfig.get_vector_dimension(embedding_function=embedding_fn),
es_config=es_config,

View File

@@ -1,3 +1,5 @@
from typing import Optional
from chromadb.utils import embedding_functions
from .BaseAppConfig import BaseAppConfig
@@ -8,7 +10,16 @@ class OpenSourceAppConfig(BaseAppConfig):
Config to initialize an embedchain custom `OpenSourceApp` instance, with extra config options.
"""
def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None, model=None):
def __init__(
self,
log_level=None,
host=None,
port=None,
id=None,
collection_name=None,
collect_metrics: Optional[bool] = None,
model=None,
):
"""
:param log_level: Optional. (String) Debug level
['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
@@ -16,6 +27,7 @@ class OpenSourceAppConfig(BaseAppConfig):
:param collection_name: Optional. Collection name for the database.
:param host: Optional. Hostname for the database server.
:param port: Optional. Port for the database server.
:param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
:param model: Optional. GPT4ALL uses the model to instantiate the class.
So unlike `App`, it has to be provided before querying.
"""
@@ -28,6 +40,7 @@ class OpenSourceAppConfig(BaseAppConfig):
port=port,
id=id,
collection_name=collection_name,
collect_metrics=collect_metrics,
)
@staticmethod

View File

@@ -1,9 +1,14 @@
import importlib.metadata
import logging
import os
import threading
from typing import Optional
import requests
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from tenacity import retry, stop_after_attempt, wait_fixed
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config import AddConfig, ChatConfig, QueryConfig
@@ -36,6 +41,10 @@ class EmbedChain:
self.is_docs_site_instance = False
self.online = False
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
thread_telemetry.start()
def add(self, data_type, url, metadata=None, config: AddConfig = None):
"""
Adds the data from the given URL to the vector db.
@@ -53,10 +62,21 @@ class EmbedChain:
data_formatter = DataFormatter(data_type, config)
self.user_asks.append([data_type, url, metadata])
self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
documents, _metadatas, _ids, new_chunks = self.load_and_embed(
data_formatter.loader, data_formatter.chunker, url, metadata
)
if data_type in ("docs_site",):
self.is_docs_site_instance = True
# Send anonymous telemetry
if self.config.collect_metrics:
# it's quicker to check the variable twice than to count words when they won't be submitted.
word_count = sum([len(document.split(" ")) for document in documents])
extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks}
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
thread_telemetry.start()
def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
"""
Adds the data you supply to the vector db.
@@ -90,6 +110,7 @@ class EmbedChain:
:param src: The data to be handled by the loader. Can be a URL for
remote sources or local content for local loaders.
:param metadata: Optional. Metadata associated with the data source.
:return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
"""
embeddings_data = chunker.create_chunks(loader, src)
documents = embeddings_data["documents"]
@@ -109,7 +130,8 @@ class EmbedChain:
if not data_dict:
print(f"All data from {src} already exists in the database.")
return
# Make sure to return a matching return type
return [], [], [], 0
ids = list(data_dict.keys())
documents, metadatas = zip(*data_dict.values())
@@ -126,8 +148,10 @@ class EmbedChain:
# Add metadata to each document
metadatas_with_metadata = [{**meta, **metadata} for meta in metadatas]
self.db.add(documents=documents, metadatas=list(metadatas_with_metadata), ids=ids)
print((f"Successfully saved {src}. New chunks count: " f"{self.count() - chunks_before_addition}"))
self.db.add(documents=documents, metadatas=metadatas_with_metadata, ids=ids)
count_new_chunks = self.count() - chunks_before_addition
print((f"Successfully saved {src}. New chunks count: {count_new_chunks}"))
return list(documents), metadatas_with_metadata, ids, count_new_chunks
def _format_result(self, results):
return [
@@ -240,6 +264,10 @@ class EmbedChain:
answer = self.get_answer_from_llm(prompt, config)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",))
thread_telemetry.start()
if isinstance(answer, str):
logging.info(f"Answer: {answer}")
return answer
@@ -297,6 +325,10 @@ class EmbedChain:
memory.chat_memory.add_user_message(input_query)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",))
thread_telemetry.start()
if isinstance(answer, str):
memory.chat_memory.add_ai_message(answer)
logging.info(f"Answer: {answer}")
@@ -321,7 +353,7 @@ class EmbedChain:
"""
self.collection = self.config.db._get_or_create_collection(collection_name)
def count(self):
def count(self) -> int:
"""
Count the number of embeddings.
@@ -334,4 +366,27 @@ class EmbedChain:
Resets the database. Deletes all embeddings irreversibly.
`App` has to be reinitialized after using this method.
"""
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",))
thread_telemetry.start()
self.db.reset()
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None):
    """
    Post a single anonymous telemetry event to the embedchain telemetry endpoint.

    Returns immediately without any network access when
    `self.config.collect_metrics` is falsy. A failing attempt (connection
    error, timeout, or an error status surfaced by `raise_for_status`)
    raises, and the `retry` decorator re-runs the method: at most 3 attempts,
    1 second apart.

    :param method: Name of the operation that triggered the event,
    e.g. "init", "add", "query", "chat" or "reset".
    :param extra_metadata: Optional. Extra key/value pairs merged into the
    event metadata (keys given here override the default ones).
    """
    if not self.config.collect_metrics:
        return

    # NOTE: the previous `with threading.Lock():` created a brand-new lock on
    # every call, so it never excluded another thread. Nothing shared is
    # mutated here, so the no-op lock is simply dropped.
    url = "https://api.embedchain.ai/api/v1/telemetry/"
    metadata = {
        "app_id": self.config.id,
        "version": importlib.metadata.version(__package__ or __name__),
        "method": method,
        "language": "py",
    }
    if extra_metadata:
        metadata.update(extra_metadata)

    # Always bound the request: without a timeout an unreachable endpoint can
    # block this (non-daemon) telemetry thread forever and keep the
    # interpreter from exiting.
    response = requests.post(url, json={"metadata": metadata}, timeout=10)
    response.raise_for_status()

View File

@@ -3,13 +3,14 @@ import unittest
from unittest.mock import MagicMock, patch
from embedchain import App
from embedchain.config import AppConfig
class TestApp(unittest.TestCase):
os.environ["OPENAI_API_KEY"] = "test_key"
def setUp(self):
self.app = App()
self.app = App(config=AppConfig(collect_metrics=False))
@patch("chromadb.api.models.Collection.Collection.add", MagicMock)
def test_add(self):

View File

@@ -3,13 +3,14 @@ import unittest
from unittest.mock import patch
from embedchain import App
from embedchain.config import AppConfig
class TestApp(unittest.TestCase):
os.environ["OPENAI_API_KEY"] = "test_key"
def setUp(self):
self.app = App()
self.app = App(config=AppConfig(collect_metrics=False))
@patch("embedchain.embedchain.memory", autospec=True)
@patch.object(App, "retrieve_from_database", return_value=["Test context"])

View File

@@ -25,7 +25,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase):
"""
Test if the `App` instance is initialized without a config that does not contain default hosts and ports.
"""
config = AppConfig(log_level="DEBUG")
config = AppConfig(log_level="DEBUG", collect_metrics=False)
app = App(config)

View File

@@ -2,12 +2,12 @@ import unittest
from string import Template
from embedchain import App
from embedchain.embedchain import QueryConfig
from embedchain.config import AppConfig, QueryConfig
class TestGeneratePrompt(unittest.TestCase):
def setUp(self):
self.app = App()
self.app = App(config=AppConfig(collect_metrics=False))
def test_generate_prompt_with_template(self):
"""

View File

@@ -3,14 +3,14 @@ import unittest
from unittest.mock import MagicMock, patch
from embedchain import App
from embedchain.embedchain import QueryConfig
from embedchain.config import AppConfig, QueryConfig
class TestApp(unittest.TestCase):
os.environ["OPENAI_API_KEY"] = "test_key"
def setUp(self):
self.app = App()
self.app = App(config=AppConfig(collect_metrics=False))
@patch("chromadb.api.models.Collection.Collection.add", MagicMock)
def test_query(self):

View File

@@ -39,7 +39,7 @@ class TestChromaDbHostsInit(unittest.TestCase):
host = "test-host"
port = "1234"
config = AppConfig(host=host, port=port)
config = AppConfig(host=host, port=port, collect_metrics=False)
_app = App(config)
@@ -54,7 +54,7 @@ class TestChromaDbHostsNone(unittest.TestCase):
Test if the `App` instance is initialized without default hosts and ports.
"""
_app = App()
_app = App(config=AppConfig(collect_metrics=False))
self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None)
self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None)
@@ -68,7 +68,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase):
"""
config = AppConfig(log_level="DEBUG")
_app = App(config)
_app = App(config=AppConfig(collect_metrics=False))
self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None)
self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None)
@@ -82,7 +82,7 @@ class TestChromaDbDuplicateHandling:
# Start with a clean app
App().reset()
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
assert "Insert of existing embedding ID: 0" in caplog.text
@@ -97,7 +97,7 @@ class TestChromaDbDuplicateHandling:
# Start with a clean app
App().reset()
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.set_collection("test_collection_1")
app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
app.set_collection("test_collection_2")
@@ -111,7 +111,7 @@ class TestChromaDbCollection(unittest.TestCase):
"""
Test if the `App` instance is initialized with the correct default collection name.
"""
app = App()
app = App(config=AppConfig(collect_metrics=False))
self.assertEqual(app.collection.name, "embedchain_store")
@@ -119,7 +119,7 @@ class TestChromaDbCollection(unittest.TestCase):
"""
Test if the `App` instance is initialized with the correct custom collection name.
"""
config = AppConfig(collection_name="test_collection")
config = AppConfig(collection_name="test_collection", collect_metrics=False)
app = App(config)
self.assertEqual(app.collection.name, "test_collection")
@@ -128,7 +128,7 @@ class TestChromaDbCollection(unittest.TestCase):
"""
Test if the `App` collection is correctly switched using the `set_collection` method.
"""
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.set_collection("test_collection")
self.assertEqual(app.collection.name, "test_collection")
@@ -140,7 +140,7 @@ class TestChromaDbCollection(unittest.TestCase):
# Start with a clean app
App().reset()
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.set_collection("test_collection_1")
# Collection should be empty when created
self.assertEqual(app.count(), 0)
@@ -166,12 +166,12 @@ class TestChromaDbCollection(unittest.TestCase):
# Start with a clean app
App().reset()
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.set_collection("test_collection_1")
app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
del app
app = App()
app = App(config=AppConfig(collect_metrics=False))
app.set_collection("test_collection_1")
self.assertEqual(app.count(), 1)
@@ -185,8 +185,8 @@ class TestChromaDbCollection(unittest.TestCase):
App().reset()
# Create two apps
app1 = App(AppConfig(collection_name="test_collection_1"))
app2 = App(AppConfig(collection_name="test_collection_2"))
app1 = App(AppConfig(collection_name="test_collection_1", collect_metrics=False))
app2 = App(AppConfig(collection_name="test_collection_2", collect_metrics=False))
# app2 has been created last, but adding to app1 will still write to collection 1.
app1.collection.add(embeddings=[0, 0, 0], ids=["0"])
@@ -211,8 +211,8 @@ class TestChromaDbCollection(unittest.TestCase):
App().reset()
# Create two apps
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
# Add data
app1.collection.add(embeddings=[[0, 0, 0], [1, 1, 1]], ids=["0", "1"])
@@ -231,10 +231,10 @@ class TestChromaDbCollection(unittest.TestCase):
# Create four apps.
# app1, which we are about to reset, shares an app with one, and an id with the other, none with the last.
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1"))
app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4"))
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1", collect_metrics=False))
app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4", collect_metrics=False))
# Each one of them get data
app1.collection.add(embeddings=[0, 0, 0], ids=["1"])
@@ -246,10 +246,10 @@ class TestChromaDbCollection(unittest.TestCase):
app1.reset()
# Reinstantiate them
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3"))
app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3"))
app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3", collect_metrics=False))
app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3", collect_metrics=False))
# All should be empty
self.assertEqual(app1.count(), 0)