feat: anonymous telemetry (#423)

Author: cachho
Date: 2023-08-12 01:27:11 +02:00
Committed by: GitHub
Parent: 1e0d967bb5
Commit: 163f437582
12 changed files with 126 additions and 38 deletions
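
The change below is gated on config.collect_metrics and fires an anonymous event from a background thread on init, add, query, chat, and reset. Based on _send_telemetry_event in the last hunk, the JSON posted to https://api.embedchain.ai/api/v1/telemetry/ has roughly the shape sketched here; the concrete values are illustrative placeholders, not taken from a real event.

# Rough shape of the telemetry payload (illustrative values only).
payload = {
    "metadata": {
        "app_id": "<anonymous-app-id>",  # self.config.id
        "version": "0.0.0",              # importlib.metadata.version of the package (placeholder)
        "method": "add",                 # one of: init, add, query, chat, reset
        "language": "py",
        # extra fields attached only to "add" events:
        "data_type": "docs_site",        # example data type
        "word_count": 1234,              # illustrative
        "chunks_count": 10,              # illustrative
    }
}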


@@ -1,9 +1,14 @@
import importlib.metadata
import logging
import os
import threading
from typing import Optional
import requests
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.memory import ConversationBufferMemory
from tenacity import retry, stop_after_attempt, wait_fixed
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config import AddConfig, ChatConfig, QueryConfig
@@ -36,6 +41,10 @@ class EmbedChain:
self.is_docs_site_instance = False
self.online = False
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
thread_telemetry.start()
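# The telemetry thread above is started without join(), so __init__ returns
# immediately and the event is fire-and-forget.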
def add(self, data_type, url, metadata=None, config: AddConfig = None):
"""
Adds the data from the given URL to the vector db.
@@ -53,10 +62,21 @@ class EmbedChain:
data_formatter = DataFormatter(data_type, config)
self.user_asks.append([data_type, url, metadata])
self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
documents, _metadatas, _ids, new_chunks = self.load_and_embed(
data_formatter.loader, data_formatter.chunker, url, metadata
)
if data_type in ("docs_site",):
self.is_docs_site_instance = True
# Send anonymous telemetry
if self.config.collect_metrics:
# Checking collect_metrics here as well avoids counting words for events that will never be submitted.
word_count = sum([len(document.split(" ")) for document in documents])
extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks}
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
thread_telemetry.start()
def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
"""
Adds the data you supply to the vector db.
@@ -90,6 +110,7 @@ class EmbedChain:
:param src: The data to be handled by the loader. Can be a URL for
remote sources or local content for local loaders.
:param metadata: Optional. Metadata associated with the data source.
:return: (list) documents (embedded text), (list) metadatas, (list) ids, (int) number of new chunks
"""
embeddings_data = chunker.create_chunks(loader, src)
documents = embeddings_data["documents"]
@@ -109,7 +130,8 @@ class EmbedChain:
if not data_dict:
print(f"All data from {src} already exists in the database.")
return
# Return the same tuple shape as the successful path
return [], [], [], 0
ids = list(data_dict.keys())
documents, metadatas = zip(*data_dict.values())
@@ -126,8 +148,10 @@ class EmbedChain:
# Add metadata to each document
metadatas_with_metadata = [{**meta, **metadata} for meta in metadatas]
self.db.add(documents=documents, metadatas=list(metadatas_with_metadata), ids=ids)
print((f"Successfully saved {src}. New chunks count: " f"{self.count() - chunks_before_addition}"))
self.db.add(documents=documents, metadatas=metadatas_with_metadata, ids=ids)
count_new_chunks = self.count() - chunks_before_addition
print((f"Successfully saved {src}. New chunks count: {count_new_chunks}"))
return list(documents), metadatas_with_metadata, ids, count_new_chunks
def _format_result(self, results):
return [
@@ -240,6 +264,10 @@ class EmbedChain:
answer = self.get_answer_from_llm(prompt, config)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",))
thread_telemetry.start()
if isinstance(answer, str):
logging.info(f"Answer: {answer}")
return answer
@@ -297,6 +325,10 @@ class EmbedChain:
memory.chat_memory.add_user_message(input_query)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",))
thread_telemetry.start()
if isinstance(answer, str):
memory.chat_memory.add_ai_message(answer)
logging.info(f"Answer: {answer}")
@@ -321,7 +353,7 @@ class EmbedChain:
"""
self.collection = self.config.db._get_or_create_collection(collection_name)
def count(self):
def count(self) -> int:
"""
Count the number of embeddings.
@@ -334,4 +366,27 @@ class EmbedChain:
Resets the database. Deletes all embeddings irreversibly.
`App` has to be reinitialized after using this method.
"""
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",))
thread_telemetry.start()
self.db.reset()
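# The method below posts one anonymous event; tenacity retries the POST up to
# three attempts, waiting one second between tries, then gives up.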
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None):
if not self.config.collect_metrics:
return
with threading.Lock():
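# Note: threading.Lock() creates a fresh lock object on every call, so this
# `with` block does not actually serialize concurrent telemetry threads.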
url = "https://api.embedchain.ai/api/v1/telemetry/"
metadata = {
"app_id": self.config.id,
"version": importlib.metadata.version(__package__ or __name__),
"method": method,
"language": "py",
}
if extra_metadata:
metadata.update(extra_metadata)
response = requests.post(url, json={"metadata": metadata})
response.raise_for_status()
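
Telemetry is skipped entirely when collect_metrics is false, since every call checks the flag before doing any work. A minimal opt-out sketch, assuming the app-level config exposes a settable collect_metrics flag (the exact config class and constructor signature are assumptions, not shown in this hunk):

# Minimal opt-out sketch; AppConfig(collect_metrics=False) is an assumed signature.
from embedchain import App
from embedchain.config import AppConfig

config = AppConfig(collect_metrics=False)  # assumption: the flag is settable at construction
app = App(config)                          # with the flag off, _send_telemetry_event returns early

Because the POST runs on a plain (non-daemon) thread with up to three attempts one second apart, a failing endpoint never raises in the caller, though it can briefly delay interpreter shutdown.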