[Feature]: Add posthog anonymous telemetry and update docs (#867)

This commit is contained in:
Deshraj Yadav
2023-10-29 01:20:21 -07:00
committed by GitHub
parent 35c2b83015
commit 81336668b3
34 changed files with 242 additions and 195 deletions

View File

@@ -24,7 +24,7 @@ Once you have obtained the key, you can use it like this:
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['OPENAI_API_KEY'] = 'xxx'
@@ -52,7 +52,7 @@ To use Azure OpenAI embedding model, you have to set some of the azure openai re
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/"
@@ -90,7 +90,7 @@ GPT4All supports generating high quality embeddings of arbitrary length document
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load embedding model configuration from config.yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -119,7 +119,7 @@ Hugging Face supports generating embeddings of arbitrary length documents of tex
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load embedding model configuration from config.yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -150,7 +150,7 @@ Embedchain supports Google's VertexAI embeddings model through a simple interfac
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load embedding model configuration from config.yaml file
app = App.from_config(yaml_path="config.yaml")

View File

@@ -26,7 +26,7 @@ Once you have obtained the key, you can use it like this:
```python
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['OPENAI_API_KEY'] = 'xxx'
@@ -41,7 +41,7 @@ If you are looking to configure the different parameters of the LLM, you can do
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['OPENAI_API_KEY'] = 'xxx'
@@ -71,7 +71,7 @@ To use Azure OpenAI model, you have to set some of the azure openai related envi
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/"
@@ -110,7 +110,7 @@ To use anthropic's model, please set the `ANTHROPIC_API_KEY` which you find on t
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["ANTHROPIC_API_KEY"] = "xxx"
@@ -147,7 +147,7 @@ Once you have the API key, you are all set to use it with Embedchain.
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["COHERE_API_KEY"] = "xxx"
@@ -180,7 +180,7 @@ GPT4all is a free-to-use, locally running, privacy-aware chatbot. No GPU or inte
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load llm configuration from config.yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -212,7 +212,7 @@ Once you have the key, load the app using the config yaml file:
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["JINACHAT_API_KEY"] = "xxx"
# load llm configuration from config.yaml file
@@ -248,7 +248,7 @@ Once you have the token, load the app using the config yaml file:
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["HUGGINGFACE_ACCESS_TOKEN"] = "xxx"
@@ -278,7 +278,7 @@ Once you have the token, load the app using the config yaml file:
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["REPLICATE_API_TOKEN"] = "xxx"
@@ -305,7 +305,7 @@ Setup Google Cloud Platform application credentials by following the instruction
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load llm configuration from config.yaml file
app = App.from_config(yaml_path="config.yaml")

View File

@@ -22,7 +22,7 @@ Utilizing a vector database alongside Embedchain is a seamless process. All you
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load chroma configuration from yaml file
app = App.from_config(yaml_path="config1.yaml")
@@ -61,7 +61,7 @@ pip install --upgrade 'embedchain[elasticsearch]'
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load elasticsearch configuration from yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -89,7 +89,7 @@ pip install --upgrade 'embedchain[opensearch]'
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load opensearch configuration from yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -125,7 +125,7 @@ Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN`
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com'
os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx'
@@ -164,7 +164,7 @@ In order to use Pinecone as vector database, set the environment variables `PINE
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load pinecone configuration from yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -187,7 +187,7 @@ In order to use Qdrant as a vector database, set the environment variables `QDRA
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load qdrant configuration from yaml file
app = App.from_config(yaml_path="config.yaml")
@@ -207,7 +207,7 @@ In order to use Weaviate as a vector database, set the environment variables `WE
<CodeGroup>
```python main.py
from embedchain import App
from embedchain import Pipeline as App
# load weaviate configuration from yaml file
app = App.from_config(yaml_path="config.yaml")

View File

@@ -5,7 +5,7 @@ title: '📊 CSV'
To add any csv file, use the data_type as `csv`. `csv` allows remote urls and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add('https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv', data_type="csv")

View File

@@ -35,7 +35,7 @@ Default behavior is to create a persistent vector db in the directory **./db**.
Create a local index:
```python
from embedchain import App
from embedchain import Pipeline as App
naval_chat_bot = App()
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
@@ -45,7 +45,7 @@ naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Alma
You can reuse the local index with the same code, but without adding new documents:
```python
from embedchain import App
from embedchain import Pipeline as App
naval_chat_bot = App()
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))

View File

@@ -5,7 +5,7 @@ title: '📚🌐 Code documentation'
To add any code documentation website as a loader, use the data_type as `docs_site`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add("https://docs.embedchain.ai/", data_type="docs_site")

View File

@@ -7,7 +7,7 @@ title: '📄 Docx file'
To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add('https://example.com/content/intro.docx', data_type="docx")

View File

@@ -5,7 +5,7 @@ title: '📝 Mdx file'
To add any `.mdx` file to your app, use the data_type (first argument to `.add()` method) as `mdx`. Note that this only supports mdx files present on the local machine, so this should be a file path. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add('path/to/file.mdx', data_type='mdx')

View File

@@ -8,7 +8,7 @@ To load a notion page, use the data_type as `notion`. Since it is hard to automa
The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -5,7 +5,7 @@ title: '📰 PDF file'
To add any pdf file, use the data_type as `pdf_file`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -5,7 +5,7 @@ title: '❓💬 Question and answer pair'
QnA pair is a local data type. To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -5,7 +5,7 @@ title: '🗺️ Sitemap'
Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -7,7 +7,7 @@ title: '📝 Text'
Text is a local data type. To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -5,7 +5,7 @@ title: '🌐📄 Web page'
To add any web page, use the data_type as `web_page`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -7,7 +7,7 @@ title: '🧾 XML file'
To add any xml file, use the data_type as `xml`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()

View File

@@ -6,7 +6,7 @@ title: '🎥📺 Youtube video'
To add any youtube video to your app, use the data_type (first argument to `.add()` method) as `youtube_video`. Eg:
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add('a_valid_youtube_url_here', data_type='youtube_video')

View File

@@ -9,7 +9,7 @@ description: 'Collections of all the frequently asked questions'
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['OPENAI_API_KEY'] = 'xxx'
@@ -36,7 +36,7 @@ llm:
```python main.py
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ['OPENAI_API_KEY'] = 'xxx'

View File

@@ -12,7 +12,7 @@ You can add data from different data sources using the `.add()` method. Then, si
If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. Embedchain will take care of the rest, creating a bot for you.
```python
from embedchain import App
from embedchain import Pipeline as App
naval_bot = App()
# Add online data

View File

@@ -16,22 +16,22 @@ Creating an app involves 3 steps:
<Steps>
<Step title="⚙️ Import app instance">
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
```
</Step>
<Step title="🗃️ Add data sources">
```python
# Add different data sources
elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
elon_bot.add("https://www.forbes.com/profile/elon-musk")
app.add("https://en.wikipedia.org/wiki/Elon_Musk")
app.add("https://www.forbes.com/profile/elon-musk")
# You can also add local data sources such as pdf, csv files etc.
# elon_bot.add("/path/to/file.pdf")
# app.add("/path/to/file.pdf")
```
</Step>
<Step title="💬 Query or chat on your data and get answers">
<Step title="💬 Query or chat or search context on your data">
```python
elon_bot.query("What is the net worth of Elon Musk today?")
app.query("What is the net worth of Elon Musk today?")
# Answer: The net worth of Elon Musk today is $258.7 billion.
```
</Step>
@@ -41,18 +41,18 @@ Putting it together, you can run your first app using the following code. Make s
```python
import os
from embedchain import App
from embedchain import Pipeline as App
os.environ["OPENAI_API_KEY"] = "xxx"
elon_bot = App()
app = App()
# Add different data sources
elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
elon_bot.add("https://www.forbes.com/profile/elon-musk")
app.add("https://en.wikipedia.org/wiki/Elon_Musk")
app.add("https://www.forbes.com/profile/elon-musk")
# You can also add local data sources such as pdf, csv files etc.
# elon_bot.add("/path/to/file.pdf")
# app.add("/path/to/file.pdf")
response = elon_bot.query("What is the net worth of Elon Musk today?")
response = app.query("What is the net worth of Elon Musk today?")
print(response)
# Answer: The net worth of Elon Musk today is $258.7 billion.
```

View File

@@ -39,7 +39,7 @@ os.environ['LANGCHAIN_PROJECT'] = '<your-project>'
```python
from embedchain import App
from embedchain import Pipeline as App
app = App()
app.add("https://en.wikipedia.org/wiki/Elon_Musk")

View File

@@ -71,10 +71,6 @@
"group": "Examples",
"pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"]
},
{
"group": "Pipelines",
"pages": ["pipelines/quickstart"]
},
{
"group": "Community",
"pages": [

View File

@@ -1,44 +0,0 @@
---
title: '🚀 Pipelines'
description: '💡 Start building LLM powered data pipelines in 1 minute'
---
Embedchain lets you build data pipelines on your own data sources and deploy it in production in less than a minute. It can load, index, retrieve, and sync any unstructured data.
Install embedchain python package:
```bash
pip install embedchain
```
Creating a pipeline involves 3 steps:
<Steps>
<Step title="⚙️ Import pipeline instance">
```python
from embedchain import Pipeline
p = Pipeline(name="Elon Musk")
```
</Step>
<Step title="🗃️ Add data sources">
```python
# Add different data sources
p.add("https://en.wikipedia.org/wiki/Elon_Musk")
p.add("https://www.forbes.com/profile/elon-musk")
# You can also add local data sources such as pdf, csv files etc.
# p.add("/path/to/file.pdf")
```
</Step>
<Step title="💬 Deploy your pipeline to Embedchain platform">
```python
p.deploy()
```
</Step>
</Steps>
That's it. Now, head to the [Embedchain platform](https://app.embedchain.ai) and your pipeline is available there. Make sure to set the `OPENAI_API_KEY` 🔑 environment variable in the code.
After you deploy your pipeline to Embedchain platform, you can still add more data sources and update the pipeline multiple times.
Here is a Google Colab notebook for you to get started: [![Open in Colab](https://camo.githubusercontent.com/84f0493939e0c4de4e6dbe113251b4bfb5353e57134ffd9fcab6b8714514d4d1/68747470733a2f2f636f6c61622e72657365617263682e676f6f676c652e636f6d2f6173736574732f636f6c61622d62616467652e737667)](https://colab.research.google.com/drive/1YVXaBO4yqlHZY4ho67GCJ6aD4CHNiScD?usp=sharing)

View File

@@ -15,7 +15,7 @@ class AppConfig(BaseAppConfig):
self,
log_level: str = "WARNING",
id: Optional[str] = None,
collect_metrics: Optional[bool] = None,
collect_metrics: Optional[bool] = True,
collection_name: Optional[str] = None,
):
"""

View File

@@ -16,7 +16,7 @@ class PipelineConfig(BaseAppConfig):
log_level: str = "WARNING",
id: Optional[str] = None,
name: Optional[str] = None,
collect_metrics: Optional[bool] = False,
collect_metrics: Optional[bool] = True,
):
"""
Initializes a configuration class instance for an App. This is the simplest form of an embedchain app.

View File

@@ -1,18 +1,13 @@
import hashlib
import importlib.metadata
import json
import logging
import os
import sqlite3
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional
import requests
from dotenv import load_dotenv
from langchain.docstore.document import Document
from tenacity import retry, stop_after_attempt, wait_fixed
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config import AddConfig, BaseLlmConfig
@@ -24,6 +19,7 @@ from embedchain.llm.base import BaseLlm
from embedchain.loaders.base_loader import BaseLoader
from embedchain.models.data_type import (DataType, DirectDataType,
IndirectDataType, SpecialDataType)
from embedchain.telemetry.posthog import AnonymousTelemetry
from embedchain.utils import detect_datatype
from embedchain.vectordb.base import BaseVectorDB
@@ -89,9 +85,8 @@ class EmbedChain(JSONSerializable):
self.user_asks = []
# Send anonymous telemetry
self.s_id = self.config.id if self.config.id else str(uuid.uuid4())
self.u_id = self._load_or_generate_user_id()
self._telemetry_props = {"class": self.__class__.__name__}
self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
# Establish a connection to the SQLite database
self.connection = sqlite3.connect(SQLITE_PATH)
self.cursor = self.connection.cursor()
@@ -111,12 +106,8 @@ class EmbedChain(JSONSerializable):
"""
)
self.connection.commit()
# NOTE: Uncomment the next two lines when running tests to see if any test fires a telemetry event.
# if (self.config.collect_metrics):
# raise ConnectionRefusedError("Collection of metrics should not be allowed.")
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
thread_telemetry.start()
# Send anonymous telemetry
self.telemetry.capture(event_name="init", properties=self._telemetry_props)
@property
def collect_metrics(self):
@@ -138,29 +129,6 @@ class EmbedChain(JSONSerializable):
raise ValueError(f"Boolean value expected but got {type(value)}.")
self.llm.online = value
def _load_or_generate_user_id(self) -> str:
"""
Loads the user id from the config file if it exists, otherwise generates a new
one and saves it to the config file.
:return: user id
:rtype: str
"""
if not os.path.exists(CONFIG_DIR):
os.makedirs(CONFIG_DIR)
if os.path.exists(CONFIG_FILE):
with open(CONFIG_FILE, "r") as f:
data = json.load(f)
if "user_id" in data:
return data["user_id"]
u_id = str(uuid.uuid4())
with open(CONFIG_FILE, "w") as f:
json.dump({"user_id": u_id}, f)
return u_id
def add(
self,
source: Any,
@@ -259,9 +227,14 @@ class EmbedChain(JSONSerializable):
# it's quicker to check the variable twice than to count words when they won't be submitted.
word_count = data_formatter.chunker.get_word_count(documents)
extra_metadata = {"data_type": data_type.value, "word_count": word_count, "chunks_count": new_chunks}
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
thread_telemetry.start()
# Send anonymous telemetry
event_properties = {
**self._telemetry_props,
"data_type": data_type.value,
"word_count": word_count,
"chunks_count": new_chunks,
}
self.telemetry.capture(event_name="add", properties=event_properties)
return source_hash
@@ -535,9 +508,7 @@ class EmbedChain(JSONSerializable):
answer = self.llm.query(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",))
thread_telemetry.start()
self.telemetry.capture(event_name="query", properties=self._telemetry_props)
return answer
def chat(
@@ -569,10 +540,8 @@ class EmbedChain(JSONSerializable):
"""
contexts = self.retrieve_from_database(input_query=input_query, config=config, where=where)
answer = self.llm.chat(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run)
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",))
thread_telemetry.start()
self.telemetry.capture(event_name="chat", properties=self._telemetry_props)
return answer
@@ -608,34 +577,8 @@ class EmbedChain(JSONSerializable):
Resets the database. Deletes all embeddings irreversibly.
`App` does not have to be reinitialized after using this method.
"""
# Send anonymous telemetry
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",))
thread_telemetry.start()
self.db.reset()
self.cursor.execute("DELETE FROM data_sources WHERE pipeline_id = ?", (self.config.id,))
self.connection.commit()
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None):
"""
Send telemetry event to the embedchain server. This is anonymous. It can be toggled off in `AppConfig`.
"""
if not self.config.collect_metrics:
return
with threading.Lock():
url = "https://api.embedchain.ai/api/v1/telemetry/"
metadata = {
"s_id": self.s_id,
"version": importlib.metadata.version(__package__ or __name__),
"method": method,
"language": "py",
"u_id": self.u_id,
}
if extra_metadata:
metadata.update(extra_metadata)
response = requests.post(url, json={"metadata": metadata})
if response.status_code != 200:
logging.warning(f"Telemetry event failed with status code {response.status_code}")
# Send anonymous telemetry
self.telemetry.capture(event_name="reset", properties=self._telemetry_props)

View File

@@ -18,6 +18,7 @@ from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
from embedchain.helper.json_serializable import register_deserializable
from embedchain.llm.base import BaseLlm
from embedchain.llm.openai import OpenAILlm
from embedchain.telemetry.posthog import AnonymousTelemetry
from embedchain.vectordb.base import BaseVectorDB
from embedchain.vectordb.chroma import ChromaDB
@@ -109,8 +110,9 @@ class Pipeline(EmbedChain):
self.llm = llm or OpenAILlm()
self._init_db()
# setup user id and directory
self.u_id = self._load_or_generate_user_id()
# Send anonymous telemetry
self._telemetry_props = {"class": self.__class__.__name__}
self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
# Establish a connection to the SQLite database
self.connection = sqlite3.connect(SQLITE_PATH)
@@ -131,8 +133,10 @@ class Pipeline(EmbedChain):
"""
)
self.connection.commit()
# Send anonymous telemetry
self.telemetry.capture(event_name="init", properties=self._telemetry_props)
self.user_asks = [] # legacy defaults
self.user_asks = []
if self.auto_deploy:
self.deploy()
@@ -219,6 +223,9 @@ class Pipeline(EmbedChain):
"""
Search for similar documents related to the query in the vector database.
"""
# Send anonymous telemetry
self.telemetry.capture(event_name="search", properties=self._telemetry_props)
# TODO: Search will call the endpoint rather than fetching the data from the db itself when deploy=True.
if self.id is None:
where = {"app_id": self.local_id}
@@ -312,6 +319,9 @@ class Pipeline(EmbedChain):
data_hash, data_type, data_value = result[1], result[2], result[3]
self._process_and_upload_data(data_hash, data_type, data_value)
# Send anonymous telemetry
self.telemetry.capture(event_name="deploy", properties=self._telemetry_props)
@classmethod
def from_config(cls, yaml_path: str, auto_deploy: bool = False):
"""
@@ -347,6 +357,11 @@ class Pipeline(EmbedChain):
embedding_model = EmbedderFactory.create(
embedding_model_provider, embedding_model_config_data.get("config", {})
)
# Send anonymous telemetry
event_properties = {"init_type": "yaml_config"}
AnonymousTelemetry().capture(event_name="init", properties=event_properties)
return cls(
config=pipeline_config,
llm=llm,

View File

View File

@@ -0,0 +1,67 @@
import json
import logging
import os
import uuid
from pathlib import Path
from posthog import Posthog
import embedchain
HOME_DIR = str(Path.home())
CONFIG_DIR = os.path.join(HOME_DIR, ".embedchain")
CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json")
logger = logging.getLogger(__name__)
class AnonymousTelemetry:
    """Send anonymous usage telemetry events to PostHog.

    Telemetry can be disabled in two ways:
      * pass ``enabled=False`` to the constructor, or
      * set the ``EC_TELEMETRY`` environment variable to anything other
        than ``1``/``true``/``yes`` (case-insensitive).

    A random, anonymous user id is generated once and persisted to
    ``~/.embedchain/config.json`` so events from the same machine can be
    correlated without identifying the user.
    """

    def __init__(self, host="https://app.posthog.com", enabled=True):
        """Initialize the PostHog client.

        :param host: PostHog ingestion endpoint.
        :param enabled: Whether telemetry is enabled; may still be
            overridden to False by the ``EC_TELEMETRY`` env var.
        """
        # Public (write-only) project key — safe to ship in client code.
        self.project_api_key = "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO"
        self.host = host
        self.posthog = Posthog(project_api_key=self.project_api_key, host=self.host)
        self.user_id = self.get_user_id()
        self.enabled = enabled

        # Check if telemetry tracking is disabled via environment variable.
        if "EC_TELEMETRY" in os.environ and os.environ["EC_TELEMETRY"].lower() not in [
            "1",
            "true",
            "yes",
        ]:
            self.enabled = False

        if not self.enabled:
            self.posthog.disabled = True

        # Silence posthog logging so a disabled/unreachable backend never
        # pollutes the user's log output.
        posthog_logger = logging.getLogger("posthog")
        posthog_logger.disabled = True

    def get_user_id(self):
        """Load the anonymous user id from the config file, creating and
        persisting a new UUID4 on first use.

        :return: anonymous user id
        :rtype: str
        """
        if not os.path.exists(CONFIG_DIR):
            os.makedirs(CONFIG_DIR)
        if os.path.exists(CONFIG_FILE):
            with open(CONFIG_FILE, "r") as f:
                data = json.load(f)
                if "user_id" in data:
                    return data["user_id"]
        user_id = str(uuid.uuid4())
        with open(CONFIG_FILE, "w") as f:
            # NOTE: overwrites the file; any other keys present are lost.
            json.dump({"user_id": user_id}, f)
        return user_id

    def capture(self, event_name, properties=None):
        """Capture a telemetry event, never raising to the caller.

        :param event_name: name of the event to record.
        :param properties: optional extra event properties. If provided,
            the dict is mutated in place with the default properties
            (which take precedence over caller-supplied keys).
        """
        # Guard against the None default — calling .update() on None
        # would raise AttributeError before the event was ever sent.
        if properties is None:
            properties = {}
        default_properties = {
            "version": embedchain.__version__,
            "language": "python",
            "pid": os.getpid(),
        }
        properties.update(default_properties)
        try:
            self.posthog.capture(self.user_id, event_name, properties)
        except Exception:
            # Telemetry must never break the host application; log and move on.
            logger.exception(f"Failed to send telemetry {event_name=}")

View File

@@ -38,7 +38,7 @@ class ChromaDB(BaseVectorDB):
else:
self.config = ChromaDbConfig()
self.settings = Settings()
self.settings = Settings(anonymized_telemetry=False)
self.settings.allow_reset = self.config.allow_reset if hasattr(self.config, "allow_reset") else False
if self.config.chroma_settings:
for key, value in self.config.chroma_settings.items():

2
poetry.lock generated
View File

@@ -7196,4 +7196,4 @@ whatsapp = ["flask", "twilio"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "4021d63d76a9128c8d7342bc482dfe90fa9878fde1359c2f0730fa9188e503af"
content-hash = "fad388baca92d2669530c2d928920e9f7abbe0ad74786b34dc915603b4a08a17"

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "embedchain"
version = "0.0.82"
version = "0.0.83"
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
authors = [
"Taranjeet Singh <taranjeet@embedchain.ai>",
@@ -93,13 +93,14 @@ python-dotenv = "^1.0.0"
langchain = "^0.0.279"
requests = "^2.31.0"
openai = ">=0.28.0"
tiktoken = { version="^0.4.0", optional=true }
chromadb ="^0.4.8"
youtube-transcript-api = { version="^0.6.1", optional=true }
beautifulsoup4 = { version="^4.12.2", optional=true }
pypdf = { version="^3.11.0", optional=true }
pytube = { version="^15.0.0", optional=true }
duckduckgo-search = { version="^3.8.5", optional=true }
chromadb = "^0.4.8"
posthog = "^3.0.2"
tiktoken = { version = "^0.4.0", optional = true }
youtube-transcript-api = { version = "^0.6.1", optional = true }
beautifulsoup4 = { version = "^4.12.2", optional = true }
pypdf = { version = "^3.11.0", optional = true }
pytube = { version = "^15.0.0", optional = true }
duckduckgo-search = { version = "^3.8.5", optional = true }
llama-hub = { version = "^0.0.29", optional = true }
sentence-transformers = { version = "^2.2.2", optional = true }
torch = { version = "2.0.0", optional = true }
@@ -113,12 +114,12 @@ twilio = { version = "^8.5.0", optional = true }
fastapi-poe = { version = "0.0.16", optional = true }
discord = { version = "^2.3.2", optional = true }
slack-sdk = { version = "3.21.3", optional = true }
cohere = { version = "^4.27", optional= true }
weaviate-client = { version = "^3.24.1", optional= true }
docx2txt = { version="^0.8", optional=true }
cohere = { version = "^4.27", optional = true }
weaviate-client = { version = "^3.24.1", optional = true }
docx2txt = { version = "^0.8", optional = true }
pinecone-client = { version = "^2.2.4", optional = true }
qdrant-client = { version = "1.6.3", optional = true }
unstructured = {extras = ["local-inference"], version = "^0.10.18", optional=true}
unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true}
pillow = { version = "10.0.1", optional = true }
torchvision = { version = ">=0.15.1, !=0.15.2", optional = true }
ftfy = { version = "6.1.1", optional = true }
@@ -127,7 +128,7 @@ huggingface_hub = { version = "^0.17.3", optional = true }
pymilvus = { version = "2.3.1", optional = true }
google-cloud-aiplatform = { version = "^1.26.1", optional = true }
replicate = { version = "^0.15.4", optional = true }
jq = { version=">=1.6.0", optional = true}
jq = { version = ">=1.6.0", optional = true}
[tool.poetry.group.dev.dependencies]
black = "^23.3.0"

View File

@@ -14,3 +14,10 @@ def setup():
clean_db()
yield
clean_db()
@pytest.fixture(autouse=True)
def disable_telemetry():
    """Force-disable anonymous telemetry for every test.

    Sets ``EC_TELEMETRY=false`` before each test and restores the
    environment's previous state afterwards, so the suite never fires
    real telemetry events and never clobbers a developer's own setting.
    """
    # Remember any pre-existing value so teardown can restore it.
    previous = os.environ.get("EC_TELEMETRY")
    os.environ["EC_TELEMETRY"] = "false"
    yield
    if previous is None:
        # pop() keeps teardown safe even if a test already removed the var
        # (a bare `del` would raise KeyError).
        os.environ.pop("EC_TELEMETRY", None)
    else:
        os.environ["EC_TELEMETRY"] = previous

View File

@@ -0,0 +1,62 @@
import os
import logging
from embedchain.telemetry.posthog import AnonymousTelemetry
class TestAnonymousTelemetry:
    """Unit tests for AnonymousTelemetry; PostHog is patched throughout so
    no network traffic is ever produced."""

    def test_init(self, mocker):
        """Constructor wires up the PostHog client with the expected key/host."""
        # Enable telemetry specifically for this test
        # (assumes an autouse fixture elsewhere sets EC_TELEMETRY=false — TODO confirm).
        os.environ["EC_TELEMETRY"] = "true"
        mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
        telemetry = AnonymousTelemetry()
        assert telemetry.project_api_key == "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO"
        assert telemetry.host == "https://app.posthog.com"
        assert telemetry.enabled is True
        # user_id is loaded/generated, so only assert it is truthy.
        assert telemetry.user_id
        mock_posthog.assert_called_once_with(project_api_key=telemetry.project_api_key, host=telemetry.host)

    def test_init_with_disabled_telemetry(self, mocker, monkeypatch):
        """With EC_TELEMETRY unset/false the client must be marked disabled.

        Relies on the suite-wide environment (presumably EC_TELEMETRY=false
        via an autouse fixture) rather than setting it here — verify.
        """
        mocker.patch("embedchain.telemetry.posthog.Posthog")
        telemetry = AnonymousTelemetry()
        assert telemetry.enabled is False
        assert telemetry.posthog.disabled is True

    def test_get_user_id(self, mocker, tmpdir):
        """A fresh config file gets a generated user id persisted as JSON."""
        mock_uuid = mocker.patch("embedchain.telemetry.posthog.uuid.uuid4")
        mock_uuid.return_value = "unique_user_id"
        # Redirect CONFIG_FILE into a temp dir so the real ~/.embedchain
        # config is never touched.
        config_file = tmpdir.join("config.json")
        mocker.patch("embedchain.telemetry.posthog.CONFIG_FILE", str(config_file))
        telemetry = AnonymousTelemetry()
        user_id = telemetry.get_user_id()
        assert user_id == "unique_user_id"
        assert config_file.read() == '{"user_id": "unique_user_id"}'

    def test_capture(self, mocker):
        """capture() forwards (user_id, event_name, properties) to PostHog."""
        # Enable telemetry specifically for this test
        os.environ["EC_TELEMETRY"] = "true"
        mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
        telemetry = AnonymousTelemetry()
        event_name = "test_event"
        properties = {"key": "value"}
        telemetry.capture(event_name, properties)
        mock_posthog.assert_called_once_with(
            project_api_key=telemetry.project_api_key,
            host=telemetry.host,
        )
        # NOTE: capture() mutates `properties` in place with the default
        # properties, so comparing against the same object still matches.
        mock_posthog.return_value.capture.assert_called_once_with(
            telemetry.user_id,
            event_name,
            properties,
        )

    def test_capture_with_exception(self, mocker, caplog):
        """Errors raised by the PostHog client are swallowed and logged."""
        mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
        mock_posthog.return_value.capture.side_effect = Exception("Test Exception")
        telemetry = AnonymousTelemetry()
        event_name = "test_event"
        properties = {"key": "value"}
        with caplog.at_level(logging.ERROR):
            telemetry.capture(event_name, properties)
        # Matches the f-string "Failed to send telemetry event_name=..." output.
        assert "Failed to send telemetry event" in caplog.text

View File

@@ -76,7 +76,7 @@ class TestQdrantDB(unittest.TestCase):
qdrant_client_mock.return_value.upsert.assert_called_once_with(
collection_name="embedchain-store-1526",
points=Batch(
ids=["def", "ghi"],
ids=["abc", "def"],
payloads=[
{
"identifier": "123",