[Feature]: Add posthog anonymous telemetry and update docs (#867)
This commit is contained in:
@@ -24,7 +24,7 @@ Once you have obtained the key, you can use it like this:
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = 'xxx'
|
||||
|
||||
@@ -52,7 +52,7 @@ To use Azure OpenAI embedding model, you have to set some of the azure openai re
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["OPENAI_API_TYPE"] = "azure"
|
||||
os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/"
|
||||
@@ -90,7 +90,7 @@ GPT4All supports generating high quality embeddings of arbitrary length document
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load embedding model configuration from config.yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -119,7 +119,7 @@ Hugging Face supports generating embeddings of arbitrary length documents of tex
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load embedding model configuration from config.yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -150,7 +150,7 @@ Embedchain supports Google's VertexAI embeddings model through a simple interfac
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load embedding model configuration from config.yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
|
||||
@@ -26,7 +26,7 @@ Once you have obtained the key, you can use it like this:
|
||||
|
||||
```python
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = 'xxx'
|
||||
|
||||
@@ -41,7 +41,7 @@ If you are looking to configure the different parameters of the LLM, you can do
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = 'xxx'
|
||||
|
||||
@@ -71,7 +71,7 @@ To use Azure OpenAI model, you have to set some of the azure openai related envi
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["OPENAI_API_TYPE"] = "azure"
|
||||
os.environ["OPENAI_API_BASE"] = "https://xxx.openai.azure.com/"
|
||||
@@ -110,7 +110,7 @@ To use anthropic's model, please set the `ANTHROPIC_API_KEY` which you find on t
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["ANTHROPIC_API_KEY"] = "xxx"
|
||||
|
||||
@@ -147,7 +147,7 @@ Once you have the API key, you are all set to use it with Embedchain.
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["COHERE_API_KEY"] = "xxx"
|
||||
|
||||
@@ -180,7 +180,7 @@ GPT4all is a free-to-use, locally running, privacy-aware chatbot. No GPU or inte
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load llm configuration from config.yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -212,7 +212,7 @@ Once you have the key, load the app using the config yaml file:
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["JINACHAT_API_KEY"] = "xxx"
|
||||
# load llm configuration from config.yaml file
|
||||
@@ -248,7 +248,7 @@ Once you have the token, load the app using the config yaml file:
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["HUGGINGFACE_ACCESS_TOKEN"] = "xxx"
|
||||
|
||||
@@ -278,7 +278,7 @@ Once you have the token, load the app using the config yaml file:
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["REPLICATE_API_TOKEN"] = "xxx"
|
||||
|
||||
@@ -305,7 +305,7 @@ Setup Google Cloud Platform application credentials by following the instruction
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load llm configuration from config.yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
|
||||
@@ -22,7 +22,7 @@ Utilizing a vector database alongside Embedchain is a seamless process. All you
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load chroma configuration from yaml file
|
||||
app = App.from_config(yaml_path="config1.yaml")
|
||||
@@ -61,7 +61,7 @@ pip install --upgrade 'embedchain[elasticsearch]'
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load elasticsearch configuration from yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -89,7 +89,7 @@ pip install --upgrade 'embedchain[opensearch]'
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load opensearch configuration from yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -125,7 +125,7 @@ Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN`
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com'
|
||||
os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx'
|
||||
@@ -164,7 +164,7 @@ In order to use Pinecone as vector database, set the environment variables `PINE
|
||||
<CodeGroup>
|
||||
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load pinecone configuration from yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -187,7 +187,7 @@ In order to use Qdrant as a vector database, set the environment variables `QDRA
|
||||
|
||||
<CodeGroup>
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load qdrant configuration from yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
@@ -207,7 +207,7 @@ In order to use Weaviate as a vector database, set the environment variables `WE
|
||||
|
||||
<CodeGroup>
|
||||
```python main.py
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
# load weaviate configuration from yaml file
|
||||
app = App.from_config(yaml_path="config.yaml")
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '📊 CSV'
|
||||
To add any csv file, use the data_type as `csv`. `csv` allows remote urls and conventional file paths. Headers are included for each line, so if you have an `age` column, `18` will be added as `age: 18`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add('https://people.sc.fsu.edu/~jburkardt/data/csv/airtravel.csv', data_type="csv")
|
||||
|
||||
@@ -35,7 +35,7 @@ Default behavior is to create a persistent vector db in the directory **./db**.
|
||||
Create a local index:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
naval_chat_bot = App()
|
||||
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
|
||||
@@ -45,7 +45,7 @@ naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Alma
|
||||
You can reuse the local index with the same code, but without adding new documents:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
naval_chat_bot = App()
|
||||
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '📚🌐 Code documentation'
|
||||
To add any code documentation website as a loader, use the data_type as `docs_site`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add("https://docs.embedchain.ai/", data_type="docs_site")
|
||||
|
||||
@@ -7,7 +7,7 @@ title: '📄 Docx file'
|
||||
To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add('https://example.com/content/intro.docx', data_type="docx")
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '📝 Mdx file'
|
||||
To add any `.mdx` file to your app, use the data_type (first argument to `.add()` method) as `mdx`. Note that this only supports mdx files present on the local machine, so this should be a file path. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add('path/to/file.mdx', data_type='mdx')
|
||||
|
||||
@@ -8,7 +8,7 @@ To load a notion page, use the data_type as `notion`. Since it is hard to automa
|
||||
The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '📰 PDF file'
|
||||
To add any pdf file, use the data_type as `pdf_file`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '❓💬 Question and answer pair'
|
||||
QnA pair is a local data type. To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '🗺️ Sitemap'
|
||||
Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ title: '📝 Text'
|
||||
Text is a local data type. To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ title: '🌐📄 Web page'
|
||||
To add any web page, use the data_type as `web_page`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ title: '🧾 XML file'
|
||||
To add any xml file, use the data_type as `xml`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ title: '🎥📺 Youtube video'
|
||||
To add any youtube video to your app, use the data_type (first argument to `.add()` method) as `youtube_video`. Eg:
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add('a_valid_youtube_url_here', data_type='youtube_video')
|
||||
|
||||
@@ -9,7 +9,7 @@ description: 'Collections of all the frequently asked questions'
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = 'xxx'
|
||||
|
||||
@@ -36,7 +36,7 @@ llm:
|
||||
|
||||
```python main.py
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = 'xxx'
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ You can add data from different data sources using the `.add()` method. Then, si
|
||||
If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. Embedchain will take care of the rest, creating a bot for you.
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
naval_bot = App()
|
||||
# Add online data
|
||||
|
||||
@@ -16,22 +16,22 @@ Creating an app involves 3 steps:
|
||||
<Steps>
|
||||
<Step title="⚙️ Import app instance">
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
app = App()
|
||||
```
|
||||
</Step>
|
||||
<Step title="🗃️ Add data sources">
|
||||
```python
|
||||
# Add different data sources
|
||||
elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
elon_bot.add("https://www.forbes.com/profile/elon-musk")
|
||||
app.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
app.add("https://www.forbes.com/profile/elon-musk")
|
||||
# You can also add local data sources such as pdf, csv files etc.
|
||||
# elon_bot.add("/path/to/file.pdf")
|
||||
# app.add("/path/to/file.pdf")
|
||||
```
|
||||
</Step>
|
||||
<Step title="💬 Query or chat on your data and get answers">
|
||||
<Step title="💬 Query or chat or search context on your data">
|
||||
```python
|
||||
elon_bot.query("What is the net worth of Elon Musk today?")
|
||||
app.query("What is the net worth of Elon Musk today?")
|
||||
# Answer: The net worth of Elon Musk today is $258.7 billion.
|
||||
```
|
||||
</Step>
|
||||
@@ -41,18 +41,18 @@ Putting it together, you can run your first app using the following code. Make s
|
||||
|
||||
```python
|
||||
import os
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
os.environ["OPENAI_API_KEY"] = "xxx"
|
||||
elon_bot = App()
|
||||
app = App()
|
||||
|
||||
# Add different data sources
|
||||
elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
elon_bot.add("https://www.forbes.com/profile/elon-musk")
|
||||
app.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
app.add("https://www.forbes.com/profile/elon-musk")
|
||||
# You can also add local data sources such as pdf, csv files etc.
|
||||
# elon_bot.add("/path/to/file.pdf")
|
||||
# app.add("/path/to/file.pdf")
|
||||
|
||||
response = elon_bot.query("What is the net worth of Elon Musk today?")
|
||||
response = app.query("What is the net worth of Elon Musk today?")
|
||||
print(response)
|
||||
# Answer: The net worth of Elon Musk today is $258.7 billion.
|
||||
```
|
||||
|
||||
@@ -39,7 +39,7 @@ os.environ['LANGCHAIN_PROJECT'] = '<your-project>'
|
||||
|
||||
|
||||
```python
|
||||
from embedchain import App
|
||||
from embedchain import Pipeline as App
|
||||
|
||||
app = App()
|
||||
app.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
|
||||
@@ -71,10 +71,6 @@
|
||||
"group": "Examples",
|
||||
"pages": ["examples/full_stack", "examples/api_server", "examples/discord_bot", "examples/slack_bot", "examples/telegram_bot", "examples/whatsapp_bot", "examples/poe_bot"]
|
||||
},
|
||||
{
|
||||
"group": "Pipelines",
|
||||
"pages": ["pipelines/quickstart"]
|
||||
},
|
||||
{
|
||||
"group": "Community",
|
||||
"pages": [
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
---
|
||||
title: '🚀 Pipelines'
|
||||
description: '💡 Start building LLM powered data pipelines in 1 minute'
|
||||
---
|
||||
|
||||
Embedchain lets you build data pipelines on your own data sources and deploy it in production in less than a minute. It can load, index, retrieve, and sync any unstructured data.
|
||||
|
||||
Install embedchain python package:
|
||||
|
||||
```bash
|
||||
pip install embedchain
|
||||
```
|
||||
|
||||
Creating a pipeline involves 3 steps:
|
||||
|
||||
<Steps>
|
||||
<Step title="⚙️ Import pipeline instance">
|
||||
```python
|
||||
from embedchain import Pipeline
|
||||
p = Pipeline(name="Elon Musk")
|
||||
```
|
||||
</Step>
|
||||
|
||||
<Step title="🗃️ Add data sources">
|
||||
```python
|
||||
# Add different data sources
|
||||
p.add("https://en.wikipedia.org/wiki/Elon_Musk")
|
||||
p.add("https://www.forbes.com/profile/elon-musk")
|
||||
# You can also add local data sources such as pdf, csv files etc.
|
||||
# p.add("/path/to/file.pdf")
|
||||
```
|
||||
</Step>
|
||||
<Step title="💬 Deploy your pipeline to Embedchain platform">
|
||||
```python
|
||||
p.deploy()
|
||||
```
|
||||
</Step>
|
||||
</Steps>
|
||||
|
||||
That's it. Now, head to the [Embedchain platform](https://app.embedchain.ai) and your pipeline is available there. Make sure to set the `OPENAI_API_KEY` 🔑 environment variable in the code.
|
||||
|
||||
After you deploy your pipeline to Embedchain platform, you can still add more data sources and update the pipeline multiple times.
|
||||
|
||||
Here is a Google Colab notebook for you to get started: [](https://colab.research.google.com/drive/1YVXaBO4yqlHZY4ho67GCJ6aD4CHNiScD?usp=sharing)
|
||||
@@ -15,7 +15,7 @@ class AppConfig(BaseAppConfig):
|
||||
self,
|
||||
log_level: str = "WARNING",
|
||||
id: Optional[str] = None,
|
||||
collect_metrics: Optional[bool] = None,
|
||||
collect_metrics: Optional[bool] = True,
|
||||
collection_name: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
|
||||
@@ -16,7 +16,7 @@ class PipelineConfig(BaseAppConfig):
|
||||
log_level: str = "WARNING",
|
||||
id: Optional[str] = None,
|
||||
name: Optional[str] = None,
|
||||
collect_metrics: Optional[bool] = False,
|
||||
collect_metrics: Optional[bool] = True,
|
||||
):
|
||||
"""
|
||||
Initializes a configuration class instance for an App. This is the simplest form of an embedchain app.
|
||||
|
||||
@@ -1,18 +1,13 @@
|
||||
import hashlib
|
||||
import importlib.metadata
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
from langchain.docstore.document import Document
|
||||
from tenacity import retry, stop_after_attempt, wait_fixed
|
||||
|
||||
from embedchain.chunkers.base_chunker import BaseChunker
|
||||
from embedchain.config import AddConfig, BaseLlmConfig
|
||||
@@ -24,6 +19,7 @@ from embedchain.llm.base import BaseLlm
|
||||
from embedchain.loaders.base_loader import BaseLoader
|
||||
from embedchain.models.data_type import (DataType, DirectDataType,
|
||||
IndirectDataType, SpecialDataType)
|
||||
from embedchain.telemetry.posthog import AnonymousTelemetry
|
||||
from embedchain.utils import detect_datatype
|
||||
from embedchain.vectordb.base import BaseVectorDB
|
||||
|
||||
@@ -89,9 +85,8 @@ class EmbedChain(JSONSerializable):
|
||||
self.user_asks = []
|
||||
|
||||
# Send anonymous telemetry
|
||||
self.s_id = self.config.id if self.config.id else str(uuid.uuid4())
|
||||
self.u_id = self._load_or_generate_user_id()
|
||||
|
||||
self._telemetry_props = {"class": self.__class__.__name__}
|
||||
self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
|
||||
# Establish a connection to the SQLite database
|
||||
self.connection = sqlite3.connect(SQLITE_PATH)
|
||||
self.cursor = self.connection.cursor()
|
||||
@@ -111,12 +106,8 @@ class EmbedChain(JSONSerializable):
|
||||
"""
|
||||
)
|
||||
self.connection.commit()
|
||||
|
||||
# NOTE: Uncomment the next two lines when running tests to see if any test fires a telemetry event.
|
||||
# if (self.config.collect_metrics):
|
||||
# raise ConnectionRefusedError("Collection of metrics should not be allowed.")
|
||||
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
|
||||
thread_telemetry.start()
|
||||
# Send anonymous telemetry
|
||||
self.telemetry.capture(event_name="init", properties=self._telemetry_props)
|
||||
|
||||
@property
|
||||
def collect_metrics(self):
|
||||
@@ -138,29 +129,6 @@ class EmbedChain(JSONSerializable):
|
||||
raise ValueError(f"Boolean value expected but got {type(value)}.")
|
||||
self.llm.online = value
|
||||
|
||||
def _load_or_generate_user_id(self) -> str:
|
||||
"""
|
||||
Loads the user id from the config file if it exists, otherwise generates a new
|
||||
one and saves it to the config file.
|
||||
|
||||
:return: user id
|
||||
:rtype: str
|
||||
"""
|
||||
if not os.path.exists(CONFIG_DIR):
|
||||
os.makedirs(CONFIG_DIR)
|
||||
|
||||
if os.path.exists(CONFIG_FILE):
|
||||
with open(CONFIG_FILE, "r") as f:
|
||||
data = json.load(f)
|
||||
if "user_id" in data:
|
||||
return data["user_id"]
|
||||
|
||||
u_id = str(uuid.uuid4())
|
||||
with open(CONFIG_FILE, "w") as f:
|
||||
json.dump({"user_id": u_id}, f)
|
||||
|
||||
return u_id
|
||||
|
||||
def add(
|
||||
self,
|
||||
source: Any,
|
||||
@@ -259,9 +227,14 @@ class EmbedChain(JSONSerializable):
|
||||
# it's quicker to check the variable twice than to count words when they won't be submitted.
|
||||
word_count = data_formatter.chunker.get_word_count(documents)
|
||||
|
||||
extra_metadata = {"data_type": data_type.value, "word_count": word_count, "chunks_count": new_chunks}
|
||||
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
|
||||
thread_telemetry.start()
|
||||
# Send anonymous telemetry
|
||||
event_properties = {
|
||||
**self._telemetry_props,
|
||||
"data_type": data_type.value,
|
||||
"word_count": word_count,
|
||||
"chunks_count": new_chunks,
|
||||
}
|
||||
self.telemetry.capture(event_name="add", properties=event_properties)
|
||||
|
||||
return source_hash
|
||||
|
||||
@@ -535,9 +508,7 @@ class EmbedChain(JSONSerializable):
|
||||
answer = self.llm.query(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run)
|
||||
|
||||
# Send anonymous telemetry
|
||||
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",))
|
||||
thread_telemetry.start()
|
||||
|
||||
self.telemetry.capture(event_name="query", properties=self._telemetry_props)
|
||||
return answer
|
||||
|
||||
def chat(
|
||||
@@ -569,10 +540,8 @@ class EmbedChain(JSONSerializable):
|
||||
"""
|
||||
contexts = self.retrieve_from_database(input_query=input_query, config=config, where=where)
|
||||
answer = self.llm.chat(input_query=input_query, contexts=contexts, config=config, dry_run=dry_run)
|
||||
|
||||
# Send anonymous telemetry
|
||||
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",))
|
||||
thread_telemetry.start()
|
||||
self.telemetry.capture(event_name="chat", properties=self._telemetry_props)
|
||||
|
||||
return answer
|
||||
|
||||
@@ -608,34 +577,8 @@ class EmbedChain(JSONSerializable):
|
||||
Resets the database. Deletes all embeddings irreversibly.
|
||||
`App` does not have to be reinitialized after using this method.
|
||||
"""
|
||||
# Send anonymous telemetry
|
||||
thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",))
|
||||
thread_telemetry.start()
|
||||
|
||||
self.db.reset()
|
||||
self.cursor.execute("DELETE FROM data_sources WHERE pipeline_id = ?", (self.config.id,))
|
||||
self.connection.commit()
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
|
||||
def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None):
|
||||
"""
|
||||
Send telemetry event to the embedchain server. This is anonymous. It can be toggled off in `AppConfig`.
|
||||
"""
|
||||
if not self.config.collect_metrics:
|
||||
return
|
||||
|
||||
with threading.Lock():
|
||||
url = "https://api.embedchain.ai/api/v1/telemetry/"
|
||||
metadata = {
|
||||
"s_id": self.s_id,
|
||||
"version": importlib.metadata.version(__package__ or __name__),
|
||||
"method": method,
|
||||
"language": "py",
|
||||
"u_id": self.u_id,
|
||||
}
|
||||
if extra_metadata:
|
||||
metadata.update(extra_metadata)
|
||||
|
||||
response = requests.post(url, json={"metadata": metadata})
|
||||
if response.status_code != 200:
|
||||
logging.warning(f"Telemetry event failed with status code {response.status_code}")
|
||||
# Send anonymous telemetry
|
||||
self.telemetry.capture(event_name="reset", properties=self._telemetry_props)
|
||||
|
||||
@@ -18,6 +18,7 @@ from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
|
||||
from embedchain.helper.json_serializable import register_deserializable
|
||||
from embedchain.llm.base import BaseLlm
|
||||
from embedchain.llm.openai import OpenAILlm
|
||||
from embedchain.telemetry.posthog import AnonymousTelemetry
|
||||
from embedchain.vectordb.base import BaseVectorDB
|
||||
from embedchain.vectordb.chroma import ChromaDB
|
||||
|
||||
@@ -109,8 +110,9 @@ class Pipeline(EmbedChain):
|
||||
self.llm = llm or OpenAILlm()
|
||||
self._init_db()
|
||||
|
||||
# setup user id and directory
|
||||
self.u_id = self._load_or_generate_user_id()
|
||||
# Send anonymous telemetry
|
||||
self._telemetry_props = {"class": self.__class__.__name__}
|
||||
self.telemetry = AnonymousTelemetry(enabled=self.config.collect_metrics)
|
||||
|
||||
# Establish a connection to the SQLite database
|
||||
self.connection = sqlite3.connect(SQLITE_PATH)
|
||||
@@ -131,8 +133,10 @@ class Pipeline(EmbedChain):
|
||||
"""
|
||||
)
|
||||
self.connection.commit()
|
||||
# Send anonymous telemetry
|
||||
self.telemetry.capture(event_name="init", properties=self._telemetry_props)
|
||||
|
||||
self.user_asks = [] # legacy defaults
|
||||
self.user_asks = []
|
||||
if self.auto_deploy:
|
||||
self.deploy()
|
||||
|
||||
@@ -219,6 +223,9 @@ class Pipeline(EmbedChain):
|
||||
"""
|
||||
Search for similar documents related to the query in the vector database.
|
||||
"""
|
||||
# Send anonymous telemetry
|
||||
self.telemetry.capture(event_name="search", properties=self._telemetry_props)
|
||||
|
||||
# TODO: Search will call the endpoint rather than fetching the data from the db itself when deploy=True.
|
||||
if self.id is None:
|
||||
where = {"app_id": self.local_id}
|
||||
@@ -312,6 +319,9 @@ class Pipeline(EmbedChain):
|
||||
data_hash, data_type, data_value = result[1], result[2], result[3]
|
||||
self._process_and_upload_data(data_hash, data_type, data_value)
|
||||
|
||||
# Send anonymous telemetry
|
||||
self.telemetry.capture(event_name="deploy", properties=self._telemetry_props)
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, yaml_path: str, auto_deploy: bool = False):
|
||||
"""
|
||||
@@ -347,6 +357,11 @@ class Pipeline(EmbedChain):
|
||||
embedding_model = EmbedderFactory.create(
|
||||
embedding_model_provider, embedding_model_config_data.get("config", {})
|
||||
)
|
||||
|
||||
# Send anonymous telemetry
|
||||
event_properties = {"init_type": "yaml_config"}
|
||||
AnonymousTelemetry().capture(event_name="init", properties=event_properties)
|
||||
|
||||
return cls(
|
||||
config=pipeline_config,
|
||||
llm=llm,
|
||||
|
||||
0
embedchain/telemetry/__init__.py
Normal file
0
embedchain/telemetry/__init__.py
Normal file
67
embedchain/telemetry/posthog.py
Normal file
67
embedchain/telemetry/posthog.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from posthog import Posthog
|
||||
|
||||
import embedchain
|
||||
|
||||
HOME_DIR = str(Path.home())
|
||||
CONFIG_DIR = os.path.join(HOME_DIR, ".embedchain")
|
||||
CONFIG_FILE = os.path.join(CONFIG_DIR, "config.json")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AnonymousTelemetry:
    """Sends anonymous usage events to PostHog.

    Telemetry can be disabled either via the ``enabled`` constructor flag
    or by setting the ``EC_TELEMETRY`` environment variable to any value
    other than "1", "true" or "yes".
    """

    def __init__(self, host="https://app.posthog.com", enabled=True):
        # Public write-only project key — safe to keep in source.
        self.project_api_key = "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO"
        self.host = host
        self.posthog = Posthog(project_api_key=self.project_api_key, host=self.host)
        self.user_id = self.get_user_id()
        self.enabled = enabled

        # The environment variable overrides the constructor flag: any
        # non-affirmative value disables telemetry entirely.
        if "EC_TELEMETRY" in os.environ and os.environ["EC_TELEMETRY"].lower() not in [
            "1",
            "true",
            "yes",
        ]:
            self.enabled = False

        if not self.enabled:
            self.posthog.disabled = True

        # Silence posthog logging so telemetry failures never surface to users.
        posthog_logger = logging.getLogger("posthog")
        posthog_logger.disabled = True

    def get_user_id(self):
        """Return a stable anonymous user id.

        Reads ``user_id`` from ``~/.embedchain/config.json`` if present;
        otherwise generates a new UUID4, persists it there, and returns it.

        :return: the anonymous user id
        :rtype: str
        """
        if not os.path.exists(CONFIG_DIR):
            os.makedirs(CONFIG_DIR)

        if os.path.exists(CONFIG_FILE):
            with open(CONFIG_FILE, "r") as f:
                data = json.load(f)
                if "user_id" in data:
                    return data["user_id"]

        user_id = str(uuid.uuid4())
        with open(CONFIG_FILE, "w") as f:
            json.dump({"user_id": user_id}, f)
        return user_id

    def capture(self, event_name, properties=None):
        """Capture a telemetry event, swallowing any delivery failure.

        :param event_name: name of the event to record.
        :param properties: optional extra event properties. The caller's
            dict is NOT mutated; default metadata (version/language/pid)
            takes precedence over caller-supplied keys, matching the
            original precedence of ``properties.update(default_properties)``.
        """
        # BUG FIX: the previous implementation called
        # ``properties.update(default_properties)`` directly, which raised
        # AttributeError whenever ``properties`` was None (the default) and
        # mutated the caller's dict otherwise. Work on a copy instead.
        merged = dict(properties) if properties else {}
        merged.update(
            {
                "version": embedchain.__version__,
                "language": "python",
                "pid": os.getpid(),
            }
        )

        try:
            self.posthog.capture(self.user_id, event_name, merged)
        except Exception:
            # Telemetry must never break the host application.
            logger.exception(f"Failed to send telemetry {event_name=}")
|
||||
@@ -38,7 +38,7 @@ class ChromaDB(BaseVectorDB):
|
||||
else:
|
||||
self.config = ChromaDbConfig()
|
||||
|
||||
self.settings = Settings()
|
||||
self.settings = Settings(anonymized_telemetry=False)
|
||||
self.settings.allow_reset = self.config.allow_reset if hasattr(self.config, "allow_reset") else False
|
||||
if self.config.chroma_settings:
|
||||
for key, value in self.config.chroma_settings.items():
|
||||
|
||||
2
poetry.lock
generated
2
poetry.lock
generated
@@ -7196,4 +7196,4 @@ whatsapp = ["flask", "twilio"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.9,<3.13"
|
||||
content-hash = "4021d63d76a9128c8d7342bc482dfe90fa9878fde1359c2f0730fa9188e503af"
|
||||
content-hash = "fad388baca92d2669530c2d928920e9f7abbe0ad74786b34dc915603b4a08a17"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "embedchain"
|
||||
version = "0.0.82"
|
||||
version = "0.0.83"
|
||||
description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
|
||||
authors = [
|
||||
"Taranjeet Singh <taranjeet@embedchain.ai>",
|
||||
@@ -93,13 +93,14 @@ python-dotenv = "^1.0.0"
|
||||
langchain = "^0.0.279"
|
||||
requests = "^2.31.0"
|
||||
openai = ">=0.28.0"
|
||||
tiktoken = { version="^0.4.0", optional=true }
|
||||
chromadb ="^0.4.8"
|
||||
youtube-transcript-api = { version="^0.6.1", optional=true }
|
||||
beautifulsoup4 = { version="^4.12.2", optional=true }
|
||||
pypdf = { version="^3.11.0", optional=true }
|
||||
pytube = { version="^15.0.0", optional=true }
|
||||
duckduckgo-search = { version="^3.8.5", optional=true }
|
||||
chromadb = "^0.4.8"
|
||||
posthog = "^3.0.2"
|
||||
tiktoken = { version = "^0.4.0", optional = true }
|
||||
youtube-transcript-api = { version = "^0.6.1", optional = true }
|
||||
beautifulsoup4 = { version = "^4.12.2", optional = true }
|
||||
pypdf = { version = "^3.11.0", optional = true }
|
||||
pytube = { version = "^15.0.0", optional = true }
|
||||
duckduckgo-search = { version = "^3.8.5", optional = true }
|
||||
llama-hub = { version = "^0.0.29", optional = true }
|
||||
sentence-transformers = { version = "^2.2.2", optional = true }
|
||||
torch = { version = "2.0.0", optional = true }
|
||||
@@ -113,12 +114,12 @@ twilio = { version = "^8.5.0", optional = true }
|
||||
fastapi-poe = { version = "0.0.16", optional = true }
|
||||
discord = { version = "^2.3.2", optional = true }
|
||||
slack-sdk = { version = "3.21.3", optional = true }
|
||||
cohere = { version = "^4.27", optional= true }
|
||||
weaviate-client = { version = "^3.24.1", optional= true }
|
||||
docx2txt = { version="^0.8", optional=true }
|
||||
cohere = { version = "^4.27", optional = true }
|
||||
weaviate-client = { version = "^3.24.1", optional = true }
|
||||
docx2txt = { version = "^0.8", optional = true }
|
||||
pinecone-client = { version = "^2.2.4", optional = true }
|
||||
qdrant-client = { version = "1.6.3", optional = true }
|
||||
unstructured = {extras = ["local-inference"], version = "^0.10.18", optional=true}
|
||||
unstructured = {extras = ["local-inference"], version = "^0.10.18", optional = true}
|
||||
pillow = { version = "10.0.1", optional = true }
|
||||
torchvision = { version = ">=0.15.1, !=0.15.2", optional = true }
|
||||
ftfy = { version = "6.1.1", optional = true }
|
||||
@@ -127,7 +128,7 @@ huggingface_hub = { version = "^0.17.3", optional = true }
|
||||
pymilvus = { version = "2.3.1", optional = true }
|
||||
google-cloud-aiplatform = { version = "^1.26.1", optional = true }
|
||||
replicate = { version = "^0.15.4", optional = true }
|
||||
jq = { version=">=1.6.0", optional = true}
|
||||
jq = { version = ">=1.6.0", optional = true}
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^23.3.0"
|
||||
|
||||
@@ -14,3 +14,10 @@ def setup():
|
||||
clean_db()
|
||||
yield
|
||||
clean_db()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def disable_telemetry():
|
||||
os.environ["EC_TELEMETRY"] = "false"
|
||||
yield
|
||||
del os.environ["EC_TELEMETRY"]
|
||||
|
||||
62
tests/telemetry/test_posthog.py
Normal file
62
tests/telemetry/test_posthog.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import os
|
||||
import logging
|
||||
from embedchain.telemetry.posthog import AnonymousTelemetry
|
||||
|
||||
|
||||
class TestAnonymousTelemetry:
|
||||
def test_init(self, mocker):
|
||||
# Enable telemetry specifically for this test
|
||||
os.environ["EC_TELEMETRY"] = "true"
|
||||
mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
|
||||
telemetry = AnonymousTelemetry()
|
||||
assert telemetry.project_api_key == "phc_XnMmNHzwxE7PVHX4mD2r8K6nfxVM48a2sq2U3N1p2lO"
|
||||
assert telemetry.host == "https://app.posthog.com"
|
||||
assert telemetry.enabled is True
|
||||
assert telemetry.user_id
|
||||
mock_posthog.assert_called_once_with(project_api_key=telemetry.project_api_key, host=telemetry.host)
|
||||
|
||||
def test_init_with_disabled_telemetry(self, mocker, monkeypatch):
|
||||
mocker.patch("embedchain.telemetry.posthog.Posthog")
|
||||
telemetry = AnonymousTelemetry()
|
||||
assert telemetry.enabled is False
|
||||
assert telemetry.posthog.disabled is True
|
||||
|
||||
def test_get_user_id(self, mocker, tmpdir):
|
||||
mock_uuid = mocker.patch("embedchain.telemetry.posthog.uuid.uuid4")
|
||||
mock_uuid.return_value = "unique_user_id"
|
||||
config_file = tmpdir.join("config.json")
|
||||
mocker.patch("embedchain.telemetry.posthog.CONFIG_FILE", str(config_file))
|
||||
telemetry = AnonymousTelemetry()
|
||||
|
||||
user_id = telemetry.get_user_id()
|
||||
assert user_id == "unique_user_id"
|
||||
assert config_file.read() == '{"user_id": "unique_user_id"}'
|
||||
|
||||
def test_capture(self, mocker):
|
||||
# Enable telemetry specifically for this test
|
||||
os.environ["EC_TELEMETRY"] = "true"
|
||||
mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
|
||||
telemetry = AnonymousTelemetry()
|
||||
event_name = "test_event"
|
||||
properties = {"key": "value"}
|
||||
telemetry.capture(event_name, properties)
|
||||
|
||||
mock_posthog.assert_called_once_with(
|
||||
project_api_key=telemetry.project_api_key,
|
||||
host=telemetry.host,
|
||||
)
|
||||
mock_posthog.return_value.capture.assert_called_once_with(
|
||||
telemetry.user_id,
|
||||
event_name,
|
||||
properties,
|
||||
)
|
||||
|
||||
def test_capture_with_exception(self, mocker, caplog):
|
||||
mock_posthog = mocker.patch("embedchain.telemetry.posthog.Posthog")
|
||||
mock_posthog.return_value.capture.side_effect = Exception("Test Exception")
|
||||
telemetry = AnonymousTelemetry()
|
||||
event_name = "test_event"
|
||||
properties = {"key": "value"}
|
||||
with caplog.at_level(logging.ERROR):
|
||||
telemetry.capture(event_name, properties)
|
||||
assert "Failed to send telemetry event" in caplog.text
|
||||
@@ -76,7 +76,7 @@ class TestQdrantDB(unittest.TestCase):
|
||||
qdrant_client_mock.return_value.upsert.assert_called_once_with(
|
||||
collection_name="embedchain-store-1526",
|
||||
points=Batch(
|
||||
ids=["def", "ghi"],
|
||||
ids=["abc", "def"],
|
||||
payloads=[
|
||||
{
|
||||
"identifier": "123",
|
||||
|
||||
Reference in New Issue
Block a user